Created
July 26, 2024 17:34
-
-
Save Xnuvers007/5d12cd47ce98a4e60e39d43a360275b1 to your computer and use it in GitHub Desktop.
ExifTool-style EXIF metadata extraction, implemented by scraping the exifinfo.org website.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from flask import Flask, request, jsonify | |
import requests | |
from bs4 import BeautifulSoup | |
from urllib.parse import urlparse, parse_qs | |
import re | |
app = Flask(__name__)

# Regular expression to validate URLs: scheme (http/https/ftp/ftps), then a
# host (domain name, "localhost", IPv4 dotted quad, or bracketed IPv6), an
# optional :port, and an optional path/query. Compiled once at import time.
URL_REGEX = re.compile(
    r'^(?:http|ftp)s?://' # http:// or https:// (the trailing s? also allows ftps)
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}|[A-Z0-9-]{2,})|' # domain...
    r'localhost|' # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|' # ...or ipv4
    r'\[?[A-F0-9]*:[A-F0-9:]+\]?)' # ...or ipv6
    r'(?::\d+)?' # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)  # optional trailing path or query
def remove_spaces_from_keys(data):
    """Recursively strip every space character from dict keys in *data*.

    Dicts are rebuilt with space-free keys, lists are walked element by
    element, and any other value is returned untouched.
    """
    if isinstance(data, dict):
        return {key.replace(" ", ""): remove_spaces_from_keys(value)
                for key, value in data.items()}
    if isinstance(data, list):
        return [remove_spaces_from_keys(element) for element in data]
    return data
def is_valid_url(url):
    """Return True when *url* matches the module-level URL_REGEX."""
    return bool(URL_REGEX.match(url))
def prepend_scheme(url):
    """Default to https:// when *url* carries no explicit scheme."""
    has_scheme = bool(urlparse(url).scheme)
    return url if has_scheme else 'https://' + url
@app.route('/exiftool', methods=['GET'])
def exiftool():
    """Scrape EXIF metadata for an image URL via exifinfo.org and return JSON.

    Query parameters:
        url: link to a .jpg/.png/.webp image. The scheme is optional
             (https:// is assumed). Redirector links that embed the real
             image in an ``imgurl`` query parameter (Google Images style)
             are unwrapped first.

    Returns:
        200 with a JSON object mapping section titles to {tag: value}
        dicts (plus an "Image" preview link when present), or a JSON
        error body with status 400 (bad input) / 500 (upstream failure).
    """
    image_url = request.args.get('url')
    if not image_url:
        return jsonify({"error": "Missing 'url' parameter",
                        "example": request.host_url + "exiftool?url=https://example.com/image.jpg"}), 400

    image_url = prepend_scheme(image_url)

    # Validate URL format before handing it to the upstream service.
    if not is_valid_url(image_url):
        return jsonify({"error": "Invalid URL format. Please provide a valid URL.",
                        "example": request.host_url + "exiftool?url=https://example.com/image.jpg"}), 400

    # Unwrap redirector links that embed the real image in ``imgurl``.
    # BUG FIX: the original replaced image_url unconditionally whenever any
    # query string was present, turning URLs without an ``imgurl`` parameter
    # into None; now the override happens only when ``imgurl`` exists.
    parsed_url = urlparse(image_url)
    if parsed_url.query:
        embedded = parse_qs(parsed_url.query).get('imgurl', [None])[0]
        if embedded:
            image_url = embedded

    # Case-insensitive extension check (so .JPG etc. are accepted);
    # endswith takes a tuple, replacing the original chained or-expression.
    if not image_url.lower().endswith(('.jpg', '.png', '.webp')):
        return jsonify({"error": "Invalid image URL. Please provide a link to a .jpg, .png, or .webp image.",
                        "Exjpg": request.host_url + "exiftool?url=https://example.com/image.jpg",
                        "Expng": request.host_url + "exiftool?url=https://example.com/image.png",
                        "Exwebp": request.host_url + "exiftool?url=https://example.com/image.webp",
                        }), 400

    # Browser-like headers; some upstreams reject the default requests UA.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    }
    try:
        response = requests.get('https://exifinfo.org/url',
                                params={'url': image_url},
                                headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Network failure or non-2xx upstream status: surface as 500.
        return jsonify({"error": str(e)}), 500

    soup = BeautifulSoup(response.text, 'html.parser')
    exif_data = {}
    for section in soup.find_all('div', class_='section'):
        thumbnail = section.find('img')
        if thumbnail:
            exif_data["Image"] = f"https://exifinfo.org{thumbnail['src']}"
        heading = section.find('h2')
        if heading:
            details = section.find_all(['dt', 'dd'])
            # Pair <dt> with the following <dd>. BUG FIX: iterating only
            # complete pairs means a stray trailing <dt> can no longer
            # raise IndexError as details[i + 1] did in the original.
            section_data = {}
            for dt_tag, dd_tag in zip(details[0::2], details[1::2]):
                key = dt_tag.get_text().strip().replace(" ", "")
                section_data[key] = dd_tag.get_text().strip()
            exif_data[heading.get_text().replace(" ", "")] = section_data

    # Normalize any remaining keys that still contain spaces.
    return jsonify(remove_spaces_from_keys(exif_data)), 200
if __name__ == '__main__':
    # Dev-server entry point; binds all interfaces so the API is reachable
    # from other hosts. Use a production WSGI server for real deployments.
    app.run(debug=False, host='0.0.0.0', port=5000)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
# CLI variant: prompt for an image URL, scrape exifinfo.org, print results.
masukanlink = input("Link Images (.jpg/.png/.webp/) etc : ")

# Browser-like headers; some upstreams reject the default requests UA.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
}

# FIX: added a timeout (the original could hang forever) and an explicit
# status check (the original happily parsed error pages).
response = requests.get('https://exifinfo.org/url',
                        params={'url': masukanlink},
                        headers=headers, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

for section in soup.find_all('div', class_='section'):
    section_image = section.find('img')
    if section_image:
        print(f"Image: https://exifinfo.org{section_image['src']}")
    section_title = section.find('h2')
    if section_title:
        print(f"{section_title.get_text()}:")
        details = section.find_all(['dt', 'dd'])
        # Pair <dt> with the following <dd>. FIX: iterating complete pairs
        # avoids the IndexError details[i + 1] raised on a trailing <dt>.
        for dt_tag, dd_tag in zip(details[0::2], details[1::2]):
            print(f"  {dt_tag.get_text().strip()}: {dd_tag.get_text().strip()}")
        print()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment