Skip to content

Instantly share code, notes, and snippets.

@Xnuvers007
Created July 26, 2024 17:34
Show Gist options
  • Save Xnuvers007/5d12cd47ce98a4e60e39d43a360275b1 to your computer and use it in GitHub Desktop.
EXIF metadata extractor: scrapes exifinfo.org for a given image URL (Flask API plus a standalone CLI script).
from flask import Flask, request, jsonify
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
import re
app = Flask(__name__)
# Regular expression to validate URLs
# (Django-style validator: scheme, then host as domain / localhost / IPv4 /
# IPv6, optional port, optional path-or-query tail, case-insensitive.
# NOTE(review): the IPv4 branch accepts out-of-range octets such as
# 999.999.999.999 — confirm that looseness is acceptable.)
URL_REGEX = re.compile(
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}|[A-Z0-9-]{2,})|' # domain...
r'localhost|' # localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|' # ...or ipv4
r'\[?[A-F0-9]*:[A-F0-9:]+\]?)' # ...or ipv6
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
def remove_spaces_from_keys(data):
    """Recursively strip spaces from every dict key in a nested structure.

    Dicts are rebuilt with space-free keys, lists are processed element by
    element, and any other value is returned unchanged.
    """
    if isinstance(data, dict):
        return {
            key.replace(" ", ""): remove_spaces_from_keys(value)
            for key, value in data.items()
        }
    if isinstance(data, list):
        return [remove_spaces_from_keys(element) for element in data]
    return data
def is_valid_url(url):
    """Return True when *url* matches the module-level URL_REGEX pattern."""
    return URL_REGEX.match(url) is not None
def prepend_scheme(url):
    """Default to https:// when *url* carries no explicit scheme."""
    parsed = urlparse(url)
    if parsed.scheme:
        return url
    return 'https://' + url
@app.route('/exiftool', methods=['GET'])
def exiftool():
    """Fetch EXIF metadata for an image URL by scraping exifinfo.org.

    Query parameters:
        url: link to a .jpg/.png/.webp image (scheme optional; https://
             is assumed when missing).

    Returns:
        200 with a JSON dict of EXIF sections, 400 on validation errors,
        or 500 when the upstream fetch fails.
    """
    image_url = request.args.get('url')
    if not image_url:
        return jsonify({"error": "Missing 'url' parameter",
                        "example": request.host_url + "exiftool?url=https://example.com/image.jpg"}), 400

    image_url = prepend_scheme(image_url)

    # Validate URL format
    if not is_valid_url(image_url):
        return jsonify({"error": "Invalid URL format. Please provide a valid URL.",
                        "example": request.host_url + "exiftool?url=https://example.com/image.jpg"}), 400

    # Unwrap redirect-style links (e.g. image search results) that carry the
    # real target in an 'imgurl' query parameter; missing 'imgurl' yields
    # None and is rejected by the extension check below.
    parsed_url = urlparse(image_url)
    if parsed_url.query:
        query_params = parse_qs(parsed_url.query)
        image_url = query_params.get('imgurl', [None])[0]

    # endswith takes a tuple of suffixes; lower() widens the original check
    # to also accept upper-case extensions (backward-compatible).
    if not image_url or not image_url.lower().endswith(('.jpg', '.png', '.webp')):
        return jsonify({"error": "Invalid image URL. Please provide a link to a .jpg, .png, or .webp image.",
                        "Exjpg": request.host_url + "exiftool?url=https://example.com/image.jpg",
                        "Expng": request.host_url + "exiftool?url=https://example.com/image.png",
                        "Exwebp": request.host_url + "exiftool?url=https://example.com/image.webp",
                        }), 400

    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    }
    params = {
        'url': image_url,
    }
    try:
        response = requests.get('https://exifinfo.org/url', params=params, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return jsonify({"error": str(e)}), 500

    return jsonify(_parse_exif_sections(response.text)), 200


def _parse_exif_sections(html):
    """Parse exifinfo.org result HTML into {section_title: {tag: value}}.

    Keys have spaces removed; a thumbnail link, when present, is stored
    under the "Image" key.
    """
    soup = BeautifulSoup(html, 'html.parser')
    exif_data = {}
    for section in soup.find_all('div', class_='section'):
        section_image = section.find('img')
        if section_image:
            exif_data["Image"] = f"https://exifinfo.org{section_image['src']}"
        section_title = section.find('h2')
        if not section_title:
            continue
        details = section.find_all(['dt', 'dd'])
        section_data = {}
        # Pair each <dt> with the following <dd>.  Stopping at len - 1
        # guards against a trailing unmatched <dt>, which made the original
        # raise IndexError on details[i + 1].
        for i in range(0, len(details) - 1, 2):
            dt = details[i].get_text().strip().replace(" ", "")
            dd = details[i + 1].get_text().strip()
            section_data[dt] = dd
        exif_data[section_title.get_text().replace(" ", "")] = section_data
    return remove_spaces_from_keys(exif_data)
if __name__ == '__main__':
    # NOTE(review): binds to all interfaces (0.0.0.0) on port 5000 with the
    # debugger off — confirm the public bind is intended for this deployment.
    app.run(debug=False, host='0.0.0.0', port=5000)
# NOTE(review): everything below appears to be a second, standalone CLI
# script concatenated after the Flask app above.  At module level it would
# block on input() and fire a network request on import, so it is wrapped in
# main() behind a __main__ guard; behavior when run directly is unchanged.
import requests
from bs4 import BeautifulSoup


def main():
    """Prompt for an image URL and print the EXIF data scraped from exifinfo.org."""
    masukanlink = input("Link Images (.jpg/.png/.webp/) etc : ")
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    }
    params = {
        # 'url': 'https://ppcexpo.com/blog/wp-content/uploads/2020/11/what-is-a-meta-search-engine-.jpg',
        'url': f'{masukanlink}',
    }
    response = requests.get('https://exifinfo.org/url', params=params, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    sections = soup.find_all('div', class_='section')
    for section in sections:
        section_image = section.find('img')
        if section_image:
            print(f"Image: https://exifinfo.org{section_image['src']}")
        section_title = section.find('h2')
        if section_title:
            print(f"{section_title.get_text()}:")
            details = section.find_all(['dt', 'dd'])
            # Stop at len - 1 so a trailing unmatched <dt> cannot raise
            # IndexError on details[i + 1].
            for i in range(0, len(details) - 1, 2):
                dt = details[i].get_text().strip()
                dd = details[i + 1].get_text().strip()
                print(f" {dt}: {dd}")
            print()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment