Find 4xx and 5xx status errors in Python on any website :-) - https://trovalost.it/http-status/
import requests
import openpyxl
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
import random
import time
proxies = [
    # Proxy list to edit
    # I recommend adding at least a hundred
    #
    #
    # Format example:
    # 'IPv4:port'
    #
    # '109.167.34.67:3028',
    # '164.167.36.108:3118',
    # ...
]
def fetch_url(url, proxies):
    proxy = random.choice(proxies) if proxies else None
    try:
        # Random pause of 1-3 seconds per worker so requests are spaced out;
        # sleeping in the results loop would not throttle the running threads.
        time.sleep(random.uniform(1, 3))
        # The timeout keeps a dead proxy from hanging the worker thread.
        response = requests.get(url, proxies={"http": proxy, "https": proxy} if proxy else None, timeout=10)
        return url, response.status_code
    except requests.RequestException:
        return url, None
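# Illustrative call (the URL is hypothetical): fetch_url("https://example.com/", [])
# performs a direct request (no proxy) and returns ("https://example.com/", 200)
# on success, or (url, None) when the request raises an exception.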
def read_sitemap(sitemap_url):
    try:
        response = requests.get(sitemap_url)
        if response.status_code == 200:
            sitemap_content = response.text
            root = ET.fromstring(sitemap_content)
            # Check if it's a sitemap index
            sitemapindex = root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap")
            if sitemapindex:
                urls = []
                for sitemap in sitemapindex:
                    sub_sitemap_url = sitemap.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
                    urls.extend(read_sitemap(sub_sitemap_url))
                return urls
            else:
                urls = [element.text for element in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")]
                return urls
        else:
            print(f"Failed to fetch sitemap: {response.status_code}")
            return []
    except Exception as e:
        print(f"Error reading sitemap: {e}")
        return []
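# Illustrative call (the sitemap URL is hypothetical):
#   read_sitemap("https://example.com/sitemap.xml")
# returns a flat list of page URLs, recursing into sub-sitemaps when the
# document turns out to be a <sitemapindex> rather than a plain <urlset>.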
def check_urls(urls, proxies, report_prefix):
    visited = set()
    error_urls = []
    total_urls = len(urls)
    count = 0
    block_size = 100
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(fetch_url, url, proxies): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                url, status_code = future.result()
                visited.add(url)
                count += 1
                if status_code and (400 <= status_code < 600):
                    error_urls.append((url, status_code))
                if count % 10 == 0:  # Print progress every 10 URLs checked
                    print(f"Progress: Checked {count} of {total_urls} URLs...")
                if count % block_size == 0:  # Write to file every 100 URLs
                    write_partial_report(error_urls, f"{report_prefix}_error_report_{count//block_size}.xlsx")
                    error_urls.clear()
            except Exception as e:
                print(f"Error processing URL {url}: {e}")
    # Write remaining errors if any
    if error_urls:
        write_partial_report(error_urls, f"{report_prefix}_error_report_{(count//block_size) + 1}.xlsx")
def write_partial_report(error_urls, filename):
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Error Report"
    headers = ["URL", "Status Code", "Suggested Fix"]
    sheet.append(headers)
    suggestions = {
        400: "Check the request syntax.",
        401: "Check authentication credentials.",
        403: "Check permissions.",
        404: "Check if the URL is correct.",
        500: "Check server logs for more details.",
        502: "Check the proxy server.",
        503: "Check server load and availability.",
        504: "Check gateway timeout settings."
    }
    for url, status_code in error_urls:
        suggestion = suggestions.get(status_code, "Check server logs for more details.")
        sheet.append([url, status_code, suggestion])
    workbook.save(filename)
def normalize_base_url(base_url):
    normalized_url = re.sub(r'^https?:\/\/', '', base_url)  # Remove http:// or https://
    normalized_url = re.sub(r'[^\w\-]', '-', normalized_url)  # Replace non-alphanumeric characters with -
    return normalized_url
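# Illustrative: normalize_base_url("https://example.com/blog") returns
# "example-com-blog", a filesystem-safe prefix for the report filenames.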
def main():
    base_url = input("Enter the base URL of the site to scan: ")
    sitemap_url = input("Enter the URL of the sitemap: ")
    normalized_base_url = normalize_base_url(base_url)
    print("Fetching URLs from sitemap...")
    urls = read_sitemap(sitemap_url)
    if not urls:
        print("No URLs found in sitemap or failed to fetch sitemap.")
        return
    print("Starting URL scan...")
    # Use the normalized base URL as the filename prefix for the reports
    check_urls(urls, proxies, normalized_base_url)
    print("URL scan completed.")

if __name__ == "__main__":
    main()
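Since the scan writes its findings in blocks of 100 URLs, a long crawl leaves you with several partial workbooks. A minimal sketch for merging them into a single file (assuming the <prefix>_error_report_N.xlsx naming used above; merge_reports and the merged filename are illustrative, not part of the script):

import glob
import openpyxl

def merge_reports(prefix, merged_name="merged_error_report.xlsx"):
    # Illustrative helper: collect every partial report matching the prefix
    merged = openpyxl.Workbook()
    sheet = merged.active
    sheet.title = "Error Report"
    sheet.append(["URL", "Status Code", "Suggested Fix"])
    for path in sorted(glob.glob(f"{prefix}_error_report_*.xlsx")):
        part = openpyxl.load_workbook(path)
        # Skip each partial file's header row, copy only the data rows
        for row in part.active.iter_rows(min_row=2, values_only=True):
            sheet.append(list(row))
    merged.save(merged_name)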