@salvatorecapolupo
Last active March 18, 2025 06:38
Find 4xx and 5xx status errors in Python on any website :-) - https://trovalost.it/http-status/
import requests
import openpyxl
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
import random
import time
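
# Overview: read every page URL from an XML sitemap (recursing into sitemap
# indexes), check each URL concurrently through an optional rotating proxy
# pool, and write any 4xx/5xx responses to incremental Excel reports.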
proxies = [
    # Proxy list to edit.
    # I recommend adding at least a hundred entries.
    #
    # Format:
    # 'IPv4:port'
    #
    # Examples:
    # '109.167.34.67:3028',
    # '164.167.36.108:3118',
    # ...
]
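
# fetch_url(): perform a single GET request, optionally through a proxy
# picked at random from the pool, and return (url, status_code).
# On any network error it returns (url, None) so the caller can skip the URL.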
def fetch_url(url, proxies):
    proxy = random.choice(proxies) if proxies else None
    try:
        # A timeout keeps one slow URL from hanging the whole scan
        response = requests.get(url, proxies={"http": proxy, "https": proxy} if proxy else None, timeout=10)
        return url, response.status_code
    except requests.RequestException:
        return url, None
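
# read_sitemap(): download a sitemap and return the list of page URLs it
# contains. If the document is a sitemap index, recurse into each child
# sitemap and flatten the results into a single list.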
def read_sitemap(sitemap_url):
    try:
        response = requests.get(sitemap_url, timeout=10)
        if response.status_code == 200:
            sitemap_content = response.text
            root = ET.fromstring(sitemap_content)
            # Check if it's a sitemap index
            sitemapindex = root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap")
            if sitemapindex:
                urls = []
                for sitemap in sitemapindex:
                    sub_sitemap_url = sitemap.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
                    urls.extend(read_sitemap(sub_sitemap_url))
                return urls
            else:
                urls = [element.text for element in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")]
                return urls
        else:
            print(f"Failed to fetch sitemap: {response.status_code}")
            return []
    except Exception as e:
        print(f"Error reading sitemap: {e}")
        return []
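
# check_urls(): scan all URLs with a thread pool, collect every response
# with a 4xx/5xx status, and flush results to a new Excel report every
# 100 URLs so partial results survive an interrupted run.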
def check_urls(urls, proxies, report_prefix="error_report"):
    visited = set()
    error_urls = []
    total_urls = len(urls)
    count = 0
    block_size = 100
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(fetch_url, url, proxies): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                url, status_code = future.result()
                visited.add(url)
                count += 1
                if status_code and (400 <= status_code < 600):
                    error_urls.append((url, status_code))
                if count % 10 == 0:  # Print progress every 10 URLs checked
                    print(f"Progress: Checked {count} of {total_urls} URLs...")
                if count % block_size == 0:  # Write to file every 100 URLs
                    write_partial_report(error_urls, f"{report_prefix}_{count // block_size}.xlsx")
                    error_urls.clear()
                # Random pause between 1 and 3 seconds
                time.sleep(random.uniform(1, 3))
            except Exception as e:
                print(f"Error processing URL {url}: {e}")
    # Write remaining errors, if any
    if error_urls:
        write_partial_report(error_urls, f"{report_prefix}_{(count // block_size) + 1}.xlsx")
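
# write_partial_report(): write one block of failing URLs to an .xlsx file,
# pairing each common status code with a human-readable suggested fix.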
def write_partial_report(error_urls, filename):
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Error Report"
    headers = ["URL", "Status Code", "Suggested Fix"]
    sheet.append(headers)
    suggestions = {
        400: "Check the request syntax.",
        401: "Check authentication credentials.",
        403: "Check permissions.",
        404: "Check if the URL is correct.",
        500: "Check server logs for more details.",
        502: "Check the proxy server.",
        503: "Check server load and availability.",
        504: "Check gateway timeout settings."
    }
    for url, status_code in error_urls:
        suggestion = suggestions.get(status_code, "Check server logs for more details.")
        sheet.append([url, status_code, suggestion])
    workbook.save(filename)
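
# normalize_base_url(): turn a base URL into a filesystem-safe slug, e.g.
# "https://trovalost.it" -> "trovalost-it", usable as a report filename prefix.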
def normalize_base_url(base_url):
    normalized_url = re.sub(r'^https?://', '', base_url)  # Remove http:// or https://
    normalized_url = re.sub(r'[^\w\-]', '-', normalized_url)  # Replace non-alphanumeric characters with -
    return normalized_url
def main():
    base_url = input("Enter the base URL of the site to scan: ")
    sitemap_url = input("Enter the URL of the sitemap: ")
    normalized_base_url = normalize_base_url(base_url)
    print("Fetching URLs from sitemap...")
    urls = read_sitemap(sitemap_url)
    if not urls:
        print("No URLs found in sitemap or failed to fetch sitemap.")
        return
    print("Starting URL scan...")
    # Use the normalized base URL as the report filename prefix
    check_urls(urls, proxies, f"{normalized_base_url}_error_report")
    print("URL scan completed.")

if __name__ == "__main__":
    main()
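
# Example session (illustrative values; prompts and messages are the ones
# printed by main() above):
#   Enter the base URL of the site to scan: https://example.com
#   Enter the URL of the sitemap: https://example.com/sitemap.xml
#   Fetching URLs from sitemap...
#   Starting URL scan...
#   Progress: Checked 10 of 250 URLs...
#   ...
#   URL scan completed.
# Reports land next to the script, e.g. example-com_error_report_1.xlsx.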