Find 4xx and 5xx status errors in Python on any website :-) - https://trovalost.it/http-status/
import requests
import openpyxl
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
import random
import time
proxies = [
    # Proxy list to edit
    # I recommend adding at least a hundred
    #
    #
    # Format example:
    # 'IPv4:port'
    #
    # '109.167.34.67:3028',
    # '164.167.36.108:3118',
    # ...
]
def fetch_url(url, proxies):
    proxy = random.choice(proxies) if proxies else None
    try:
        # Random pause of 1-3 seconds per worker so requests are spaced out;
        # sleeping in the results loop would not throttle the running threads.
        time.sleep(random.uniform(1, 3))
        # The timeout keeps a dead proxy from hanging the worker thread.
        response = requests.get(url, proxies={"http": proxy, "https": proxy} if proxy else None, timeout=10)
        return url, response.status_code
    except requests.RequestException:
        return url, None
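# Illustrative call (the URL is hypothetical): fetch_url("https://example.com/", [])
# performs a direct request (no proxy) and returns ("https://example.com/", 200)
# on success, or (url, None) when the request raises an exception.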
def read_sitemap(sitemap_url):
    try:
        response = requests.get(sitemap_url)
        if response.status_code == 200:
            sitemap_content = response.text
            root = ET.fromstring(sitemap_content)
            # Check if it's a sitemap index
            sitemapindex = root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap")
            if sitemapindex:
                urls = []
                for sitemap in sitemapindex:
                    sub_sitemap_url = sitemap.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc").text
                    urls.extend(read_sitemap(sub_sitemap_url))
                return urls
            else:
                urls = [element.text for element in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")]
                return urls
        else:
            print(f"Failed to fetch sitemap: {response.status_code}")
            return []
    except Exception as e:
        print(f"Error reading sitemap: {e}")
        return []
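# Illustrative call (the sitemap URL is hypothetical):
#   read_sitemap("https://example.com/sitemap.xml")
# returns a flat list of page URLs, recursing into sub-sitemaps when the
# document turns out to be a <sitemapindex> rather than a plain <urlset>.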
def check_urls(urls, proxies, report_prefix):
    visited = set()
    error_urls = []
    total_urls = len(urls)
    count = 0
    block_size = 100
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {executor.submit(fetch_url, url, proxies): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                url, status_code = future.result()
                visited.add(url)
                count += 1
                if status_code and (400 <= status_code < 600):
                    error_urls.append((url, status_code))
                if count % 10 == 0:  # Print progress every 10 URLs checked
                    print(f"Progress: Checked {count} of {total_urls} URLs...")
                if count % block_size == 0:  # Write to file every 100 URLs
                    write_partial_report(error_urls, f"{report_prefix}_error_report_{count//block_size}.xlsx")
                    error_urls.clear()
            except Exception as e:
                print(f"Error processing URL {url}: {e}")
    # Write remaining errors if any
    if error_urls:
        write_partial_report(error_urls, f"{report_prefix}_error_report_{(count//block_size) + 1}.xlsx")
def write_partial_report(error_urls, filename):
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Error Report"
    headers = ["URL", "Status Code", "Suggested Fix"]
    sheet.append(headers)
    suggestions = {
        400: "Check the request syntax.",
        401: "Check authentication credentials.",
        403: "Check permissions.",
        404: "Check if the URL is correct.",
        500: "Check server logs for more details.",
        502: "Check the proxy server.",
        503: "Check server load and availability.",
        504: "Check gateway timeout settings."
    }
    for url, status_code in error_urls:
        suggestion = suggestions.get(status_code, "Check server logs for more details.")
        sheet.append([url, status_code, suggestion])
    workbook.save(filename)
def normalize_base_url(base_url):
    normalized_url = re.sub(r'^https?:\/\/', '', base_url)  # Remove http:// or https://
    normalized_url = re.sub(r'[^\w\-]', '-', normalized_url)  # Replace non-alphanumeric characters with -
    return normalized_url
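# Illustrative: normalize_base_url("https://example.com/blog") returns
# "example-com-blog", a filesystem-safe prefix for the report filenames.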
def main():
    base_url = input("Enter the base URL of the site to scan: ")
    sitemap_url = input("Enter the URL of the sitemap: ")
    normalized_base_url = normalize_base_url(base_url)
    print("Fetching URLs from sitemap...")
    urls = read_sitemap(sitemap_url)
    if not urls:
        print("No URLs found in sitemap or failed to fetch sitemap.")
        return
    print("Starting URL scan...")
    # Use the normalized base URL as the filename prefix for the reports
    check_urls(urls, proxies, normalized_base_url)
    print("URL scan completed.")

if __name__ == "__main__":
    main()
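Since the scan writes its findings in blocks of 100 URLs, a long crawl leaves you with several partial workbooks. A minimal sketch for merging them into a single file (assuming the <prefix>_error_report_N.xlsx naming used above; merge_reports and the merged filename are illustrative, not part of the script):

import glob
import openpyxl

def merge_reports(prefix, merged_name="merged_error_report.xlsx"):
    # Illustrative helper: collect every partial report matching the prefix
    merged = openpyxl.Workbook()
    sheet = merged.active
    sheet.title = "Error Report"
    sheet.append(["URL", "Status Code", "Suggested Fix"])
    for path in sorted(glob.glob(f"{prefix}_error_report_*.xlsx")):
        part = openpyxl.load_workbook(path)
        # Skip each partial file's header row, copy only the data rows
        for row in part.active.iter_rows(min_row=2, values_only=True):
            sheet.append(list(row))
    merged.save(merged_name)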