Created
April 15, 2025 21:35
-
-
Save awatertrevi/6a985dd207da61928458e732a2e23e00 to your computer and use it in GitHub Desktop.
sitemap_comparer.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import xml.etree.ElementTree as ET | |
from urllib.parse import urlparse, urljoin | |
# Module-level record of sitemap URLs already crawled. It is shared by every
# call to collect_urls_from_sitemap so that a sitemap referenced more than
# once (or referencing itself) is fetched only a single time.
visited_sitemaps = set()
def normalize_url(url):
    """Return only the path and query part of a URL, for cross-host matching.

    Surrounding whitespace is stripped before parsing. Scheme, host and
    fragment are discarded, so the same page served from two different hosts
    yields the same string.

    Args:
        url: The URL to reduce.

    Returns:
        ``path`` when the URL has no query string, otherwise ``path?query``.
        A bare host such as ``https://example.com`` yields ``""``.
    """
    parsed = urlparse(url.strip())
    if parsed.query:
        return f"{parsed.path}?{parsed.query}"
    return parsed.path
def fetch_xml(url):
    """Download *url* and parse the response body as XML.

    Failures are reported on stdout and never raised, so a single broken
    sitemap does not abort the whole crawl.

    Args:
        url: Absolute URL of the XML document to fetch.

    Returns:
        A ``(root, url)`` tuple. ``root`` is the parsed root element, or
        ``None`` when the request failed, returned an error status, or the
        body could not be parsed. ``url`` is the argument, echoed back.
    """
    # NOTE(review): xml.etree.ElementTree is not hardened against maliciously
    # constructed XML; only point this at sitemaps you trust.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        root = ET.fromstring(response.content)
    except Exception as e:
        # Deliberately broad: network, HTTP and parse errors are all treated
        # as "this sitemap is unavailable".
        print(f"β οΈ Failed to fetch {url}: {e}")
        return None, url
    else:
        return root, url
def collect_urls_from_sitemap(url, base=None):
    """Recursively collect normalized page URLs from a sitemap or sitemap index.

    A ``urlset`` document contributes the normalized form of each ``<loc>``
    it contains; a ``<loc>`` ending in ``.xml`` is treated as a nested sitemap
    and crawled instead of being recorded. A ``sitemapindex`` document
    contributes the URLs of every sitemap it lists. Each sitemap is fetched
    at most once per crawl, tracked through the module-level
    ``visited_sitemaps`` set.

    Args:
        url: Sitemap location; may be relative when *base* is given.
        base: URL that a relative *url* is resolved against, or ``None`` to
            use *url* unchanged.

    Returns:
        A set of normalized URL strings (path plus optional query). Empty
        when the sitemap was already visited, could not be fetched, or has
        an unrecognised root element.
    """
    # Resolve before the visited check so the lookup and the recorded entry
    # use the same key; otherwise a relative reference is never recognised as
    # already crawled and a self-referencing sitemap recurses without end.
    full_url = url if base is None else urljoin(base, url)
    if full_url in visited_sitemaps:
        return set()
    visited_sitemaps.add(full_url)

    print(f"π Visiting: {full_url}")
    root, resolved_url = fetch_xml(full_url)
    if root is None:
        return set()

    urls = set()

    if root.tag.endswith("urlset"):
        # "{*}" matches <loc> in any XML namespace.
        for loc in root.findall(".//{*}loc"):
            href = (loc.text or "").strip()
            if not href:
                continue
            if href.endswith(".xml"):
                print(f" βͺ Treating {href} as nested sitemap")
                urls |= collect_urls_from_sitemap(href, base=resolved_url)
            else:
                normalized = normalize_url(href)
                if normalized:
                    urls.add(normalized)
        print(f" β³ Final URLs collected: {len(urls)}")
        return urls

    if root.tag.endswith("sitemapindex"):
        for sitemap in root.findall(".//{*}sitemap"):
            loc = sitemap.find("{*}loc")
            if loc is not None and loc.text:
                urls |= collect_urls_from_sitemap(
                    loc.text.strip(), base=resolved_url
                )
        return urls

    print(f"β Unknown root tag: {root.tag}")
    return set()
def compare_sitemaps(flat_url, index_url):
    """Crawl two sitemaps and print how their URL sets differ.

    Both sitemaps are crawled with ``collect_urls_from_sitemap`` and compared
    on normalized path-plus-query strings, so the two may live on different
    hosts. Up to 20 entries of each difference are printed.

    Args:
        flat_url: URL of the reference ("flat") sitemap.
        index_url: URL of the split sitemap or sitemap index to check.

    Returns:
        None. All results are written to stdout.
    """
    # NOTE(review): the leading glyphs in the messages below look like
    # mis-encoded emoji; they are kept exactly as found.
    print("π₯ Fetching full sitemap...")
    # Start every crawl from a clean slate; entries left over from an earlier
    # run would make an already-seen sitemap return an empty set.
    visited_sitemaps.clear()
    flat_urls = collect_urls_from_sitemap(flat_url)
    print(f"β Total in flat sitemap: {len(flat_urls)}\n")

    print("π₯ Fetching split sitemap recursively...")
    visited_sitemaps.clear()
    split_urls = collect_urls_from_sitemap(index_url)
    print(f"β Total in split sitemaps: {len(split_urls)}\n")

    missing = flat_urls - split_urls
    extra = split_urls - flat_urls

    if missing:
        print(f"β Missing URLs in split sitemap ({len(missing)}):")
        _print_url_sample(missing)
    else:
        print("β No missing URLs from the flat sitemap!")

    if extra:
        print(f"\nβ οΈ Extra URLs in split sitemap ({len(extra)}):")
        _print_url_sample(extra)

    if not missing and not extra:
        print("π All URLs from the flat sitemap are covered!")


def _print_url_sample(urls, limit=20):
    """Print the first *limit* URLs in sorted order, then a count of the rest."""
    for url in sorted(urls)[:limit]:
        print(f" [{url}]")
    if len(urls) > limit:
        print(f" ...and {len(urls) - limit} more")
# Example usage: compare a single flat sitemap against a split one.
# Replace both placeholder URLs with real sitemap locations before running.
if __name__ == "__main__":
    compare_sitemaps(
        flat_url='https://example.com/sitemap-a.xml',
        index_url='https://example.com/sitemap-b.xml',
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment