@awatertrevi · Created April 15, 2025 21:35
sitemap_comparer.py
"""Compare URL coverage between a flat sitemap and a split (index) sitemap,
matching URLs on path + query only so different hosts can be compared."""
import requests
import xml.etree.ElementTree as ET
from urllib.parse import urlparse, urljoin

# Sitemap URLs already visited, to avoid refetching and infinite recursion.
visited_sitemaps = set()


def normalize_url(url):
    """Return only the path and query part of a URL, for cross-host matching."""
    parsed = urlparse(url.strip())
    return parsed.path + ('?' + parsed.query if parsed.query else '')


def fetch_xml(url):
    """Fetch and parse an XML document; return (root, url), or (None, url) on failure."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return ET.fromstring(response.content), url
    except Exception as e:
        print(f"⚠️ Failed to fetch {url}: {e}")
        return None, url


def collect_urls_from_sitemap(url, base=None):
    """Recursively collect normalized page URLs from a sitemap or sitemap index."""
    # Resolve relative locations against the parent sitemap *before* the
    # visited check, so the same sitemap is never fetched twice.
    full_url = url if base is None else urljoin(base, url)
    if full_url in visited_sitemaps:
        return set()
    visited_sitemaps.add(full_url)
    print(f"🔍 Visiting: {full_url}")

    root, resolved_url = fetch_xml(full_url)
    if root is None:
        return set()

    urls = set()
    if root.tag.endswith("urlset"):
        for loc in root.findall(".//{*}loc"):
            if not loc.text:
                continue
            href = loc.text.strip()
            if not href:
                continue
            if href.endswith(".xml"):
                # Some sitemaps list other .xml sitemaps inside a <urlset>.
                print(f"  ↪ Treating {href} as nested sitemap")
                urls |= collect_urls_from_sitemap(href, base=resolved_url)
            else:
                normalized = normalize_url(href)
                if normalized:
                    urls.add(normalized)
        print(f"  ↳ Final URLs collected: {len(urls)}")
        return urls
    elif root.tag.endswith("sitemapindex"):
        for sitemap in root.findall(".//{*}sitemap"):
            loc = sitemap.find("{*}loc")
            if loc is not None and loc.text:
                urls |= collect_urls_from_sitemap(loc.text.strip(), base=resolved_url)
        return urls
    else:
        print(f"❌ Unknown root tag: {root.tag}")
        return set()


def compare_sitemaps(flat_url, index_url):
    print("📥 Fetching full sitemap...")
    flat_urls = collect_urls_from_sitemap(flat_url)
    print(f"✅ Total in flat sitemap: {len(flat_urls)}\n")

    print("📥 Fetching split sitemap recursively...")
    visited_sitemaps.clear()
    split_urls = collect_urls_from_sitemap(index_url)
    print(f"✅ Total in split sitemaps: {len(split_urls)}\n")

    missing = flat_urls - split_urls
    extra = split_urls - flat_urls

    if missing:
        print(f"❌ Missing URLs in split sitemap ({len(missing)}):")
        for url in sorted(missing)[:20]:
            print(f"  [{url}]")
        if len(missing) > 20:
            print(f"  ...and {len(missing) - 20} more")
    else:
        print("✅ No missing URLs from the flat sitemap!")

    if extra:
        print(f"\n⚠️ Extra URLs in split sitemap ({len(extra)}):")
        for url in sorted(extra)[:20]:
            print(f"  [{url}]")
        if len(extra) > 20:
            print(f"  ...and {len(extra) - 20} more")

    if not missing and not extra:
        print("🎉 All URLs from the flat sitemap are covered!")


# Example usage
if __name__ == "__main__":
    compare_sitemaps(
        'https://example.com/sitemap-a.xml',
        'https://example.com/sitemap-b.xml'
    )
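
A quick way to sanity-check the normalization step is to import the script and feed normalize_url a couple of URLs by hand. This is a minimal sketch: it assumes the file is saved as sitemap_comparer.py (the name shown above), and the URLs are hypothetical examples. Both hosts collapse to the same path-and-query key, which is what lets the two sitemaps being compared live on different domains.

from sitemap_comparer import normalize_url

# Hypothetical URLs; both normalize to the same key.
print(normalize_url("https://example.com/blog/post-1?page=2"))          # /blog/post-1?page=2
print(normalize_url("https://staging.example.com/blog/post-1?page=2"))  # /blog/post-1?page=2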