@awatertrevi · Created April 15, 2025 21:35
sitemap_comparer.py
"""Compare URL coverage between a flat sitemap and a split (index) sitemap,
matching URLs on path + query only so different hosts can be compared."""
import requests
import xml.etree.ElementTree as ET
from urllib.parse import urlparse, urljoin

# Sitemap URLs already visited, to avoid refetching and infinite recursion.
visited_sitemaps = set()


def normalize_url(url):
    """Return only the path and query part of a URL, for cross-host matching."""
    parsed = urlparse(url.strip())
    return parsed.path + ('?' + parsed.query if parsed.query else '')


def fetch_xml(url):
    """Fetch and parse an XML document; return (root, url), or (None, url) on failure."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return ET.fromstring(response.content), url
    except Exception as e:
        print(f"⚠️ Failed to fetch {url}: {e}")
        return None, url


def collect_urls_from_sitemap(url, base=None):
    """Recursively collect normalized page URLs from a sitemap or sitemap index."""
    # Resolve relative locations against the parent sitemap *before* the
    # visited check, so the same sitemap is never fetched twice.
    full_url = url if base is None else urljoin(base, url)
    if full_url in visited_sitemaps:
        return set()
    visited_sitemaps.add(full_url)
    print(f"🔍 Visiting: {full_url}")

    root, resolved_url = fetch_xml(full_url)
    if root is None:
        return set()

    urls = set()
    if root.tag.endswith("urlset"):
        for loc in root.findall(".//{*}loc"):
            if not loc.text:
                continue
            href = loc.text.strip()
            if not href:
                continue
            if href.endswith(".xml"):
                # Some sitemaps list other .xml sitemaps inside a <urlset>.
                print(f"  ↪ Treating {href} as nested sitemap")
                urls |= collect_urls_from_sitemap(href, base=resolved_url)
            else:
                normalized = normalize_url(href)
                if normalized:
                    urls.add(normalized)
        print(f"  ↳ Final URLs collected: {len(urls)}")
        return urls
    elif root.tag.endswith("sitemapindex"):
        for sitemap in root.findall(".//{*}sitemap"):
            loc = sitemap.find("{*}loc")
            if loc is not None and loc.text:
                urls |= collect_urls_from_sitemap(loc.text.strip(), base=resolved_url)
        return urls
    else:
        print(f"❌ Unknown root tag: {root.tag}")
        return set()


def compare_sitemaps(flat_url, index_url):
    print("📥 Fetching full sitemap...")
    flat_urls = collect_urls_from_sitemap(flat_url)
    print(f"✅ Total in flat sitemap: {len(flat_urls)}\n")

    print("📥 Fetching split sitemap recursively...")
    visited_sitemaps.clear()
    split_urls = collect_urls_from_sitemap(index_url)
    print(f"✅ Total in split sitemaps: {len(split_urls)}\n")

    missing = flat_urls - split_urls
    extra = split_urls - flat_urls

    if missing:
        print(f"❌ Missing URLs in split sitemap ({len(missing)}):")
        for url in sorted(missing)[:20]:
            print(f"  [{url}]")
        if len(missing) > 20:
            print(f"  ...and {len(missing) - 20} more")
    else:
        print("✅ No missing URLs from the flat sitemap!")

    if extra:
        print(f"\n⚠️ Extra URLs in split sitemap ({len(extra)}):")
        for url in sorted(extra)[:20]:
            print(f"  [{url}]")
        if len(extra) > 20:
            print(f"  ...and {len(extra) - 20} more")

    if not missing and not extra:
        print("🎉 All URLs from the flat sitemap are covered!")


# Example usage
if __name__ == "__main__":
    compare_sitemaps(
        'https://example.com/sitemap-a.xml',
        'https://example.com/sitemap-b.xml'
    )
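
A quick way to sanity-check the normalization step is to import the script and feed normalize_url a couple of URLs by hand. This is a minimal sketch: it assumes the file is saved as sitemap_comparer.py (the name shown above), and the URLs are hypothetical examples. Both hosts collapse to the same path-and-query key, which is what lets the two sitemaps being compared live on different domains.

from sitemap_comparer import normalize_url

# Hypothetical URLs; both normalize to the same key.
print(normalize_url("https://example.com/blog/post-1?page=2"))          # /blog/post-1?page=2
print(normalize_url("https://staging.example.com/blog/post-1?page=2"))  # /blog/post-1?page=2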