@evaldeslacasa
Last active May 21, 2025 08:24
Python 3 script that parses a Liferay sitemap and outputs tab-separated values with the source sitemap URL (optional), page titles (optional) and the corresponding friendly URLs.
#!/usr/bin/env python3
"""
sitemap_friendly.py

Parse a sitemapindex XML (local file or remote URL), fetch each child
sitemap (<urlset>), and extract the friendly page URL from each <loc>.

Supports an optional HTTP(S) proxy, sanitizes unescaped ampersands, can
skip one or more language prefixes (e.g. /ast/, /en/), ignores Liferay
detail pages containing '/-/' in the path, and can include the source
sitemap URL (--with-sitemap) and the page <title> (--with-title).
Output is tab-separated values.
"""
import argparse
import csv
import re
import sys
import xml.etree.ElementTree as ET
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

SITEMAP_NS = '{http://www.sitemaps.org/schemas/sitemap/0.9}'


def sanitize_xml(text):
    """Escape unescaped ampersands in query parameters to ensure well-formed XML."""
    return re.sub(r'&(?!amp;|lt;|gt;|apos;|quot;)(?=[\w]+=)', '&amp;', text)


def fetch_xml(source, session):
    """Retrieve raw XML text from a URL or local file and sanitize it."""
    try:
        if source.startswith(('http://', 'https://')):
            resp = session.get(source, timeout=15)
            resp.raise_for_status()
            raw = resp.content.decode('utf-8', errors='replace')
        else:
            with open(source, 'r', encoding='utf-8') as f:
                raw = f.read()
        return sanitize_xml(raw)
    except requests.RequestException as e:
        print(f"Error fetching URL '{source}': {e}", file=sys.stderr)
        return None
    except IOError as e:
        print(f"Error reading file '{source}': {e}", file=sys.stderr)
        return None


def parse_sitemapindex(xml_text):
    """Parse a <sitemapindex> XML string and return all child sitemap URLs."""
    root = ET.fromstring(xml_text)
    return [elem.text for elem in root.findall(f'.//{SITEMAP_NS}loc') if elem.text]


def parse_urlset(xml_text):
    """Parse a <urlset> XML string and return all page URLs in <loc>."""
    root = ET.fromstring(xml_text)
    return [elem.text for elem in root.findall(f'.//{SITEMAP_NS}loc') if elem.text]


def get_title(url, session):
    """Fetch a page URL and return its <title> text (stripped), or empty string on failure."""
    try:
        resp = session.get(url, timeout=15)
        resp.raise_for_status()
    except requests.RequestException:
        return ''
    soup = BeautifulSoup(resp.text, 'html.parser')
    if soup.title and soup.title.string:
        return soup.title.string.strip()
    return ''


def main():
    parser = argparse.ArgumentParser(
        description='Extract friendly URLs (and optional titles) from a sitemapindex')
    parser.add_argument('source', help='Path or URL of the sitemapindex XML')
    parser.add_argument('--proxy',
                        help='HTTP(S) proxy URL (e.g. http://user:pass@proxy:port)')
    parser.add_argument('--ignore-lang', nargs='+',
                        help='One or more language codes to ignore '
                             '(e.g. "ast en" for paths starting /ast/, /en/)')
    parser.add_argument('--with-sitemap', action='store_true',
                        help='Include the source sitemap URL alongside each page URL')
    parser.add_argument('--with-title', action='store_true',
                        help='Fetch each page and include its HTML <title> in the output')
    parser.add_argument('--output',
                        help='Output TSV file to write results (default stdout)')
    args = parser.parse_args()

    session = requests.Session()
    if args.proxy:
        session.proxies.update({'http': args.proxy, 'https': args.proxy})

    xml_index = fetch_xml(args.source, session)
    if xml_index is None:
        sys.exit(1)
    try:
        sitemap_urls = parse_sitemapindex(xml_index)
    except ET.ParseError as e:
        print(f"Failed to parse sitemapindex '{args.source}': {e}", file=sys.stderr)
        sys.exit(1)

    out = open(args.output, 'w', newline='', encoding='utf-8') if args.output else sys.stdout
    writer = csv.writer(out, delimiter='\t')

    # Build header
    header = []
    if args.with_sitemap:
        header.append('sitemap_url')
    if args.with_title:
        header.append('title')
    header.append('page_url')
    writer.writerow(header)

    seen = set()
    for sitemap_url in sitemap_urls:
        xml_child = fetch_xml(sitemap_url, session)
        if not xml_child:
            continue
        try:
            page_urls = parse_urlset(xml_child)
        except ET.ParseError as e:
            print(f"Failed to parse child sitemap '{sitemap_url}': {e}", file=sys.stderr)
            continue
        for page_url in page_urls:
            if page_url in seen:
                continue
            path = urlparse(page_url).path
            # Ignore specified language prefixes
            if args.ignore_lang and any(
                    path.lstrip('/').startswith(f"{lang}/") for lang in args.ignore_lang):
                continue
            # Ignore Liferay detail pages
            if '/-/' in path:
                continue
            seen.add(page_url)
            row = []
            if args.with_sitemap:
                row.append(sitemap_url)
            if args.with_title:
                row.append(get_title(page_url, session))
            row.append(page_url)
            writer.writerow(row)

    if args.output:
        out.close()


if __name__ == '__main__':
    main()
evaldeslacasa commented May 6, 2025

sitemap.xml Analyzer – Python

A Python 3 script that you run by passing the path (or URL) of the sitemap.xml as its argument, with several options to adjust its behavior.

Introduction

A sitemap is an XML file, defined by the Sitemaps protocol (https://www.sitemaps.org/protocol.html), that lists the pages available on a website. Liferay dynamically generates a sitemap.xml from the portal’s pages, and it can be customized via portal settings and properties.

By default, it’s exposed at /sitemap.xml (for example, see Asturias’s site at https://www.asturias.es/sitemap.xml).
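For reference, the sitemapindex the script consumes is just a list of child sitemap URLs in `<loc>` elements under the Sitemaps namespace. A minimal sketch of that first parsing step, using a made-up inline index (the URLs are hypothetical):

```python
import xml.etree.ElementTree as ET

# Namespace prefix used by the Sitemaps protocol
NS = '{http://www.sitemaps.org/schemas/sitemap/0.9}'

# Hypothetical sitemapindex, shaped like the one Liferay serves at /sitemap.xml
index_xml = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://example.org/sitemap.xml?p=1</loc></sitemap>
  <sitemap><loc>https://example.org/sitemap.xml?p=2</loc></sitemap>
</sitemapindex>"""

root = ET.fromstring(index_xml)
child_sitemaps = [e.text for e in root.findall(f'.//{NS}loc') if e.text]
print(child_sitemaps)  # the child <urlset> documents the script fetches next
```

Each child URL is then fetched and parsed the same way, this time collecting page URLs from the `<urlset>`.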

Scenario

Analyzing the sitemap.xml is useful when you don’t have credentials, or when there is no “Site Map” component exposing the portal’s full page tree. It gives a clear view of how many publicly accessible pages the portal has, together with their URLs, which aids in analysis.

Tool

Execution Options

The script offers several flags to make its use more convenient:

  • --ignore-lang
    Ignore URLs whose path begins with any of the specified language codes (space-separated).
    Example:

    --ignore-lang ar fr pt en sw
  • --with-sitemap
    Include the sitemap URL that contained each page (column sitemap_url) in the output.

  • --with-title
    Fetch the <title> text of each page and include it in the title column.

  • --output friendly.tsv
    Specify the output file in TSV (tab-separated values) format.

Example run:

liferay_sitemap_analyzer.py "C:\work\MyLiferaySite\sitemap.xml" \
  --ignore-lang ar fr pt en sw \
  --output friendly_sitemap.tsv \
  --with-title \
  --with-sitemap

Additional Notes

  1. The script ignores pages whose paths contain /-/, which are typically internal Liferay detail pages.
  2. sitemap.xml may not include private, hidden, or inaccessible pages. Therefore, it isn’t the best tool for exploring those areas of a portal (e.g. a user’s personal section).
  3. The script is open for further improvements.
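The filtering described in note 1, together with the --ignore-lang flag, reduces to two checks on the URL path. A standalone sketch, with a hypothetical helper and made-up URLs:

```python
from urllib.parse import urlparse

def keep(page_url, ignore_langs=('ast', 'en')):
    """Return True if the URL survives the script's filters (hypothetical helper)."""
    path = urlparse(page_url).path
    # Drop Liferay asset-publisher detail pages, which contain '/-/'
    if '/-/' in path:
        return False
    # Drop paths starting with an ignored language code, e.g. /en/...
    if any(path.lstrip('/').startswith(f"{lang}/") for lang in ignore_langs):
        return False
    return True

print(keep('https://example.org/en/news'))         # filtered: language prefix
print(keep('https://example.org/-/some-article'))  # filtered: detail page
print(keep('https://example.org/servicios'))       # kept
```

Note that the language check only matches whole path segments (`en/`), so a page like `/enterprise` is not accidentally filtered.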
