Python 3 script that parses a Liferay sitemap and outputs tab-separated values with the source sitemap URL (optional), page titles (optional), and the corresponding friendly URLs.
#!/usr/bin/env python3
"""
sitemap_friendly.py

Parse a sitemapindex XML (local file or remote URL), fetch each child
sitemap (<urlset>), extract friendly page URLs from each <loc>, and
optionally include the page <title>.

Supports an optional HTTP(S) proxy, sanitizes unescaped ampersands, can
ignore one or more specified language prefixes (e.g. /ast/, /en/),
ignores Liferay detail pages containing '/-/' in the path, can include
the source sitemap URL via --with-sitemap and page titles via
--with-title, and writes the output as tab-separated values.
"""
import argparse
import sys
import csv
import re
import xml.etree.ElementTree as ET
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

SITEMAP_NS = '{http://www.sitemaps.org/schemas/sitemap/0.9}'


def sanitize_xml(text):
    """
    Escape unescaped ampersands in query parameters to ensure well-formed XML.
    """
    # Replace a bare '&' (not already part of an entity) that precedes a
    # query-parameter name with the escaped entity '&amp;'.
    return re.sub(r'&(?!amp;|lt;|gt;|apos;|quot;)(?=[\w]+=)', '&amp;', text)


def fetch_xml(source, session):
    """
    Retrieve raw XML text from a URL or local file and sanitize it.
    """
    try:
        if source.startswith(('http://', 'https://')):
            resp = session.get(source, timeout=15)
            resp.raise_for_status()
            raw = resp.content.decode('utf-8', errors='replace')
        else:
            with open(source, 'r', encoding='utf-8') as f:
                raw = f.read()
        return sanitize_xml(raw)
    except requests.RequestException as e:
        print(f"Error fetching URL '{source}': {e}", file=sys.stderr)
        return None
    except IOError as e:
        print(f"Error reading file '{source}': {e}", file=sys.stderr)
        return None


def parse_sitemapindex(xml_text):
    """
    Parse a <sitemapindex> XML string and return all child sitemap URLs.
    """
    root = ET.fromstring(xml_text)
    return [elem.text for elem in root.findall(f'.//{SITEMAP_NS}loc') if elem.text]


def parse_urlset(xml_text):
    """
    Parse a <urlset> XML string and return all page URLs in <loc>.
    """
    root = ET.fromstring(xml_text)
    return [elem.text for elem in root.findall(f'.//{SITEMAP_NS}loc') if elem.text]


def get_title(url, session):
    """
    Fetch a page URL and return its <title> text (stripped), or an empty
    string on failure.
    """
    try:
        resp = session.get(url, timeout=15)
        resp.raise_for_status()
    except requests.RequestException:
        return ''
    soup = BeautifulSoup(resp.text, 'html.parser')
    if soup.title and soup.title.string:
        return soup.title.string.strip()
    return ''


def main():
    parser = argparse.ArgumentParser(
        description='Extract friendly URLs (and optional titles) from a sitemapindex')
    parser.add_argument('source', help='Path or URL of the sitemapindex XML')
    parser.add_argument('--proxy',
                        help='HTTP(S) proxy URL (e.g. http://user:pass@proxy:port)')
    parser.add_argument('--ignore-lang', nargs='+',
                        help='One or more language codes to ignore '
                             '(e.g. ast en for paths starting /ast/, /en/)')
    parser.add_argument('--with-sitemap', action='store_true',
                        help='Include the source sitemap URL alongside each page URL')
    parser.add_argument('--with-title', action='store_true',
                        help='Fetch each page and include its HTML <title> in the output')
    parser.add_argument('--output',
                        help='Output TSV file to write results (default stdout)')
    args = parser.parse_args()

    session = requests.Session()
    if args.proxy:
        session.proxies.update({'http': args.proxy, 'https': args.proxy})

    xml_index = fetch_xml(args.source, session)
    if xml_index is None:
        sys.exit(1)
    sitemap_urls = parse_sitemapindex(xml_index)

    out = open(args.output, 'w', newline='', encoding='utf-8') if args.output else sys.stdout
    writer = csv.writer(out, delimiter='\t')

    # Build header
    header = []
    if args.with_sitemap:
        header.append('sitemap_url')
    if args.with_title:
        header.append('title')
    header.append('page_url')
    writer.writerow(header)

    seen = set()
    for sitemap_url in sitemap_urls:
        xml_child = fetch_xml(sitemap_url, session)
        if not xml_child:
            continue
        try:
            page_urls = parse_urlset(xml_child)
        except ET.ParseError as e:
            print(f"Failed to parse child sitemap '{sitemap_url}': {e}", file=sys.stderr)
            continue
        for page_url in page_urls:
            if page_url in seen:
                continue
            # Ignore specified languages
            if args.ignore_lang:
                path = urlparse(page_url).path.lstrip('/')
                if any(path.startswith(f"{lang}/") for lang in args.ignore_lang):
                    continue
            # Ignore Liferay detail pages
            if '/-/' in urlparse(page_url).path:
                continue
            seen.add(page_url)

            row = []
            if args.with_sitemap:
                row.append(sitemap_url)
            if args.with_title:
                title = get_title(page_url, session)
                row.append(title)
            row.append(page_url)
            writer.writerow(row)

    if args.output:
        out.close()


if __name__ == '__main__':
    main()
sitemap.xml Analyzer – Python

A Python 3 script that you run by passing the path to the sitemap.xml file as a parameter, with several options to adjust its behavior.

Introduction

A sitemap is a protocol (https://www.sitemaps.org/protocol.html) that lets a website expose listings of its available pages. Liferay dynamically generates a sitemap.xml based on the portal's pages, and it can be customized via portal settings and properties. By default, it's exposed at /sitemap.xml (for example, see Asturias's site at https://www.asturias.es/sitemap.xml).
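For reference, a sitemapindex points at one or more child sitemaps, and each child is a <urlset> of page URLs. A minimal sketch of the two shapes, using illustrative example.com URLs and the same ElementTree approach as the script above:

# Both sitemap shapes are read the same way: collect the text of every <loc>.
# The example.com URLs are illustrative only.
import xml.etree.ElementTree as ET

SITEMAP_NS = '{http://www.sitemaps.org/schemas/sitemap/0.9}'

SITEMAPINDEX = '''<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://example.com/sitemap-pages.xml</loc></sitemap>
</sitemapindex>'''

URLSET = '''<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/web/guest/home</loc></url>
</urlset>'''

for doc in (SITEMAPINDEX, URLSET):
    root = ET.fromstring(doc)
    print([el.text for el in root.findall(f'.//{SITEMAP_NS}loc')])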
Scenario

Analyzing the sitemap.xml is useful when you don't have credentials or there isn't a "Site Map" component available to show the full page tree of the portal. It gives you a clear view of how many accessible pages exist in a portal, along with their URLs, which aids in analysis.

Tool

The script above, sitemap_friendly.py, implements this analysis.
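Its helper functions can also be used programmatically. A minimal sketch, assuming the script is saved as sitemap_friendly.py on the import path and using the Asturias sitemap cited above (no proxy configured):

# Walk the sitemapindex and print every page URL, reusing the script's helpers.
import requests
from sitemap_friendly import fetch_xml, parse_sitemapindex, parse_urlset

session = requests.Session()
xml_index = fetch_xml('https://www.asturias.es/sitemap.xml', session)
if xml_index:
    for child_url in parse_sitemapindex(xml_index):
        xml_child = fetch_xml(child_url, session)
        if xml_child:
            for page_url in parse_urlset(xml_child):
                print(page_url)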
Execution Options

The script offers several flags to make its use more convenient:

--ignore-lang
Ignore URLs whose path begins with any of the specified language codes (space-separated). Example: --ignore-lang ast en skips URLs under /ast/ and /en/.

--with-sitemap
Include the sitemap URL that contained each page (column sitemap_url) in the output.

--with-title
Fetch the <title> text of each page and include it in the title column.

--output friendly.tsv
Specify the output file in TSV (tab-separated values) format.
Example run:

liferay_sitemap_analyzer.py "C:\work\MyLiferaySite\sitemap.xml" \
    --ignore-lang ar fr pt en sw \
    --output friendly_sitemap.tsv \
    --with-title \
    --with-sitemap
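The resulting file is plain TSV with a header row, so it is easy to post-process. A minimal sketch of consuming the output of the example run above (which includes the sitemap_url and title columns):

# Read the TSV produced by the example run and print title/URL pairs.
import csv

with open('friendly_sitemap.tsv', newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f, delimiter='\t')
    for row in reader:
        print(row['title'], '->', row['page_url'])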
Additional Notes

- The script ignores URLs containing /-/, which are typically internal Liferay detail pages.
- The sitemap.xml may not include private, hidden, or inaccessible pages. Therefore, it isn't the best tool for exploring those areas of a portal (e.g. a user's personal section).
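To illustrate the first note, the detail-page filter simply drops any URL whose path contains /-/. A small sketch with hypothetical Liferay URLs:

# Keep only friendly page URLs; the asset_publisher path is a typical
# (hypothetical) Liferay detail URL that the script would discard.
from urllib.parse import urlparse

urls = [
    'https://example.com/web/guest/news',
    'https://example.com/web/guest/news/-/asset_publisher/abc/content/some-article',
]
print([u for u in urls if '/-/' not in urlparse(u).path])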