vpetersson · October 8, 2019 13:54 · Apr 1, 2015 · Apr 1, 2015 · Apr 1, 2015 · Mar 31, 2015
diff --git a/gistfile1.py b/gistfile1.py
@@ -16,14 +16,14 @@ def get_sitemap(url):
     if get_url.status_code == 200:
         return get_url.text
     else:
-        print "Unable to fetch sitemap: %s." % url
+        print 'Unable to fetch sitemap: %s.' % url
 
 
 def process_sitemap(s):
     soup = BeautifulSoup(s)
     result = []
 
-    for loc in soup.findAll("loc"):
+    for loc in soup.findAll('loc'):
         result.append(loc.text)
 
     return result

diff --git a/gistfile1.py b/gistfile1.py
@@ -2,7 +2,8 @@
 # -*- coding: utf-8 -*-
 
 """
-Based on http://www.craigaddyman.com/parse-an-xml-sitemap-with-python/
+Inspired by Craig Addyman (http://www.craigaddyman.com/parse-an-xml-sitemap-with-python/)
+Enhanced by Viktor Petersson (http://viktorpetersson.com) / @vpetersson
 """
 
 from bs4 import BeautifulSoup

diff --git a/gistfile1.py b/gistfile1.py
@@ -8,13 +8,54 @@
 from bs4 import BeautifulSoup
 import requests
 
-url = "https://www.domain.com/sitemap.xml"
-get_url = requests.get(url)
 
-if get_url.status_code == 200:
-    soup = BeautifulSoup(get_url.text)
+def get_sitemap(url):  # Fetch url; return the response text on HTTP 200.
+    get_url = requests.get(url)  # Other statuses: message printed, None returned.
+
+    if get_url.status_code == 200:
+        return get_url.text
+    else:
+        print "Unable to fetch sitemap: %s." % url
+
+
+def process_sitemap(s):
+    soup = BeautifulSoup(s)
+    result = []
 
     for loc in soup.findAll("loc"):
-        print loc.text
-else:
-    print "Unable to fetch sitemap."
+        result.append(loc.text)
+
+    return result
+
+
+def is_sub_sitemap(s):
+    """Return True if URL ``s`` ends with '.xml' and contains 'sitemap'."""
+    # Heuristic on the URL text only; nothing is fetched here.
+    # The expression is already a bool, so no if/else is needed.
+    return s.endswith('.xml') and 'sitemap' in s
+
+
+def parse_sitemap(s):
+    """Return page URLs from sitemap XML ``s``, following sub-sitemaps."""
+    pending = process_sitemap(s) if s else []  # s is None if the fetch failed
+    seen = set()  # sub-sitemaps already fetched; stops cyclic references
+    result = []
+    while pending:
+        candidate = pending.pop()
+        if not is_sub_sitemap(candidate):
+            result.append(candidate)
+        elif candidate not in seen:
+            seen.add(candidate)
+            sub = get_sitemap(candidate)  # None when the fetch fails
+            pending.extend(process_sitemap(sub) if sub else [])
+
+    return result
+
+
+def main():  # Script entry point: print every discovered page URL, one per line.
+    root_xml = get_sitemap('https://www.cloudsigma.com/sitemap.xml')
+    print '\n'.join(parse_sitemap(root_xml))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/gistfile1.py b/gistfile1.py
@@ -14,7 +14,7 @@
 if get_url.status_code == 200:
     soup = BeautifulSoup(get_url.text)
 
-    for url in soup.findAll("loc"):
-        print url.text
+    for loc in soup.findAll("loc"):
+        print loc.text
 else:
     print "Unable to fetch sitemap."
diff --git a/gistfile1.py b/gistfile1.py
@@ -0,0 +1,20 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Based on http://www.craigaddyman.com/parse-an-xml-sitemap-with-python/
+"""
+
+from bs4 import BeautifulSoup
+import requests
+
+url = "https://www.domain.com/sitemap.xml"
+get_url = requests.get(url)
+
+if get_url.status_code == 200:
+    soup = BeautifulSoup(get_url.text)
+
+    for url in soup.findAll("loc"):
+        print url.text
+else:
+    print "Unable to fetch sitemap."