Last active
October 8, 2019 13:54
Revisions
-
vpetersson revised this gist
Apr 1, 2015 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -16,14 +16,14 @@ def get_sitemap(url): if get_url.status_code == 200: return get_url.text else: print 'Unable to fetch sitemap: %s.' % url def process_sitemap(s): soup = BeautifulSoup(s) result = [] for loc in soup.findAll('loc'): result.append(loc.text) return result -
vpetersson revised this gist
Apr 1, 2015 . 1 changed file with 2 additions and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -2,7 +2,8 @@ # -*- coding: utf-8 -*- """ Inspired by Craig Addyman (http://www.craigaddyman.com/parse-an-xml-sitemap-with-python/) Enhanced by Viktor Petersson (http://viktorpetersson.com) / @vpetersson """ from bs4 import BeautifulSoup -
vpetersson revised this gist
Apr 1, 2015 . 1 changed file with 48 additions and 7 deletions.There are no files selected for viewing
from bs4 import BeautifulSoup
import requests


def get_sitemap(url):
    """Fetch a sitemap and return its body as text.

    Args:
        url: Address of the sitemap to download.

    Returns:
        The response text when the server answers with HTTP 200,
        otherwise ``None`` (a message is printed in that case).
    """
    try:
        # A timeout keeps one unresponsive host from hanging the whole crawl.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Network-level failures take the same "unable to fetch" path as a
        # non-200 status instead of aborting the run.
        response = None

    if response is not None and response.status_code == 200:
        return response.text

    # A single-argument print() call is valid on both Python 2 and Python 3.
    print("Unable to fetch sitemap: %s." % url)
    return None


def process_sitemap(s):
    """Extract the text of every ``<loc>`` element in a sitemap document.

    Args:
        s: Sitemap markup as text; may be ``None`` or empty when the
            fetch failed.

    Returns:
        A list with the whitespace-stripped text of each ``<loc>`` tag,
        in document order. Empty when there is nothing to parse.
    """
    if not s:
        # get_sitemap() yields None on failure; BeautifulSoup(None) raises.
        return []

    # Naming the parser keeps results identical across machines and avoids
    # bs4's "no parser was explicitly specified" warning.
    soup = BeautifulSoup(s, 'html.parser')

    # Pretty-printed sitemaps pad the URL with whitespace, which would
    # otherwise defeat the '.xml' suffix check in is_sub_sitemap().
    return [loc.text.strip() for loc in soup.findAll('loc')]


def is_sub_sitemap(s):
    """Return True when *s* looks like a link to another sitemap file.

    A URL qualifies when it ends with ``.xml`` and contains ``sitemap``.
    """
    return s.endswith('.xml') and 'sitemap' in s


def parse_sitemap(s):
    """Collect every page URL reachable from a sitemap document.

    Entries that look like nested sitemaps are downloaded and expanded;
    everything else is reported as a page URL.

    Args:
        s: Markup of the top-level sitemap (``None`` yields an empty list).

    Returns:
        A list of page URLs. Entries are taken from the end of the pending
        list, so the order is not document order.
    """
    pending = process_sitemap(s)
    visited = set()
    result = []

    while pending:
        candidate = pending.pop()

        if not is_sub_sitemap(candidate):
            result.append(candidate)
            continue

        if candidate in visited:
            # Sitemaps that reference each other would otherwise loop forever.
            continue

        visited.add(candidate)
        pending.extend(process_sitemap(get_sitemap(candidate)))

    return result


def main():
    """Print every page URL found via the CloudSigma sitemap, one per line."""
    sitemap = get_sitemap('https://www.cloudsigma.com/sitemap.xml')
    print('\n'.join(parse_sitemap(sitemap)))


if __name__ == '__main__':
    main()
vpetersson revised this gist
Mar 31, 2015 . 1 changed file with 2 additions and 2 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -14,7 +14,7 @@ if get_url.status_code == 200: soup = BeautifulSoup(get_url.text) for loc in soup.findAll("loc"): print loc.text else: print "Unable to fetch sitemap." -
vpetersson revised this gist
Mar 31, 2015 . No changes.There are no files selected for viewing
-
vpetersson created this gist
Mar 31, 2015 .There are no files selected for viewing
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
Based on http://www.craigaddyman.com/parse-an-xml-sitemap-with-python/

Fetches a sitemap and prints the text of every <loc> element it contains.
"""

from bs4 import BeautifulSoup
import requests

url = "https://www.domain.com/sitemap.xml"
get_url = requests.get(url)

if get_url.status_code == 200:
    # Naming the parser keeps results identical across machines and avoids
    # bs4's "no parser was explicitly specified" warning.
    soup = BeautifulSoup(get_url.text, "html.parser")

    # A distinct loop name keeps the module-level `url` pointing at the
    # sitemap address instead of rebinding it to each tag.
    for loc in soup.findAll("loc"):
        # Single-argument print() is valid on both Python 2 and Python 3.
        print(loc.text)

else:
    print("Unable to fetch sitemap.")