Skip to content

Instantly share code, notes, and snippets.

@vpetersson
Last active October 8, 2019 13:54

Revisions

  1. vpetersson revised this gist Apr 1, 2015. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -16,14 +16,14 @@ def get_sitemap(url):
    if get_url.status_code == 200:
    return get_url.text
    else:
    print "Unable to fetch sitemap: %s." % url
    print 'Unable to fetch sitemap: %s.' % url


    def process_sitemap(s):
    soup = BeautifulSoup(s)
    result = []

    for loc in soup.findAll("loc"):
    for loc in soup.findAll('loc'):
    result.append(loc.text)

    return result
  2. vpetersson revised this gist Apr 1, 2015. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -2,7 +2,8 @@
    # -*- coding: utf-8 -*-

    """
    Based on http://www.craigaddyman.com/parse-an-xml-sitemap-with-python/
    Inspired by Craig Addyman (http://www.craigaddyman.com/parse-an-xml-sitemap-with-python/)
    Enhanced by Viktor Petersson (http://viktorpetersson.com) / @vpetersson
    """

    from bs4 import BeautifulSoup
  3. vpetersson revised this gist Apr 1, 2015. 1 changed file with 48 additions and 7 deletions.
    55 changes: 48 additions & 7 deletions gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -8,13 +8,54 @@
    from bs4 import BeautifulSoup
    import requests

    url = "https://www.domain.com/sitemap.xml"
    get_url = requests.get(url)

    if get_url.status_code == 200:
    soup = BeautifulSoup(get_url.text)
    def get_sitemap(url, timeout=30):
        """Fetch a sitemap over HTTP and return its body as text.

        Args:
            url: Address of the sitemap to download.
            timeout: Seconds to wait for the server before giving up, so an
                unresponsive host cannot block the caller forever.

        Returns:
            The response body when the server answers with status 200,
            otherwise None (after printing a message naming the URL).
        """
        response = requests.get(url, timeout=timeout)

        if response.status_code == 200:
            return response.text

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Unable to fetch sitemap: %s.' % url)
        return None


    def process_sitemap(s):
    soup = BeautifulSoup(s)
    result = []

    for loc in soup.findAll("loc"):
    print loc.text
    else:
    print "Unable to fetch sitemap."
    result.append(loc.text)

    return result


    def is_sub_sitemap(s):
        """Return True when the URL string looks like a nested sitemap.

        A URL is treated as a nested sitemap when it ends with '.xml' and
        contains the substring 'sitemap'; anything else is treated as a
        regular page URL.

        Args:
            s: URL string taken from a sitemap entry.

        Returns:
            True or False.
        """
        return s.endswith('.xml') and 'sitemap' in s


    def parse_sitemap(s):
        """Collect every page URL reachable from a sitemap document.

        Entries that look like nested sitemaps (see is_sub_sitemap) are
        downloaded with get_sitemap and expanded in turn; all other entries
        are returned as page URLs.

        Args:
            s: Text of the root sitemap document.

        Returns:
            A list of page URLs. Entries are taken from the end of the
            pending list, so the order is not the document order.
        """
        pending = process_sitemap(s)
        visited = set()
        result = []

        while pending:
            candidate = pending.pop()

            if not is_sub_sitemap(candidate):
                result.append(candidate)
                continue

            # A sitemap listed twice, or one that references itself, would
            # otherwise be fetched again and again without terminating.
            if candidate in visited:
                continue
            visited.add(candidate)

            sub_sitemap = get_sitemap(candidate)
            if sub_sitemap is None:
                # get_sitemap yields nothing when the download fails; skip
                # this entry instead of handing a non-document to the parser.
                continue

            pending.extend(process_sitemap(sub_sitemap))

        return result


    def main():
        """Download the CloudSigma sitemap and print one page URL per line."""
        sitemap = get_sitemap('https://www.cloudsigma.com/sitemap.xml')

        # get_sitemap prints its own message and yields nothing on failure,
        # so there is nothing to parse in that case.
        if sitemap is None:
            return

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('\n'.join(parse_sitemap(sitemap)))


    if __name__ == '__main__':
        main()
  4. vpetersson revised this gist Mar 31, 2015. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -14,7 +14,7 @@
    if get_url.status_code == 200:
    soup = BeautifulSoup(get_url.text)

    for url in soup.findAll("loc"):
    print url.text
    for loc in soup.findAll("loc"):
    print loc.text
    else:
    print "Unable to fetch sitemap."
  5. vpetersson revised this gist Mar 31, 2015. No changes.
  6. vpetersson created this gist Mar 31, 2015.
    20 changes: 20 additions & 0 deletions gistfile1.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,20 @@
    #! /usr/bin/env python
    # -*- coding: utf-8 -*-

    """
    Based on http://www.craigaddyman.com/parse-an-xml-sitemap-with-python/
    """

    from bs4 import BeautifulSoup
    import requests

    # Address of the sitemap to list; replace with the real site.
    url = "https://www.domain.com/sitemap.xml"
    # Bound the request so an unresponsive server cannot hang the script.
    get_url = requests.get(url, timeout=30)

    if get_url.status_code == 200:
        soup = BeautifulSoup(get_url.text)

        # Print the text of every "loc" element. The loop variable is not
        # called `url`, so the sitemap address above is not overwritten.
        for loc in soup.findAll("loc"):
            print(loc.text)
    else:
        print("Unable to fetch sitemap.")