Skip to content

Instantly share code, notes, and snippets.

@mara-schulke
Last active September 7, 2020 19:20
Show Gist options
  • Save mara-schulke/467a61a02966738d0b1cdbb933a0b464 to your computer and use it in GitHub Desktop.
Save mara-schulke/467a61a02966738d0b1cdbb933a0b464 to your computer and use it in GitHub Desktop.
web scraper for moccu.com
import asyncio
from collections import Counter

import requests
from bs4 import BeautifulSoup
# Root URL of the site to scrape; main() appends '/sitemap.xml' to it.
base_url = 'https://moccu.com'
def scrape_page_for_elements(html):
    """Collect the ``content-*`` CSS class names used on a parsed page.

    Every element carrying the CSS class ``content`` is inspected, and each
    of its class names that contains the substring ``'content-'`` is
    reported.

    Args:
        html: Parsed document exposing ``find_all`` (a ``BeautifulSoup``
            object in this script).

    Returns:
        A list of matching class names, one entry per occurrence, in the
        order the elements are returned by ``find_all``. Duplicates are
        kept on purpose so that callers can count them.
    """
    return [
        class_name
        for element in html.find_all(class_='content')
        for class_name in element['class']
        # The bare 'content' marker class itself does not contain
        # 'content-' and is therefore not reported.
        if 'content-' in class_name
    ]
def format_element_occurance(name, occurences):
    """Format one class name and its count as ``'<label> -> <count>'``.

    Args:
        name: CSS class name, usually starting with ``'content-'``.
        occurences: How often the class was seen.

    Returns:
        The display line. A leading ``'content-'`` prefix is dropped from
        the label; any later ``'content-'`` inside the name is kept so that
        distinct class names never collapse into the same label.
    """
    prefix = 'content-'
    # Strip the prefix only at the start; str.replace would also remove
    # occurrences in the middle of the name and mangle the label.
    label = name[len(prefix):] if name.startswith(prefix) else name
    return f'{label} -> {occurences}'
class PageSummary:
    """Content-element statistics for one page (or an aggregate of pages)."""

    def __init__(self, location, elements):
        """Store the page label and its raw element list.

        Args:
            location: Label printed as the heading of the summary (a page
                URL, or any other title for an aggregate).
            elements: Class names found on the page, one entry per
                occurrence (duplicates included).
        """
        self.location = location
        self.elements = elements

    def elements_into_occurences(self):
        """Count how often each element name occurs.

        Returns:
            A list of ``(name, count)`` tuples sorted by descending count.
            Names with equal counts keep the order in which they first
            appear in ``self.elements``.
        """
        return Counter(self.elements).most_common()

    def __str__(self):
        """Render the heading, one tab-indented line per element, and a
        trailing blank line."""
        lines = [self.location]
        lines.extend(
            '\t' + format_element_occurance(name, count)
            for name, count in self.elements_into_occurences()
        )
        # Every line ends with a newline, plus one extra blank line that
        # separates consecutive summaries when printed.
        return '\n'.join(lines) + '\n\n'
# Seconds to wait on a single HTTP request before giving up, so that one
# unresponsive page cannot hang the whole run.
_REQUEST_TIMEOUT = 30


def _fetch_text(url):
    """Download *url* (blocking, with a timeout) and return the body text."""
    return requests.get(url, timeout=_REQUEST_TIMEOUT).text


async def main():
    """Scrape every page listed in the sitemap and print element statistics.

    Downloads ``base_url + '/sitemap.xml'``, fetches each listed page,
    prints one ``PageSummary`` per page, and finally prints a summary
    aggregated over all pages.

    Raises:
        requests.exceptions.RequestException: If a download fails or
            exceeds the timeout.
    """
    loop = asyncio.get_running_loop()

    # requests is blocking, so every download runs in the default executor
    # to keep the event loop free.
    sitemap_xml = await loop.run_in_executor(
        None, _fetch_text, base_url + '/sitemap.xml'
    )
    sitemap = BeautifulSoup(sitemap_xml, features='html.parser')

    # Ignore <url> entries without a usable <loc> instead of crashing on
    # them; surrounding whitespace is not part of the URL.
    locations = [
        url.loc.text.strip()
        for url in sitemap.find_all('url')
        if url.loc is not None and url.loc.text.strip()
    ]

    # Fetch all pages concurrently; gather() returns the results in the
    # same order as ``locations``.
    pages = await asyncio.gather(
        *(loop.run_in_executor(None, _fetch_text, loc) for loc in locations)
    )

    # Count occurrences per page.
    summaries = [
        PageSummary(
            loc,
            scrape_page_for_elements(
                BeautifulSoup(page, features='html.parser')
            ),
        )
        for loc, page in zip(locations, pages)
    ]
    for summary in summaries:
        print(summary)

    # Count occurrences across all pages.
    all_content_elements = [
        el for summary in summaries for el in summary.elements
    ]
    print(PageSummary('Alle Seiten', all_content_elements))
if __name__ == '__main__':
    # asyncio.run() creates a fresh event loop, runs main() to completion
    # and closes the loop afterwards. Calling asyncio.get_event_loop()
    # without a running loop is deprecated and fails on recent Python
    # versions.
    asyncio.run(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment