Last active
September 7, 2020 19:20
-
-
Save mara-schulke/467a61a02966738d0b1cdbb933a0b464 to your computer and use it in GitHub Desktop.
web scraper for moccu.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import asyncio
from collections import Counter

import requests
from bs4 import BeautifulSoup
# Root of the site to scrape; the sitemap is requested from base_url + '/sitemap.xml'.
base_url = 'https://moccu.com'
def scrape_page_for_elements(html):
    """Collect the ``content-*`` class names of every ``content`` element.

    Args:
        html: Parsed document exposing BeautifulSoup's ``find_all``.

    Returns:
        A list of class names containing ``'content-'``, one entry per
        occurrence, in the order ``find_all`` yields the elements. A name
        carried by several elements therefore appears several times.
    """
    elements = []
    for el in html.find_all(class_='content'):
        # el['class'] is iterated as a sequence of class-name strings; the
        # bare 'content' marker itself does not contain 'content-' and is dropped
        elements.extend(name for name in el['class'] if 'content-' in name)
    return elements
def format_element_occurance(name, occurences):
    """Return a display line of the form ``'<name> -> <count>'``.

    Every ``'content-'`` substring is removed from ``name`` before it is
    shown; ``occurences`` is rendered with ``str()`` formatting.
    """
    return f"{name.replace('content-', '')} -> {occurences}"
class PageSummary:
    """Tally of content-element names collected for one location.

    ``location`` is the heading printed for the summary and ``elements`` is
    the list of element names, one entry per occurrence.
    """

    def __init__(self, location, elements):
        self.location = location
        self.elements = elements

    def elements_into_occurences(self):
        """Return ``(name, count)`` pairs sorted by count, highest first.

        Names with equal counts keep the order in which they first appear
        in ``self.elements``.
        """
        return Counter(self.elements).most_common()

    def __str__(self):
        """Render the heading, one tab-indented line per element, then a blank line."""
        parts = [self.location + '\n']
        parts.extend(
            '\t' + format_element_occurance(name, count) + '\n'
            for name, count in self.elements_into_occurences()
        )
        # trailing empty line separates consecutive summaries when printed
        parts.append('\n')
        return ''.join(parts)
async def main():
    """Scrape every page listed in the sitemap and print content-element counts.

    Prints one summary per page, in sitemap order, followed by a single
    aggregate summary over all pages.
    """
    loop = asyncio.get_running_loop()

    # requests is blocking, so every HTTP call is pushed to the default
    # executor instead of stalling the event loop
    res = await loop.run_in_executor(None, requests.get, base_url + '/sitemap.xml')
    sitemap = BeautifulSoup(res.text, features='html.parser')
    # skip <url> entries without a <loc> child rather than failing on None
    locations = [
        url.loc.text.strip()
        for url in sitemap.find_all('url')
        if url.loc is not None
    ]

    # start all page downloads together; gather returns results in input order
    responses = await asyncio.gather(
        *(loop.run_in_executor(None, requests.get, location) for location in locations)
    )

    # count occurrences per page
    summaries = []
    for location, res in zip(locations, responses):
        html = BeautifulSoup(res.text, features='html.parser')
        summaries.append(PageSummary(location, scrape_page_for_elements(html)))

    for summary in summaries:
        print(summary)

    # count occurrences across all pages
    all_content_elements = [el for summary in summaries for el in summary.elements]
    print(PageSummary('Alle Seiten', all_content_elements))
if __name__ == '__main__':
    # asyncio.run creates the event loop, runs main() to completion and
    # closes the loop afterwards
    asyncio.run(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment