Example Code used in my Metis Investigation Presentation
"""Better Exceptions and Loguru | |
Metis Investigation by Moritz Eilfort | |
January 28th, 2019 | |
Example code for presentation purposes. | |
Code: https://gist.github.com/JimFawkes/76a649c7bdf8bfbbc2b2051c98995789 | |
Summary: https://gist.github.com/JimFawkes/e5f767288e6d8e2df8fa53b5862db9d6 | |
""" | |
import random

from loguru import logger

logger.add("logs/my_first_log_file.log")


def scrape_data(url):
    """Get data from a website."""
    data_option_1 = {
        "status_code": 200.0,
        "content": " Yeah#it#worked.\n# ",
        "url": url,
    }
    data_option_2 = {"status_code": 200, "content": None, "url": url}
    data_option_3 = {
        "status_code": 403,
        "error": {"msg": "API RATE LIMIT EXCEEDED!"},
        "url": url,
    }
    return random.choice([data_option_1, data_option_2, data_option_3])


def clean_content(content):
    """Remove leading and trailing whitespace and strange characters."""
    data_content = content.replace("#", " ").replace("\n", "").strip()
    return data_content


def clean(data):
    """Clean the status_code and content."""
    data["status_code"] = int(data["status_code"])
    data["content"] = clean_content(data["content"])
    return data


def get_clean_data_from_url(url="www.some-website.com"):
    """Run our entire pipeline.

    1. Get the data by scraping the url
    2. Clean the data
    3. Return the data
    """
    data = scrape_data(url)
    clean_data = clean(data)
    return clean_data


def scrape_all_pages(pages):
    """Scrape all sub-pages of a website."""
    results = []
    for page in pages:
        logger.info(f"Get data for page {page}")
        try:
            clean_data = get_clean_data_from_url(page)
            results.append((page, clean_data))
        except AttributeError as e:
            logger.warning(
                f"Caught an AttributeError when retrieving website: {page}. Re-trying..."
            )
            results += scrape_all_pages([page])
        except KeyError as e:
            logger.exception(
                f"Caught a KeyError when retrieving website: {page}. Ignoring this site. Solve later..."
            )
            # logger.error(e)
            continue
    return results
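

# Why the two handlers above log differently: logger.exception() logs at ERROR
# level and appends the current traceback, so the KeyError is recorded with its
# full stack. logger.warning() records only the message, so the AttributeError's
# traceback is not kept; the retry is expected to eventually succeed instead.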


if __name__ == "__main__":
    pages = [
        "www.thisismetis.com",
        "www.google.com",
        "www.github.com",
        "www.stackoverflow.com",
        "www.some-website.com",
    ]
    results = scrape_all_pages(pages)
    print(f"Got {len(results)}/{len(pages)} results:")
    for result in results:
        print(f"{result[0]} - {result[1]}")