Created
July 18, 2020 12:55
-
-
Save pablospizzamiglio/412412bc8de2f1e08cb3a2dc8f781eae to your computer and use it in GitHub Desktop.
Web Scraper that gets product information from Lenovo's e-commerce site
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from contextlib import closing | |
# pip install beautifulsoup4 requests | |
from bs4 import BeautifulSoup | |
from requests import get | |
from requests.exceptions import RequestException | |
def simple_get(url, timeout=10):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    If `is_good_response` accepts the response (status 200 with an HTML-like
    content type), return its body, otherwise return None.

    :param url: address to request.
    :param timeout: seconds to wait on the server before giving up; forwarded
        to `requests.get`. Without a timeout a stalled server would block the
        call indefinitely.
    :return: the response content, or None when the response is rejected or
        the request itself fails.
    """
    try:
        with closing(get(url, stream=True, timeout=timeout)) as response:
            if is_good_response(response):
                return response.content
            return None
    except RequestException as e:
        # Request failures (timeouts included) are logged rather than raised,
        # so callers only ever have to check for None.
        log_error("Error during requests to {0} : {1}".format(url, str(e)))
        return None
def is_good_response(response):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains "html" (case-insensitive). A response with no
    Content-Type header is treated as not HTML instead of raising KeyError.

    :param response: object exposing `status_code` and a mapping-like
        `headers` attribute.
    :return: bool
    """
    # `.get` plus the `or ""` fallback covers both a missing header and an
    # empty/None value, so `.lower()` is always called on a string.
    content_type = (response.headers.get("Content-Type") or "").lower()
    return response.status_code == 200 and "html" in content_type
def log_error(e):
    """
    Report an error.

    The current implementation simply writes `e` to standard output via
    `print`; swap the body for real logging if needed.
    """
    message = e
    print(message)
def get_product_metadata(url):
    """
    Downloads the page where the Product detail is found and returns a
    dictionary containing the relevant metadata.

    The metadata is read from the page's `<meta name="..." content="...">`
    tags, keeping only the names listed in `meta_names` below.

    :param url: address of the product detail page.
    :return: dict mapping each matched meta name to its `content` value.
    :raises Exception: if `simple_get` returned no content for `url`.
    """
    response = simple_get(url)
    if response is None:
        # Raise an exception if we failed to get any data from the url
        raise Exception("Error retrieving contents at {}".format(url))

    # Each name needs its own comma: adjacent string literals are silently
    # concatenated into a single (never-matching) entry.
    meta_names = {
        "description",
        "productcode",
        "productid",
        "productprice",
        "productsaleprice",
        "productstatus",
    }

    html = BeautifulSoup(response, "html.parser")
    return {
        meta.attrs["name"]: meta.attrs["content"]
        for meta in html.select("meta")
        # Skip tags lacking either attribute instead of raising KeyError.
        if meta.attrs.get("name") in meta_names and "content" in meta.attrs
    }
if __name__ == "__main__":
    # Example run: fetch one product page and print the scraped metadata so
    # that running the script produces visible output.
    product_url = "https://www.lenovo.com/us/en/laptops/thinkpad/thinkpad-x/ThinkPad-X1-Carbon-6th-Gen/p/22TP2TXX16G"
    print(get_product_metadata(product_url))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment