Created
June 11, 2018 07:10
-
-
Save julien-h2/f1f9eb7af87376bd5baf393c66daedee to your computer and use it in GitHub Desktop.
Scraping with urllib and BeautifulSoup / python3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First, use URLLIB to fetch HTML files | |
# ----------------------------------------------------------------------- | |
from urllib.request import Request, urlopen | |
from urllib.error import URLError | |
def get_html(url):
    """Fetch `url` over HTTP and return the raw HTML as bytes.

    Returns b'' on any URL error (unreachable host, HTTP error status).
    A browser-like User-Agent header is sent because some sites reject
    requests carrying urllib's default agent string.
    """
    req = Request(url,
                  data=None,
                  headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
    try:
        # The context manager guarantees the response is closed; the
        # original called urlopen() without ever closing the connection.
        with urlopen(req) as response:
            return response.read()
    except URLError as e:
        # HTTPError (a URLError subclass) has BOTH .code and .reason, so
        # .code must be tested first -- the original tested .reason first,
        # which made the "Error code" branch unreachable.
        if hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
        elif hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        # On error, simply return an empty binary string.
        print('Server not found')
        return b''
# Then, use BEAUTIFULSOUP to parse HTML | |
# ----------------------------------------------------------------------- | |
from bs4 import BeautifulSoup | |
# Fetch the article's HTML and parse it into a soup tree.
url = 'https://medium.com/personal-growth/there-are-two-ways-to-read-one-is-useless-cc152cf4f51b'
html = get_html(url)
soup = BeautifulSoup(html, 'html.parser')

# Collect every paragraph together with the element that contains it;
# the main div is assumed to be the parent holding the most paragraphs.
ps = soup.select('p')
parents = []
for p in ps:
    parents.append(p.parent)
def count_child_paragraphs(element):
    """Return the number of <p> tags that are DIRECT children of element.

    Fixes the original 'recurvise' typo: BeautifulSoup treats an unknown
    keyword as an attribute filter, so the misspelled call counted every
    descendant <p> instead of restricting the search to direct children.
    """
    return len(element.findAll('p', recursive=False))
# Pick the parent with the most direct paragraphs. max() is O(n), does not
# mutate `parents`, and returns the first maximal element -- the same one
# a stable reverse sort followed by parents[0] would select.
main_div = max(parents, key=count_child_paragraphs)

# Add the main title (h1) if it's not already there.
if not main_div.findAll('h1'):
    titles = soup.findAll('h1')
    if titles:
        main_title = titles[0]
        main_div.insert(0, main_title)

# That's it, we have the main content, let's write it to a new file.
# Force UTF-8: scraped HTML routinely contains non-ASCII characters, and
# the platform default encoding (e.g. cp1252 on Windows) could raise
# UnicodeEncodeError on write.
with open('output.html', 'w', encoding='utf-8') as file:
    file.write(str(main_div))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment