Created
June 11, 2018 07:10
-
-
Save julien-h2/f1f9eb7af87376bd5baf393c66daedee to your computer and use it in GitHub Desktop.
Scraping with urllib and BeautifulSoup / python3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First, use URLLIB to fetch HTML files | |
# ----------------------------------------------------------------------- | |
from urllib.request import Request, urlopen | |
from urllib.error import URLError | |
def get_html(url):
    """Fetch `url` over HTTP and return the raw HTML as bytes.

    Returns b'' on any URL error (unreachable host, HTTP error status).
    A browser-like User-Agent header is sent because some sites reject
    requests carrying urllib's default agent string.
    """
    req = Request(url,
                  data=None,
                  headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
    try:
        # The context manager guarantees the response is closed; the
        # original called urlopen() without ever closing the connection.
        with urlopen(req) as response:
            return response.read()
    except URLError as e:
        # HTTPError (a URLError subclass) has BOTH .code and .reason, so
        # .code must be tested first -- the original tested .reason first,
        # which made the "Error code" branch unreachable.
        if hasattr(e, 'code'):
            print('The server couldn\'t fulfill the request.')
            print('Error code: ', e.code)
        elif hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
        # On error, simply return an empty binary string.
        print('Server not found')
        return b''
# Then, use BEAUTIFULSOUP to parse HTML | |
# ----------------------------------------------------------------------- | |
from bs4 import BeautifulSoup | |
# Fetch the article's HTML and parse it into a soup tree.
url = 'https://medium.com/personal-growth/there-are-two-ways-to-read-one-is-useless-cc152cf4f51b'
html = get_html(url)
soup = BeautifulSoup(html, 'html.parser')

# Collect every paragraph together with the element that contains it;
# the main div is assumed to be the parent holding the most paragraphs.
ps = soup.select('p')
parents = []
for p in ps:
    parents.append(p.parent)
def count_child_paragraphs(element):
    """Return the number of <p> tags that are DIRECT children of element.

    Fixes the original 'recurvise' typo: BeautifulSoup treats an unknown
    keyword as an attribute filter, so the misspelled call counted every
    descendant <p> instead of restricting the search to direct children.
    """
    return len(element.findAll('p', recursive=False))
# Pick the parent with the most direct paragraphs. max() is O(n), does not
# mutate `parents`, and returns the first maximal element -- the same one
# a stable reverse sort followed by parents[0] would select.
main_div = max(parents, key=count_child_paragraphs)

# Add the main title (h1) if it's not already there.
if not main_div.findAll('h1'):
    titles = soup.findAll('h1')
    if titles:
        main_title = titles[0]
        main_div.insert(0, main_title)

# That's it, we have the main content, let's write it to a new file.
# Force UTF-8: scraped HTML routinely contains non-ASCII characters, and
# the platform default encoding (e.g. cp1252 on Windows) could raise
# UnicodeEncodeError on write.
with open('output.html', 'w', encoding='utf-8') as file:
    file.write(str(main_div))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment