Extract fully rendered web pages
""" | |
Demo of extracting text and links from a rendered web page. | |
$ brew install geckodriver | |
$ python3 -m venv . | |
$ bin/pip install bs4 selenium | |
$ bin/python scrap.py | |
The script looks for an element of a specific id on the page. | |
This can be used to make sure we wait for all JS to execute, and | |
fall back on waiting a few seconds. | |
The website can set such value at the end of its JS execution. | |
This demo uses BeautifulSoup to extract the content and links. | |
""" | |
import contextlib
import sys
import traceback
import time
import io

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
@contextlib.contextmanager
def silence():
    """Swallow stdout/stderr inside the block, restoring the real streams on exit.

    If the block raises, the traceback is printed to the restored stderr and
    the exception is re-raised.
    """
    old_stderr = sys.stderr
    sys.stderr = io.StringIO()  # print() writes str in Python 3, so StringIO, not BytesIO
    old_stdout = sys.stdout
    sys.stdout = io.StringIO()
    last_exc = None
    try:
        yield
    except Exception as exc:
        last_exc = exc
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
    if last_exc is not None:
        traceback.print_tb(last_exc.__traceback__)
        raise last_exc
def bs_parse(content="", parser="html.parser"): | |
start = time.time() | |
with silence(): | |
soup = BeautifulSoup(content, parser) | |
text = soup.find_all(string=True) | |
lines = [(t.parent.name, str(t).lower().strip()) for t in text] | |
links = [a_tag.attrs.get("href") for a_tag in soup.findAll("a")] | |
return (time.time() - start, len(text), lines, links) | |
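# Illustrative usage (assumed, not in the original script): bs_parse can be
# exercised on a static snippet to see the shape of the returned tuple:
#     elapsed, count, lines, links = bs_parse('<p>Hi <a href="/doc">doc</a></p>')
#     # lines -> [('p', 'hi'), ('a', 'doc')]   links -> ['/doc']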
def get_page_source(url, element_id, timeout=5):
    """Load `url` in Firefox and return the rendered page source.

    Waits up to `timeout` seconds for an element with id `element_id` to
    appear, then grabs whatever has been rendered either way.
    """
    driver = webdriver.Firefox()
    driver.get(url)
    try:
        WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.ID, element_id))
        )
    except TimeoutException:
        # The sentinel element never showed up; give up after `timeout` seconds.
        pass
    finally:
        source = driver.page_source
        driver.quit()
    return source
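# Optional variation (a sketch, not part of the original demo): the Firefox
# window can be hidden by passing headless options to the driver, roughly:
#     from selenium.webdriver.firefox.options import Options
#     opts = Options()
#     opts.add_argument("--headless")
#     driver = webdriver.Firefox(options=opts)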
# "myBar" is the sentinel id the demo expects the page to expose.
print(bs_parse(content=get_page_source("https://www.elastic.co", "myBar")))