Created
April 8, 2024 07:39
-
-
Save AlexanderCollins/7da3071c56d49333687ca4d781ad243d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
from bs4.element import Comment | |
def tag_visible(element):
    """
    Decide whether a parsed text node counts as visible page text.

    Parameters:
        element: The BeautifulSoup element to check. It must expose a
            ``parent`` attribute that in turn has a ``name``.

    Returns:
        bool: False when the element's direct parent is a ``style``,
        ``script``, ``head``, ``title`` or ``meta`` tag or the
        ``[document]`` root, or when the element is a ``Comment``;
        True otherwise.
    """
    # Only the direct parent is inspected, not the full ancestor chain.
    if element.parent.name in ('style', 'script', 'head', 'title', 'meta', '[document]'):
        return False
    # Anything left is visible unless it is an HTML comment node.
    return not isinstance(element, Comment)
class SiteTextFetcherUtility:
    """
    A utility class for fetching the content of a website by its URL and
    extracting visible text using BeautifulSoup.

    Every method is a classmethod, so the class is used without instantiation.
    """

    # Seconds to wait on the server before giving up. requests applies no
    # timeout unless one is passed, so without this a stalled server would
    # block the caller indefinitely.
    DEFAULT_TIMEOUT = 10.0

    @classmethod
    def fetch_site(cls, url, timeout=DEFAULT_TIMEOUT):
        """
        Fetch the content of a website given its URL.

        Parameters:
            url (str): The URL of the website to fetch.
            timeout: Forwarded unchanged to ``requests.get`` as its
                ``timeout`` argument. Defaults to ``DEFAULT_TIMEOUT``.

        Returns:
            bytes: The raw response body (``response.content``).

        Raises:
            requests.exceptions.HTTPError: If the response status indicates
                an error (raised by ``raise_for_status``).
            requests.exceptions.Timeout: If the server does not respond
                within ``timeout``.
        """
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for bad responses
        return response.content

    @classmethod
    def extract_text(cls, html_content):
        """
        Extract visible text from HTML content.

        The content is parsed with the ``html.parser`` backend, every text
        node is collected, nodes rejected by ``tag_visible`` are dropped, and
        the remaining nodes are stripped and joined with single spaces.
        Whitespace-only nodes are kept as empty fragments, so the result can
        contain runs of consecutive spaces.

        Parameters:
            html_content (bytes): The HTML content from which to extract text.

        Returns:
            str: The extracted visible text.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        # find_all(string=True) is the current spelling of the deprecated
        # findAll(text=True); it returns the same text nodes.
        text_nodes = soup.find_all(string=True)
        return " ".join(node.strip() for node in text_nodes if tag_visible(node))

    @classmethod
    def fetch_and_extract(cls, url, timeout=DEFAULT_TIMEOUT):
        """
        Fetch a website by URL and extract all visible text from it.

        Parameters:
            url (str): The URL of the website to fetch and extract text from.
            timeout: Forwarded to ``fetch_site``. Defaults to
                ``DEFAULT_TIMEOUT``.

        Returns:
            str: The visible text extracted from the website.

        Raises:
            Whatever ``fetch_site`` raises for a failed or timed-out request.
        """
        site_content = cls.fetch_site(url, timeout=timeout)
        return cls.extract_text(site_content)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment