AlexanderCollins · April 8, 2024 07:39
diff --git a/Site Fetcher Utility b/Site Fetcher Utility
 import requests
 from bs4 import BeautifulSoup
 from bs4.element import Comment

 def tag_visible(element):
    """
    Decide whether a parsed text node counts as visible page text.

    A node is treated as hidden when its parent's tag name is one of the
    non-rendered containers listed below, or when the node itself is an
    HTML comment.

    Parameters:
        element: The BeautifulSoup text node to classify. Its ``parent``
            attribute is dereferenced, so it must not be None.

    Returns:
        bool: False for hidden nodes, True for everything else.
    """
    non_rendered_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    inside_hidden_tag = element.parent.name in non_rendered_parents
    # The comment check only runs when the parent test did not already hide the node.
    return not (inside_hidden_tag or isinstance(element, Comment))

 class SiteTextFetcherUtility:
    """
    Utility class that downloads a web page by URL and extracts its visible text with BeautifulSoup.

    Every operation is a class method, so no instance is required.
    """

    # Default timeout (seconds) handed to requests so an unresponsive server
    # cannot block the caller indefinitely.
    DEFAULT_TIMEOUT = 10

    @classmethod
    def fetch_site(cls, url, timeout=DEFAULT_TIMEOUT):
        """
        Fetch the raw content of a website given its URL.

        Parameters:
            url (str): The URL of the website to fetch.
            timeout (float | tuple | None): Timeout passed straight to
                ``requests.get``. Pass None to wait without limit, which
                was the behaviour before this parameter existed.

        Returns:
            bytes: The raw HTML content of the website.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not respond within ``timeout``.
        """
        # Without an explicit timeout requests waits forever on a stalled server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises HTTPError for bad responses
        return response.content

    @classmethod
    def extract_text(cls, html_content):
        """
        Extract the visible text from HTML content.

        The markup is parsed with the built-in 'html.parser' backend, every
        text node is collected, nodes rejected by ``tag_visible`` are dropped,
        and the remaining nodes are stripped and joined with single spaces.
        Whitespace-only nodes strip to empty strings and are still joined, so
        the result can contain runs of consecutive spaces.

        Parameters:
            html_content (bytes): The HTML content from which to extract text.

        Returns:
            str: The extracted visible text.
        """
        soup = BeautifulSoup(html_content, 'html.parser')
        # find_all(string=True) is the supported spelling of the deprecated
        # findAll(text=True); both select every text node in the document.
        text_nodes = soup.find_all(string=True)
        return " ".join(node.strip() for node in text_nodes if tag_visible(node))

    @classmethod
    def fetch_and_extract(cls, url, timeout=DEFAULT_TIMEOUT):
        """
        Fetch a website by URL and extract all visible text from it.

        Parameters:
            url (str): The URL of the website to fetch and extract text from.
            timeout (float | tuple | None): Timeout forwarded to ``fetch_site``.

        Returns:
            str: The visible text extracted from the website.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not respond within ``timeout``.
        """
        site_content = cls.fetch_site(url, timeout=timeout)
        return cls.extract_text(site_content)
	import requests
	from bs4 import BeautifulSoup
	from bs4.element import Comment

	def tag_visible(element):
	    """
	    Determine whether a text node should be treated as visible page text.

	    A node is hidden when its parent's tag name is one of the non-rendered
	    containers checked below, or when the node itself is an HTML comment.

	    Parameters:
	        element: The BeautifulSoup text node to check. Its ``parent``
	            attribute is dereferenced, so it must not be None.

	    Returns:
	        bool: True if the element is visible, False otherwise.
	    """
	    # Text that lives inside these containers is never rendered as page content.
	    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
	        return False
	    # HTML comments are text nodes too, but they are not shown to the reader.
	    if isinstance(element, Comment):
	        return False
	    return True

	class SiteTextFetcherUtility:
	    """
	    A utility class for fetching the content of a website by its URL and extracting visible text using BeautifulSoup.

	    Every operation is a class method, so no instance is required.
	    """

	    # Default timeout (seconds) handed to requests so an unresponsive server
	    # cannot block the caller indefinitely.
	    DEFAULT_TIMEOUT = 10

	    @classmethod
	    def fetch_site(cls, url, timeout=DEFAULT_TIMEOUT):
	        """
	        Class method to fetch the content of a website given its URL.

	        Parameters:
	            url (str): The URL of the website to fetch.
	            timeout (float | tuple | None): Timeout passed straight to
	                ``requests.get``. Pass None to wait without limit.

	        Returns:
	            bytes: The raw HTML content of the website.

	        Raises:
	            requests.HTTPError: If the server answers with an error status.
	            requests.Timeout: If the server does not respond within ``timeout``.
	        """
	        # Without an explicit timeout requests waits forever on a stalled server.
	        response = requests.get(url, timeout=timeout)
	        response.raise_for_status()  # Raises HTTPError for bad responses
	        return response.content

	    @classmethod
	    def extract_text(cls, html_content):
	        """
	        Class method to extract visible text from HTML content.

	        The markup is parsed with the built-in 'html.parser' backend, every
	        text node is collected, nodes rejected by ``tag_visible`` are dropped,
	        and the remaining nodes are stripped and joined with single spaces.
	        Whitespace-only nodes strip to empty strings and are still joined, so
	        the result can contain runs of consecutive spaces.

	        Parameters:
	            html_content (bytes): The HTML content from which to extract text.

	        Returns:
	            str: The extracted visible text.
	        """
	        soup = BeautifulSoup(html_content, 'html.parser')
	        # find_all(string=True) is the supported spelling of the deprecated
	        # findAll(text=True); both select every text node in the document.
	        texts = soup.find_all(string=True)
	        visible_texts = filter(tag_visible, texts)
	        return " ".join(t.strip() for t in visible_texts)

	    @classmethod
	    def fetch_and_extract(cls, url, timeout=DEFAULT_TIMEOUT):
	        """
	        Class method to fetch a website by URL and extract all visible text from it.

	        Parameters:
	            url (str): The URL of the website to fetch and extract text from.
	            timeout (float | tuple | None): Timeout forwarded to ``fetch_site``.

	        Returns:
	            str: The visible text extracted from the website.

	        Raises:
	            requests.HTTPError: If the server answers with an error status.
	            requests.Timeout: If the server does not respond within ``timeout``.
	        """
	        site_content = cls.fetch_site(url, timeout=timeout)
	        return cls.extract_text(site_content)