Easy News Agent
import sys
from urllib.parse import urlparse
from typing import Dict
from aixplain.factories import AgentFactory, ModelFactory

def _scrape_rss_feed(url: str):
    """
    Scrapes an RSS feed from a given URL.
    """
    import xml.etree.ElementTree as etree
    import urllib.request

    articles = {}
    with urllib.request.urlopen(url) as f:
        root = etree.fromstring(f.read())
        for item in root.findall(".//item"):
            title = item.find("title").text
            link = item.find("link").text
            # Only keep canonical BBC article URLs; skip live pages, videos, etc.
            if not link.startswith("https://www.bbc.com/news/articles/"):
                continue
            articles[title] = link
    return articles
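
# For illustration, the returned mapping pairs each headline with its article
# URL (hypothetical values):
#   {"Example headline": "https://www.bbc.com/news/articles/c0000000000o"}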

def _scrape_news_article(url: str):
    """
    Scrapes a BBC News article from a given URL and returns a mapping of
    its title to its body text.
    """
    import xml.etree.ElementTree as etree
    import urllib.request
    import re
    from io import StringIO

    articles = {}
    with urllib.request.urlopen(url) as f:
        html = f.read().decode("utf-8")
        # The regexes capture whole <div> blocks, including the closing tag,
        # so each match can be parsed as a standalone XML fragment.
        headline_block = re.search(
            r'(<div data-component="headline-block".*?</div>)', html, re.DOTALL
        )
        if headline_block is None:
            raise ValueError(f"No title found in the news article: {url}")
        xml_block = etree.fromstring(headline_block.group(0))
        title = xml_block.find(".//h1").text
        text_blocks = re.findall(
            r'(<div data-component="text-block".*?</div>)', html, re.DOTALL
        )
        if len(text_blocks) == 0:
            raise ValueError(f"No text blocks found in the news article: {url}")
        article = StringIO()
        for text_block in text_blocks:
            xml_block = etree.fromstring(text_block)
            for p in xml_block.findall(".//p"):
                # itertext() also collects text nested in inline tags such as <a> or <b>
                article.write(f"{''.join(p.itertext())}\n\n")
        articles[title] = article.getvalue()
    return articles

def scrape_rss_feed(url: str):
    """
    Fetches and parses an RSS feed from a given URL.
    """
    from io import StringIO

    feeds_buffer = StringIO()
    try:
        feeds = _scrape_rss_feed(url)
    except Exception as e:
        raise ValueError(f"Error scraping RSS feed: {e}") from e
    for title, link in feeds.items():
        feeds_buffer.write(f"{title}\n{link}\n\n")
    return feeds_buffer.getvalue()

def scrape_news_article(url: str):
    """
    Scrapes a BBC News article from a given URL and formats it as Markdown.
    """
    from io import StringIO

    articles_buffer = StringIO()
    try:
        articles = _scrape_news_article(url)
    except Exception as e:
        raise ValueError(f"Error scraping news article: {e}") from e
    for title, article in articles.items():
        articles_buffer.write(f"### {title}\n\n{article}\n\n")
    return articles_buffer.getvalue()
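
# A minimal standalone usage sketch for the two tool functions (no aixplain
# account needed; assumes network access to the BBC endpoints). Each feed
# entry is a title line followed by a URL line, so the first URL sits at
# index 1 of the split output:
#
#   feed_text = scrape_rss_feed("http://feeds.bbci.co.uk/news/rss.xml")
#   first_url = feed_text.strip().splitlines()[1]
#   print(scrape_news_article(first_url))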

class EasyNewsAgent:
    """
    An AI assistant designed to help foreign language learners stay informed.
    """

    AGENT_NAME = "Easy News Agent"
    SCRAPE_RSS_TOOL_MODEL = "67bf733d058286b62912e38e"
    SCRAPE_NEWS_TOOL_MODEL = "67bf740f058286b62912e38f"
    PROMPT_TEMPLATE = """
You are an AI assistant designed to help foreign language learners stay informed.
You are given a list of categories and their corresponding RSS feed URLs as follows:

{categories}

Using the categories and their RSS feeds above, your goal is to:
1. Infer the most relevant category from the user's input or conversation history, or guide them to choose one of the available categories.
2. Fetch and parse the RSS feed for the inferred category using the scrape_rss_feed tool to get the relevant news articles.
3. Pick the news articles that are most interesting and relevant to the user's interests.
4. Present the picked items as a list of news articles, each with an engaging headline, and discuss with the user which article is most interesting and relevant to them.
5. As the user asks for more details or shows interest in a news article, fetch its full content using the scrape_news_article tool with that article's own URL.
6. Once the conversation is stuck or no more information is available about the discussed news article, ask the user to provide more details, or to choose a different category, interest, or news article.
7. Conduct ALL of the conversation at the user's requested language level (e.g., A1, A2), or infer the appropriate level from the user's language proficiency in the conversation history.
8. Unless requested otherwise, deliver the news in conversational text like a news reporter or storyteller.
"""

    def __init__(
        self,
        feeds: Dict[str, str] = None,
        prompt_template: str = None,
        agent_name: str = None,
    ):
        self.feeds = feeds
        self.prompt_template = prompt_template or self.PROMPT_TEMPLATE
        self.agent_name = agent_name or self.AGENT_NAME
        self._validate_feeds()
        self.agent = self._populate_agent()

    def _validate_feeds(self):
        """
        Validates the feeds dictionary.
        """
        if not self.feeds:
            raise ValueError("Feeds dictionary is required")
        for category, url in self.feeds.items():
            if not url:
                raise ValueError(f"Feed URL for category {category} is required")
            # urlparse() rarely raises, so check the parsed parts explicitly
            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                raise ValueError(f"Invalid feed URL for category {category}: {url}")

    def _populate_prompt(self):
        """
        Populates the prompt with the necessary information.
        """
        categories = "\n".join(
            f"{category}: {url}" for category, url in self.feeds.items()
        )
        # Use the instance attribute so a custom prompt_template passed to
        # the constructor is not silently ignored.
        return self.prompt_template.format(categories=categories)

    def _populate_agent(self):
        """
        Populates the agent with the necessary tools and description.

        Returns:
            Agent: The populated agent.
        """
        # scrape_rss_feed_tool = AgentFactory.create_custom_python_code_tool(
        #     scrape_rss_feed,
        #     description="Fetch and parse an RSS feed from a given URL.",
        # )
        # scrape_news_tool = AgentFactory.create_custom_python_code_tool(
        #     scrape_news_article,
        #     description="Fetch and parse a news article from a given URL.",
        # )
        scrape_rss_feed_tool = ModelFactory.get(model_id=self.SCRAPE_RSS_TOOL_MODEL)
        scrape_news_tool = ModelFactory.get(model_id=self.SCRAPE_NEWS_TOOL_MODEL)
        prompt = self._populate_prompt()
        return AgentFactory.create(
            name=self.agent_name,
            description=prompt,
            tools=[scrape_rss_feed_tool, scrape_news_tool],
        )

    def run(self, data: str):
        """
        Runs the agent with the given data.
        """
        return self.agent.run(data=data)
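
# A minimal usage sketch (assumes aixplain credentials are configured, e.g.
# via the TEAM_API_KEY environment variable, and network access to the BBC
# feeds):
#
#   agent = EasyNewsAgent({"World": "http://feeds.bbci.co.uk/news/world/rss.xml"})
#   print(agent.run("Tell me today's world news at A2 level"))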

def test_scraping():
    # Smoke test: fetch the feed and print the first matching article.
    feeds = _scrape_rss_feed("http://feeds.bbci.co.uk/news/rss.xml")
    for title in feeds:
        article = scrape_news_article(feeds[title])
        print(article)
        break

def main():
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        test_scraping()
        return
    agent = EasyNewsAgent(
        {
            "Top Stories": "http://feeds.bbci.co.uk/news/rss.xml",
            "World": "http://feeds.bbci.co.uk/news/world/rss.xml",
            "UK": "http://feeds.bbci.co.uk/news/uk/rss.xml",
            "Business": "http://feeds.bbci.co.uk/news/business/rss.xml",
            "Politics": "http://feeds.bbci.co.uk/news/politics/rss.xml",
            "Health": "http://feeds.bbci.co.uk/news/health/rss.xml",
            "Education & Family": "http://feeds.bbci.co.uk/news/education/rss.xml",
            "Science & Environment": "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml",
            "Technology": "http://feeds.bbci.co.uk/news/technology/rss.xml",
            "Entertainment & Arts": "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml",
        }
    )
    # German for "what is happening in the world today?" -- a non-English
    # query that exercises the agent's language handling.
    response = agent.run("was passiert heute in der Welt?")
    print(response)


if __name__ == "__main__":
    main()