Easy News Agent
import sys
from urllib.parse import urlparse
from typing import Dict
from aixplain.factories import AgentFactory, ModelFactory

def _scrape_rss_feed(url: str):
    """
    Scrapes an RSS feed from a given URL.
    """
    import xml.etree.ElementTree as etree
    import urllib.request

    articles = {}
    with urllib.request.urlopen(url) as f:
        root = etree.fromstring(f.read())
        for item in root.findall(".//item"):
            title = item.find("title").text
            link = item.find("link").text
            # Only keep canonical BBC article URLs; skip live pages, videos, etc.
            if not link.startswith("https://www.bbc.com/news/articles/"):
                continue
            articles[title] = link
    return articles
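
# For illustration, the returned mapping pairs each headline with its article
# URL (hypothetical values):
#   {"Example headline": "https://www.bbc.com/news/articles/c0000000000o"}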

def _scrape_news_article(url: str):
    """
    Scrapes a BBC News article from a given URL and returns a mapping of
    its title to its body text.
    """
    import xml.etree.ElementTree as etree
    import urllib.request
    import re
    from io import StringIO

    articles = {}
    with urllib.request.urlopen(url) as f:
        html = f.read().decode("utf-8")
        # The regexes capture whole <div> blocks, including the closing tag,
        # so each match can be parsed as a standalone XML fragment.
        headline_block = re.search(
            r'(<div data-component="headline-block".*?</div>)', html, re.DOTALL
        )
        if headline_block is None:
            raise ValueError(f"No title found in the news article: {url}")
        xml_block = etree.fromstring(headline_block.group(0))
        title = xml_block.find(".//h1").text
        text_blocks = re.findall(
            r'(<div data-component="text-block".*?</div>)', html, re.DOTALL
        )
        if len(text_blocks) == 0:
            raise ValueError(f"No text blocks found in the news article: {url}")
        article = StringIO()
        for text_block in text_blocks:
            xml_block = etree.fromstring(text_block)
            for p in xml_block.findall(".//p"):
                # itertext() also collects text nested in inline tags such as <a> or <b>
                article.write(f"{''.join(p.itertext())}\n\n")
        articles[title] = article.getvalue()
    return articles

def scrape_rss_feed(url: str):
    """
    Fetches and parses an RSS feed from a given URL.
    """
    from io import StringIO

    feeds_buffer = StringIO()
    try:
        feeds = _scrape_rss_feed(url)
    except Exception as e:
        raise ValueError(f"Error scraping RSS feed: {e}") from e
    for title, link in feeds.items():
        feeds_buffer.write(f"{title}\n{link}\n\n")
    return feeds_buffer.getvalue()

def scrape_news_article(url: str):
    """
    Scrapes a BBC News article from a given URL and formats it as Markdown.
    """
    from io import StringIO

    articles_buffer = StringIO()
    try:
        articles = _scrape_news_article(url)
    except Exception as e:
        raise ValueError(f"Error scraping news article: {e}") from e
    for title, article in articles.items():
        articles_buffer.write(f"### {title}\n\n{article}\n\n")
    return articles_buffer.getvalue()
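
# A minimal standalone usage sketch for the two tool functions (no aixplain
# account needed; assumes network access to the BBC endpoints). Each feed
# entry is a title line followed by a URL line, so the first URL sits at
# index 1 of the split output:
#
#   feed_text = scrape_rss_feed("http://feeds.bbci.co.uk/news/rss.xml")
#   first_url = feed_text.strip().splitlines()[1]
#   print(scrape_news_article(first_url))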

class EasyNewsAgent:
    """
    An AI assistant designed to help foreign language learners stay informed.
    """

    AGENT_NAME = "Easy News Agent"
    SCRAPE_RSS_TOOL_MODEL = "67bf733d058286b62912e38e"
    SCRAPE_NEWS_TOOL_MODEL = "67bf740f058286b62912e38f"
    PROMPT_TEMPLATE = """
You are an AI assistant designed to help foreign language learners stay informed.
You are given a list of categories and their corresponding RSS feed URLs as follows:

{categories}

Using the categories and their RSS feeds above, your goal is to:
1. Infer the most relevant category from the user's input or conversation history, or guide them to choose one of the available categories.
2. Fetch and parse the RSS feed for the inferred category using the scrape_rss_feed tool to get the relevant news articles.
3. Pick the news articles that are most interesting and relevant to the user's interests.
4. Present the picked items as a list of news articles, each with an engaging headline, and discuss with the user which article is most interesting and relevant to them.
5. As the user asks for more details or shows interest in a news article, fetch its full content using the scrape_news_article tool with that article's own URL.
6. Once the conversation is stuck or no more information is available about the discussed news article, ask the user to provide more details, or to choose a different category, interest, or news article.
7. Conduct ALL of the conversation at the user's requested language level (e.g., A1, A2), or infer the appropriate level from the user's language proficiency in the conversation history.
8. Unless requested otherwise, deliver the news in conversational text like a news reporter or storyteller.
"""

    def __init__(
        self,
        feeds: Dict[str, str] = None,
        prompt_template: str = None,
        agent_name: str = None,
    ):
        self.feeds = feeds
        self.prompt_template = prompt_template or self.PROMPT_TEMPLATE
        self.agent_name = agent_name or self.AGENT_NAME
        self._validate_feeds()
        self.agent = self._populate_agent()

    def _validate_feeds(self):
        """
        Validates the feeds dictionary.
        """
        if not self.feeds:
            raise ValueError("Feeds dictionary is required")
        for category, url in self.feeds.items():
            if not url:
                raise ValueError(f"Feed URL for category {category} is required")
            # urlparse() rarely raises, so check the parsed parts explicitly
            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                raise ValueError(f"Invalid feed URL for category {category}: {url}")

    def _populate_prompt(self):
        """
        Populates the prompt with the necessary information.
        """
        categories = "\n".join(
            f"{category}: {url}" for category, url in self.feeds.items()
        )
        # Use the instance attribute so a custom prompt_template passed to
        # the constructor is not silently ignored.
        return self.prompt_template.format(categories=categories)

    def _populate_agent(self):
        """
        Populates the agent with the necessary tools and description.

        Returns:
            Agent: The populated agent.
        """
        # scrape_rss_feed_tool = AgentFactory.create_custom_python_code_tool(
        #     scrape_rss_feed,
        #     description="Fetch and parse an RSS feed from a given URL.",
        # )
        # scrape_news_tool = AgentFactory.create_custom_python_code_tool(
        #     scrape_news_article,
        #     description="Fetch and parse a news article from a given URL.",
        # )
        scrape_rss_feed_tool = ModelFactory.get(model_id=self.SCRAPE_RSS_TOOL_MODEL)
        scrape_news_tool = ModelFactory.get(model_id=self.SCRAPE_NEWS_TOOL_MODEL)
        prompt = self._populate_prompt()
        return AgentFactory.create(
            name=self.agent_name,
            description=prompt,
            tools=[scrape_rss_feed_tool, scrape_news_tool],
        )

    def run(self, data: str):
        """
        Runs the agent with the given data.
        """
        return self.agent.run(data=data)
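
# A minimal usage sketch (assumes aixplain credentials are configured, e.g.
# via the TEAM_API_KEY environment variable, and network access to the BBC
# feeds):
#
#   agent = EasyNewsAgent({"World": "http://feeds.bbci.co.uk/news/world/rss.xml"})
#   print(agent.run("Tell me today's world news at A2 level"))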

def test_scraping():
    # Smoke test: fetch the feed and print the first matching article.
    feeds = _scrape_rss_feed("http://feeds.bbci.co.uk/news/rss.xml")
    for title in feeds:
        article = scrape_news_article(feeds[title])
        print(article)
        break

def main():
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        test_scraping()
        return
    agent = EasyNewsAgent(
        {
            "Top Stories": "http://feeds.bbci.co.uk/news/rss.xml",
            "World": "http://feeds.bbci.co.uk/news/world/rss.xml",
            "UK": "http://feeds.bbci.co.uk/news/uk/rss.xml",
            "Business": "http://feeds.bbci.co.uk/news/business/rss.xml",
            "Politics": "http://feeds.bbci.co.uk/news/politics/rss.xml",
            "Health": "http://feeds.bbci.co.uk/news/health/rss.xml",
            "Education & Family": "http://feeds.bbci.co.uk/news/education/rss.xml",
            "Science & Environment": "http://feeds.bbci.co.uk/news/science_and_environment/rss.xml",
            "Technology": "http://feeds.bbci.co.uk/news/technology/rss.xml",
            "Entertainment & Arts": "http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml",
        }
    )
    # German for "what is happening in the world today?" -- a non-English
    # query that exercises the agent's language handling.
    response = agent.run("was passiert heute in der Welt?")
    print(response)


if __name__ == "__main__":
    main()