Skip to content

Instantly share code, notes, and snippets.

@arhouati
Created February 2, 2021 22:41
Show Gist options
  • Select an option

  • Save arhouati/c831858120a842def1463a2b08f40785 to your computer and use it in GitHub Desktop.

Select an option

Save arhouati/c831858120a842def1463a2b08f40785 to your computer and use it in GitHub Desktop.
class Aljazeera():
    """Crawler for the aljazeera.net RSS feed and the articles it links to.

    Typical use: create an instance, call ``craw()`` to gather the feed, then
    ``get_articles()`` to fetch and parse every article the feed lists.
    """

    url = 'https://www.aljazeera.net/aljazeerarss/a7c186be-1baa-4bd4-9d80-a84db769f779/73d0e1b4-532f-45ef-b135-bfdff8b8cab9'
    url_base = 'https://www.aljazeera.net'
    name = 'aljazeera.net'
    # Browser-like headers sent with every article request.
    ua = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"}
    # Seconds to wait for an article page before giving up on it.
    timeout = 10
    feed = None
    # Class-level default only; it is never mutated. Each instance rebinds its
    # own list so results are not shared between crawlers.
    articles = []

    def __init__(self):
        """Create the RSS feed reader for ``url`` and label it with ``name``."""
        self.feed = RssFeed(self.url.strip())
        self.feed.set_name(self.name)
        self.articles = []

    def craw(self):
        """Start the feed reader and block until it has finished."""
        self.feed.start()  # gather rss feed in a thread
        self.feed.join()  # wait for all threads to finish

    def get_articles(self):
        """Fetch and parse every post currently held by the feed.

        Returns:
            A list with one dict per feed item, holding the feed fields
            (``guid``, ``published``, ``title``, ``description``, ``link``)
            plus the scraped ``content``, ``image``, ``ref`` and ``tags``
            (hashtags joined with single spaces).

        The list is rebuilt on every call, so calling this twice does not
        duplicate entries.
        """
        collected = []
        for post in self.feed.items:
            content, tags, image, ref = self.get_content(post['link'])
            collected.append({
                'guid': post['guid'],
                'published': post['pubDate'],
                'title': post['title'],
                'description': post['description'],
                'link': post['link'],
                'content': content,
                'image': image,
                'ref': ref,
                'tags': ' '.join(tags),
            })
        self.articles = collected
        return self.articles

    def get_content(self, url):
        """Download one article page and extract its parts.

        Args:
            url: Address of the article page.

        Returns:
            A ``(content, tags, image, ref)`` tuple. Extraction is
            best-effort: any part that cannot be fetched or found is logged
            and left at its empty default (``''`` or ``[]``).
        """
        # Local import: the rest of the file's import block is not visible here.
        from urllib.parse import urljoin

        content, tags, image, ref = '', [], '', ''
        try:
            # Send the browser-like headers and bound the wait so a single
            # unresponsive page cannot stall the whole crawl.
            res = requests.get(url, headers=self.ua, timeout=self.timeout)
        except requests.RequestException as err:
            logging.error('error: {0}'.format(err))
            return content, tags, image, ref
        soup = BeautifulSoup(res.text, 'html.parser')
        try:
            content = html2text.html2text(soup.select("div.body div.tinyMCE div#DynamicContentContainer")[0].getText())
        except Exception as err:
            logging.error('error: {0}'.format(err))
        try:
            # Turn each tag label into a hashtag, e.g. "a b" -> "#a_b".
            tags = ["#" + t.getText().replace(" ", "_") for t in soup.select("div.tags ul li a")]
        except Exception as err:
            logging.error('error: {0}'.format(err))
        try:
            # urljoin resolves site-relative paths against url_base and
            # leaves already-absolute image URLs untouched.
            image = urljoin(self.url_base, soup.select("figure.lead a#main-player img")[0]["src"])
        except Exception as err:
            logging.error('error: {0}'.format(err))
        try:
            ref = html2text.html2text(soup.select("div.body div.tinyMCE span.ref")[0].getText())
        except Exception as err:
            logging.error('error: {0}'.format(err))
        return content, tags, image, ref
if __name__ == '__main__':
    # Script entry point: gather the RSS feed first, then scrape every
    # article it lists.
    crawler = Aljazeera()
    crawler.craw()  # returns only after the feed reader has been joined
    articles = crawler.get_articles()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment