Skip to content

Instantly share code, notes, and snippets.

@Nama
Created September 29, 2016 22:19
Show Gist options
  • Save Nama/7d41c08dae66f68463904072affb3643 to your computer and use it in GitHub Desktop.
Save Nama/7d41c08dae66f68463904072affb3643 to your computer and use it in GitHub Desktop.
Parse the news site https://noc.rub.de and serve its headlines as an Atom feed, using Flask.
#!/usr/bin/env python
from flask import Flask, request, url_for
from werkzeug.contrib.atom import AtomFeed
from html.parser import HTMLParser
import urllib.request
from datetime import datetime
# Flask application object; the feed route below is registered on it.
app = Flask(__name__)
# Status page that is downloaded and scanned for <h3> headlines on each request.
url_noc = 'https://noc.rub.de/cgi-bin/status/'
@app.route('/noc/')
def noc():
    """Serve the <h3> headlines of the NOC status page as an Atom feed.

    The page at ``url_noc`` is downloaded on every request and the text of
    each ``<h3>`` element becomes one feed entry. The page's own timestamps
    are not parsed here, so entries receive placeholder ``updated`` values
    that are one minute apart (in page order) to keep them distinguishable.

    NOTE: relies on ``werkzeug.contrib.atom``, which was removed in
    Werkzeug 1.0.

    Returns:
        The response object produced by ``AtomFeed.get_response()``.

    Raises:
        urllib.error.URLError: if the status page cannot be fetched.
        UnicodeDecodeError: if the page is not valid UTF-8.
    """
    # Local import: the file only imports ``datetime`` from this module.
    from datetime import timedelta

    feed = AtomFeed('Noc News', feed_url=url_for('noc'))
    # Arbitrary base date; AtomFeed requires an ``updated`` value per entry.
    placeholder_start = datetime(2016, 9, 1)

    class TagParser(HTMLParser):
        """Turn the text of every <h3> element into a feed entry."""

        def __init__(self):
            super().__init__()
            self.tag = None      # tag currently open, None outside any tag
            self.entries = 0     # number of entries added so far

        def handle_starttag(self, tag, attrs):
            self.tag = tag

        def handle_endtag(self, tag):
            # Forget the tag so text after </h3> is not mistaken for a headline.
            self.tag = None

        def handle_data(self, data):
            headline = data.strip()
            if self.tag != 'h3' or not headline:
                return
            feed.add(
                title=headline,
                content=headline,
                updated=placeholder_start + timedelta(minutes=self.entries),
                content_type='text',
                author='NOC',
                url=url_noc,
            )
            self.entries += 1

    # Context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(url_noc) as response:
        html = response.read().decode('utf-8')

    parser = TagParser()
    parser.feed(html)
    parser.close()  # flush any text still buffered by the parser
    return feed.get_response()
#!/usr/bin/env python
''' NOT WORKING
parsing the timestamp is ugly.
Maybe you have an idea?'''
from flask import Flask, request, url_for
from werkzeug.contrib.atom import AtomFeed
from html.parser import HTMLParser
import urllib.request
from datetime import datetime
# Flask application object; the feed route below is registered on it.
app = Flask(__name__)
# Status page that is downloaded and parsed on each request; also used as the
# base for per-headline anchor URLs.
url_noc = 'https://noc.rub.de/cgi-bin/status/'
@app.route('/noc/')
def noc():
    """Serve the NOC status page headlines, with real timestamps, as an Atom feed.

    The page at ``url_noc`` is scanned for this sequence:

    1. a ``<div>`` whose ``id`` contains ``headline`` (its id becomes the
       anchor of the entry URL),
    2. an ``<h3>`` whose text is the entry title,
    3. a ``<span>`` whose text contains a ``DD.MM.YYYY HH:MM Uhr`` timestamp.

    One feed entry is added each time the sequence completes. Parser state is
    kept on the parser instance; assigning to closure variables from inside
    ``handle_data`` makes them local and raises ``UnboundLocalError``.

    NOTE: relies on ``werkzeug.contrib.atom``, which was removed in
    Werkzeug 1.0.
    NOTE(review): the parsed time is naive; presumably it is local German
    time while the feed renders it as UTC -- confirm.

    Returns:
        The response object produced by ``AtomFeed.get_response()``.

    Raises:
        urllib.error.URLError: if the status page cannot be fetched.
        UnicodeDecodeError: if the page is not valid UTF-8.
    """
    # Local import: the file has no top-level ``re`` import.
    import re

    # Pull the date/time out of the span text even if other text surrounds it.
    timestamp_re = re.compile(r'(\d{1,2}\.\d{1,2}\.\d{4} \d{1,2}:\d{2}) Uhr')
    feed = AtomFeed('NOC News', feed_url=url_for('noc'))

    class TagParser(HTMLParser):
        """State machine: headline div -> h3 title -> span timestamp."""

        def __init__(self):
            super().__init__()
            self.tag = None      # tag currently open, None outside any tag
            self.anchor = None   # id of the headline div being processed
            self.title = None    # title collected for that headline

        def handle_starttag(self, tag, attrs):
            self.tag = tag
            if tag != 'div':
                return
            # A valueless ``id`` attribute is reported as None.
            div_id = dict(attrs).get('id') or ''
            if 'headline' in div_id:
                # A new headline starts; drop any half-collected one.
                self.anchor = div_id
                self.title = None

        def handle_endtag(self, tag):
            # Forget the tag so trailing text is not attributed to it.
            self.tag = None

        def handle_data(self, data):
            text = data.strip()
            if not text or self.anchor is None:
                return
            if self.tag == 'h3' and self.title is None:
                self.title = text
                return
            if self.tag != 'span' or self.title is None:
                return
            match = timestamp_re.search(text)
            if match is None:
                return
            try:
                updated = datetime.strptime(match.group(1), '%d.%m.%Y %H:%M')
            except ValueError:
                # Digits matched but are not a valid date (e.g. month 13).
                return
            feed.add(
                title=self.title,
                content=self.title,
                updated=updated,
                content_type='text',
                author='NOC',
                url='%s#%s' % (url_noc, self.anchor),
            )
            # Entry complete; wait for the next headline div.
            self.anchor = None
            self.title = None

    # Context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(url_noc) as response:
        html = response.read().decode('utf-8')

    parser = TagParser()
    parser.feed(html)
    parser.close()  # flush any text still buffered by the parser
    return feed.get_response()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment