Created
September 29, 2016 22:19
-
-
Save Nama/7d41c08dae66f68463904072affb3643 to your computer and use it in GitHub Desktop.
Parse the news site https://noc.rub.de and serve its headlines as an RSS-style (Atom) feed, using Flask.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from flask import Flask, request, url_for | |
from werkzeug.contrib.atom import AtomFeed | |
from html.parser import HTMLParser | |
import urllib.request | |
from datetime import datetime | |
app = Flask(__name__)
# Status page whose <h3> headlines are republished as feed entries.
url_noc = 'https://noc.rub.de/cgi-bin/status/'


@app.route('/noc/')
def noc():
    """Serve the <h3> headlines of the NOC status page as an Atom feed.

    Downloads ``url_noc``, collects the text of every ``<h3>`` element and
    adds one feed entry per non-empty headline, in page order.

    Returns:
        The response produced by ``AtomFeed.get_response()``.

    Raises:
        urllib.error.URLError: if the status page cannot be fetched.
        UnicodeDecodeError: if the page body is not valid UTF-8.
    """
    feed = AtomFeed('Noc News', feed_url=url_for('noc'))
    # The page's own timestamps are not parsed here. Every entry gets the
    # same fixed placeholder date so that the feed has an `updated` value.
    placeholder_date = datetime(2016, 9, 1)

    class TagParser(HTMLParser):
        """Collect the text content of each <h3> element."""

        def __init__(self):
            super().__init__()
            self.in_headline = False
            self.parts = []
            self.headlines = []

        def _flush(self):
            # Finish the headline being collected; skip whitespace-only ones.
            text = ''.join(self.parts).strip()
            if text:
                self.headlines.append(text)
            self.in_headline = False
            self.parts = []

        def handle_starttag(self, tag, attrs):
            if tag == 'h3':
                if self.in_headline:
                    # Previous <h3> was never closed; keep what it held.
                    self._flush()
                self.in_headline = True

        def handle_endtag(self, tag):
            # Leaving the <h3> ends the headline, so text that follows the
            # closing tag is no longer mistaken for a headline.
            if tag == 'h3' and self.in_headline:
                self._flush()

        def handle_data(self, data):
            # Text inside nested tags (e.g. a link) still belongs to the
            # surrounding headline.
            if self.in_headline:
                self.parts.append(data)

    with urllib.request.urlopen(url_noc) as response:
        html = response.read().decode('utf-8')

    parser = TagParser()
    parser.feed(html)
    parser.close()
    if parser.in_headline:
        # Document ended inside an unclosed <h3>.
        parser._flush()

    for headline in parser.headlines:
        feed.add(title=headline, content=headline, updated=placeholder_date,
                 content_type='text', author='NOC', url=url_noc)
    return feed.get_response()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
''' NOT WORKING | |
parsing the timestamp is ugly. | |
Maybe you have an idea?''' | |
from flask import Flask, request, url_for | |
from werkzeug.contrib.atom import AtomFeed | |
from html.parser import HTMLParser | |
import urllib.request | |
from datetime import datetime | |
app = Flask(__name__)
# Status page whose headlines are republished as feed entries.
url_noc = 'https://noc.rub.de/cgi-bin/status/'


@app.route('/noc/')
def noc():
    """Serve the NOC status page headlines as an Atom feed with timestamps.

    Walks the page with a small state machine that expects, repeatedly:
    text inside a ``<div>`` while the most recently seen ``id`` contains
    ``headline``, then the text of an ``<h3>`` (the title), then a
    ``<span>`` whose text contains ``Uhr`` (the timestamp). Each completed
    sequence becomes one feed entry linking to ``url_noc#<id>``.

    Returns:
        The response produced by ``AtomFeed.get_response()``.

    Raises:
        urllib.error.URLError: if the status page cannot be fetched.
        UnicodeDecodeError: if the page body is not valid UTF-8.
    """
    feed = AtomFeed('NOC News', feed_url=url_for('noc'))
    # Used when a timestamp does not match the expected format, so the
    # headline is still published instead of being dropped.
    fallback_date = datetime(2016, 9, 1)

    class TagParser(HTMLParser):
        """State machine: headline div -> h3 title -> 'Uhr' timestamp span."""

        def __init__(self):
            super().__init__()
            # All parsing state lives on the instance so that it survives
            # between callbacks.
            self.tag = None          # most recent start tag
            self.attr = ''           # most recent non-empty id attribute
            self.last_tag = 'span'   # last state-machine step completed
            self.url = url_noc
            self.title = None

        def handle_starttag(self, tag, attrs):
            self.tag = tag
            for name, value in attrs:
                # A bare `id` attribute has value None; ignore it.
                if name == 'id' and value is not None:
                    self.attr = value

        def handle_data(self, data):
            text = data.strip()
            if (self.tag == 'div' and self.last_tag == 'span'
                    and 'headline' in self.attr):
                self.last_tag = 'div'
                self.url = '%s#%s' % (url_noc, self.attr)
            elif self.last_tag == 'div' and self.tag == 'h3':
                # Wait for real text; an empty title would be rejected.
                if text:
                    self.last_tag = 'h3'
                    self.title = text
            elif self.last_tag == 'h3':
                if self.tag == 'span' and 'Uhr' in data:
                    self.last_tag = 'span'
                    try:
                        updated = datetime.strptime(
                            text, "%d.%m.%Y %H:%M Uhr")
                    except ValueError:
                        updated = fallback_date
                    feed.add(title=self.title, content=self.title,
                             updated=updated, content_type='text',
                             author='NOC', url=self.url)

    with urllib.request.urlopen(url_noc) as response:
        html = response.read().decode('utf-8')

    parser = TagParser()
    parser.feed(html)
    parser.close()
    return feed.get_response()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment