Skip to content

Instantly share code, notes, and snippets.

@LeslieZhu
Last active August 29, 2015 13:56
Show Gist options
  • Save LeslieZhu/8978432 to your computer and use it in GitHub Desktop.
Save LeslieZhu/8978432 to your computer and use it in GitHub Desktop.
Reads http://lesliezhu.github.com/Notes/index.html and generates the RSS feed file for my blog.
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import os,sys,re,time
# Presumably the public URL of the generated feed; it is not referenced by
# any of the code visible in this script.
_rss_ = "http://lesliezhu.github.com/rss.xml"
# Directories handed to gen_url(); each must contain an index.org file that
# lists the pages to include in the feed.
_dirs_ = ["Notes"] # only RSS Feed 'Notes'
def gen_url(dir=""):
    """Yield the page paths linked from ``dir/index.org``.

    Every line of the index that holds an Org link (``[[target]...``) is
    inspected.  When the link target contains a relative ``.html`` path
    (the part starting at the first dot of the target), that path is
    yielded prefixed with ``dir + "/"``.  Lines without a link, or whose
    link does not point to an HTML page, are skipped.

    dir -- directory that contains the ``index.org`` file to scan.
    """
    # Anchor on "[[" and never cross a "]" so that neither text before the
    # link nor the link description can leak into the extracted path.
    link_re = re.compile(r"\[\[[^\]]*?(\.[^\]]*html)")
    with open(dir + "/index.org") as index:
        for line in index:
            found = link_re.search(line)
            if found is None:
                # e.g. an image link or a plain text line: nothing to feed.
                continue
            yield dir + "/" + found.group(1)
def gen_prefix():
    """Write the opening part of the RSS document to standard output.

    The text consists of the XML declaration, the opening ``<rss>`` and
    ``<channel>`` tags and the channel-level metadata (title, link,
    description, language, generator, webMaster, ttl and image).
    """
    channel_head = '''<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
<title>弘毅 </title>
<link>http://lesliezhu.github.com/ </link>
<description>
<![CDATA[士不可以不弘毅,任重而道远! ]]>
</description>
<language>zh-CN</language>
<generator>genrss.py rss generator </generator>
<webMaster><![CDATA[朱春来(Leslie Zhu)]]></webMaster>
<ttl>120</ttl>
<image>
<title><![CDATA[弘毅]]></title>
<url>http://lesliezhu.github.io/favicon.ico</url>
<link>http://lesliezhu.github.com</link>
</image>
'''
    print(channel_head)
def gen_suffix():
    """Write the closing ``</channel>`` and ``</rss>`` tags to standard output."""
    channel_tail = '''
</channel>
</rss>
'''
    print(channel_tail)
def gen_item(link="", creator="朱春来(Leslie Zhu)"):
    """Write one ``<item>`` element for an HTML page to standard output.

    link    -- path of the HTML page; it is also appended to the site URL
               to build the item's link, guid and comments URLs.
    creator -- text placed inside the item's ``<author>`` element.

    Title, categories, publication date and description come from
    gen_title, gen_category, gen_pubdate and gen_description respectively.
    """
    opening = '''
<item>
<title> %s </title>
<link> http://lesliezhu.github.com/%s </link>
<author><![CDATA[%s]]></author>
<guid isPermaLink="true">http://lesliezhu.github.com/%s</guid>
'''
    category = '''
<category><![CDATA[%s]]></category>'''
    closing = '''
<pubDate>%s</pubDate>
<description><![CDATA[%s]]></description>
<comments>http://lesliezhu.github.com/%s</comments>
</item>
'''
    # Each part is printed as soon as its data is available, in document
    # order: header, one line per category, then date/description/footer.
    heading = gen_title(link)
    print(opening % (heading, link, creator, link))
    for keyword in gen_category(link):
        print(category % keyword)
    published = gen_pubdate(link)
    summary = gen_description(link)
    print(closing % (published, summary, link))
def gen_category(link=""):
    """Return the categories (keywords) declared by an HTML page.

    The first line containing ``<meta name="keywords"`` is located and the
    value of its ``content`` attribute is split on every occurrence of a
    separator: backslash, space, comma, semicolon, colon or the
    ideographic comma.  Each piece is stripped of surrounding whitespace
    and empty pieces are dropped.

    link -- path of the HTML file to read.

    Returns a non-empty list of strings.  The default ``["札记"]`` is
    returned when the page has no keywords meta tag or the tag yields no
    keyword at all.
    """
    keywords = ""
    with open(link) as page:
        for line in page:
            if "<meta name=\"keywords\"" in line:
                # Stop at the closing quote so that attributes following
                # content="..." on the same line are not swallowed.
                found = re.search(r'content="([^"]*)"', line)
                if found is not None:
                    keywords = found.group(1)
                break
    # The multi-byte separator is an alternative, not a character-class
    # member, so it is matched as a whole sequence even on byte strings.
    pieces = re.split(r"[\\ ,;:]|、", keywords)
    tags = [piece.strip() for piece in pieces if piece.strip()]
    return tags or ["札记"]
def gen_title(link=""):
    """Return the text of a page's ``<title>`` element.

    Only the first line containing ``<title>`` is examined, and the element
    must be closed on that same line.

    link -- path of the HTML file to read.

    Returns the text between ``<title>`` and ``</title>``, or an empty
    string when the page has no such line or the element is not closed on
    it.
    """
    with open(link) as page:
        for line in page:
            if "<title>" in line:
                # search (not match): the tag may be preceded by other
                # markup such as <head> on the same line.
                found = re.search(r"<title>(.*?)</title>", line)
                return found.group(1) if found is not None else ""
    return ""
def gen_pubdate(link=""):
    """Return the publication date of a page, formatted for ``<pubDate>``.

    The date is read from the first line containing
    ``<meta name="generated"``; both ``MM/DD/YYYY`` and ``YYYY-MM-DD`` are
    understood.  When the page has no such line, or the line carries no
    date, a ``YYYY-MM-DD`` date embedded in ``link`` itself is used.

    link -- path of the HTML file to read.

    Returns a string such as ``"Thu, 13 Feb 2014 00:00:00 +0800"``.  Only
    the date is parsed, so the time of day is always midnight; the
    ``+0800`` offset is a fixed part of the output format.

    Raises ValueError when neither the page nor ``link`` provides a date.
    """
    us_date = r"[0-9]{2}/[0-9]{2}/[0-9]{4}"
    iso_date = r"[0-9]{4}-[0-9]{2}-[0-9]{2}"
    stamp = None
    with open(link) as page:
        for line in page:
            if "<meta name=\"generated\"" in line:
                stamp = re.search("(%s|%s)" % (us_date, iso_date), line)
                break
    if stamp is None:
        # Fall back to a date in the file name (e.g. 2014-02-13-post.html).
        stamp = re.search("(%s)" % iso_date, link)
    if stamp is None:
        raise ValueError("no publication date found for %r" % (link,))
    text = stamp.group(1)
    layout = "%m/%d/%Y" if "/" in text else "%Y-%m-%d"
    parsed = time.strptime(text, layout)
    return time.strftime("%a, %d %b %Y %H:%M:%S +0800", parsed)
def gen_description(link=""):
    """Return the HTML of a page's content section for the item description.

    link -- path of the HTML file to read.

    The returned text starts at ``<div id="content">`` and runs through the
    last ``</div>`` found before ``</body>``, with every ``&nbsp;`` entity
    removed.  An empty string is returned when the page has no such
    section.
    """
    with open(link) as page:
        html = page.read()
    # DOTALL lets "." cross line breaks directly, so newlines do not have to
    # be swapped for a placeholder that could collide with real page text.
    found = re.search(r'(<div id="content">.*</div>).*</body>', html, re.DOTALL)
    if found is None:
        return ""
    # Remove the entity together with its terminating ";" so no stray
    # semicolons are left behind in the description.
    return re.sub(r"&nbsp;?", "", found.group(1))
if __name__ == "__main__":
    # Emit the complete feed on standard output: the channel header, one
    # item per page listed in each configured directory's index, and the
    # closing tags.
    gen_prefix()
    for section in _dirs_:
        for page in gen_url(section):
            gen_item(page)
    gen_suffix()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment