Reads http://lesliezhu.github.com/Notes/index.html and generates my blog's RSS feed file.
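The script walks each directory listed in _dirs_ (here only Notes), reads that directory's index.org, and turns every org link pointing at a local .html page into an RSS <item>. A hypothetical index.org entry that gen_url below would match (the real file's layout is an assumption):

- [[./2014-02-09-notes.html][Some post title]]

gen_url would yield it as Notes/./2014-02-09-notes.html, and gen_item turns that path into the item's <link> and <guid> under http://lesliezhu.github.com/.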
#!/usr/bin/env python
#-*- coding:utf-8 -*-

import os,sys,re,time

_rss_  = "http://lesliezhu.github.com/rss.xml"
_dirs_ = ["Notes"]  # only RSS Feed 'Notes'

def gen_url(dir=""):
    """get dir and read dir/index.org to filter URL """
    for line in open(dir+"/index.org").readlines():
        if "[[" in line:
            line=line.strip()
            line=re.search("(\..*html)",line).groups(1)[0]
            yield dir+"/"+line

def gen_prefix():
    """ RSS XML prefix """
    print '''<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>
  <title>弘毅 </title>
  <link>http://lesliezhu.github.com/ </link>
  <description>
    <![CDATA[士不可以不弘毅,任重而道远! ]]>
  </description>
  <language>zh-CN</language>
  <generator>genrss.py rss generator </generator>
  <webMaster><![CDATA[朱春来(Leslie Zhu)]]></webMaster>
  <ttl>120</ttl>
  <image>
    <title><![CDATA[弘毅]]></title>
    <url>http://lesliezhu.github.io/favicon.ico</url>
    <link>http://lesliezhu.github.com</link>
  </image>
'''

def gen_suffix():
    """ RSS XML suffix """
    print '''
</channel>
</rss>
'''

def gen_item(link="",creator="朱春来(Leslie Zhu)"):
    """ RSS Item from HTML """
    print '''
  <item>
    <title> %s </title>
    <link> http://lesliezhu.github.com/%s </link>
    <author><![CDATA[%s]]></author>
    <guid isPermaLink="true">http://lesliezhu.github.com/%s</guid>
''' % (gen_title(link),link,creator,link)
    for tag in gen_category(link):
        print '''
    <category><![CDATA[%s]]></category>''' % tag
    print '''
    <pubDate>%s</pubDate>
    <description><![CDATA[%s]]></description>
    <comments>http://lesliezhu.github.com/%s</comments>
  </item>
''' % (gen_pubdate(link),gen_description(link),link)

def gen_category(link=""):
    """ Filter Keywords from HTML metadata """
    keywords=""
    for line in open(link).readlines():
        if "<meta name=\"keywords\"" in line:
            line = line.strip()
            keywords=re.search("content=\"(.*)\"",line).groups(1)[0]
            break
    if len(keywords) > 0:
        # split on the first separator character that appears in the keywords
        for ch in "\ ,;:、,":
            if ch in keywords:
                return [i.strip() for i in keywords.split(ch)]
    # no keywords metadata (or no recognised separator): fall back to a generic tag
    return ["札记"]

def gen_title(link=""):
    """ Filter Title from HTML metadata """
    for line in open(link).readlines():
        if "<title>" in line:
            line = line.strip()
            title=re.match("<title>(.*)</title>",line).groups(1)[0]
            break
    return title

def gen_pubdate(link=""):
    """ Filter publish date from HTML metadata """
    pubdate=""
    for line in open(link).readlines():
        if "<meta name=\"generated\"" in line:
            line = line.strip()
            pattern="([0-9][0-9]/[0-9][0-9]/[0-9][0-9][0-9][0-9]|[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9])"
            pubdate=re.search(pattern,line).groups(1)[0]
            break
    if "/" in pubdate:
        pubdate=time.strptime(pubdate, "%m/%d/%Y")
    elif "-" in pubdate:
        pubdate=time.strptime(pubdate, "%Y-%m-%d")
    else:
        # no date in the metadata: fall back to the date embedded in the file name
        pubdate=re.search("([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9])",link).groups(1)[0]
        pubdate=time.strptime(pubdate, "%Y-%m-%d")
    pubdate=time.strftime("%a, %d %b %Y %H:%M:%S +0800",pubdate)
    return pubdate

def gen_description(link=""):
    """ Filter body data of HTML """
    data=open(link).read()
    # join lines with a placeholder so the regex can span the whole document, then restore newlines
    data=re.search("(<div id=\"content\">.*</div.).*</body>",data.replace('\n','敏敏')).groups()[0].replace('敏敏','\n')
    #data=re.search("(<\?xml.*</html>)",data.replace('\n','敏敏')).groups(1)[0].replace('敏敏','\n')
    for spec in [' ']:
        data=data.replace(spec,'')
    return data

if __name__ == "__main__":
    gen_prefix()
    for dir in _dirs_:
        for url in gen_url(dir):
            gen_item(url)
    gen_suffix()
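The feed is printed to stdout; presumably the script is run from the site root (so Notes/index.org resolves) under Python 2, with the output redirected into the rss.xml named in _rss_, e.g. python genrss.py > rss.xml.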