|
|
@@ -0,0 +1,400 @@ |
|
|
import cgi
import sys
from datetime import datetime
from xml.dom import Node
from xml.dom.minidom import parse, parseString
from xml.sax.saxutils import escape as xml_escape

import dateutil.parser
|
|
|
|
|
# Path of the Blogger Atom export to convert, taken from the first
# command-line argument.  Exit with a usage message instead of an
# IndexError traceback when the argument is missing.
if len(sys.argv) < 2:
    sys.exit('usage: %s BLOGGER_EXPORT.xml' % sys.argv[0])
inp = sys.argv[1]
|
|
|
|
|
def d(*msg):
    """Write a debug line to standard error.

    The positional arguments are converted to text, joined with single
    spaces and terminated with a newline.

    Uses ``sys.stderr.write`` rather than the Python 2-only
    ``print >>sys.stderr`` statement so the helper behaves the same on
    Python 2 and Python 3.
    """
    parts = []
    for item in msg:
        try:
            parts.append(str(item))
        except UnicodeError:
            # Python 2 only: str() on a unicode object holding non-ASCII
            # characters raises; fall back to its UTF-8 encoding.
            parts.append(item.encode('utf-8'))
    sys.stderr.write(' '.join(parts) + '\n')
|
|
|
|
|
class Blog(object):
    """In-memory model of a blog: its metadata, author and list of posts.

    The nested classes (``Author``, ``Entry``, ``Post``, ``Comment``) are
    plain attribute holders; every field defaults to ``None`` (or an empty
    list / ``False``) until something assigns it.
    """

    class Author(object):
        """Name, e-mail address and URI of a blog, post or comment author."""

        name = None
        email = None
        uri = None

    class Entry(object):
        """Fields shared by posts and comments."""

        entry_id = None
        url = None
        permalink = None
        title = None
        title_type = None
        content = None
        content_type = None
        published = None
        updated = None
        author = None

    class Post(Entry):
        """A blog post with its labels and attached comments."""

        draft = False

        def __init__(self):
            # Per-instance lists so posts never share labels or comments.
            self.labels = []
            self.comments = []

    class Comment(Entry):
        """A comment; ``post_entry_id`` is the ``entry_id`` of its post."""

        # Default so a comment without a parent reference can still be
        # inspected without raising AttributeError.
        post_entry_id = None

    # Blog-level metadata; left as None when never assigned.
    blog_id = None
    title = None
    updated = None

    def __init__(self):
        # Created per instance: as class attributes these mutable objects
        # would be shared by every Blog.
        self.author = Blog.Author()
        self.posts = []
|
|
|
|
|
class BlogParser(object):
    """Parse a Blogger Atom export file into a ``Blog`` object.

    Element names are always compared with any namespace prefix stripped,
    so ``<entry>`` and ``<atom:entry>`` are treated alike.
    """

    # Category scheme whose term tells what kind of entry this is.
    KIND_SCHEME = 'http://schemas.google.com/g/2005#kind'
    KIND_POST = 'http://schemas.google.com/blogger/2008/kind#post'
    KIND_COMMENT = 'http://schemas.google.com/blogger/2008/kind#comment'
    # Category scheme used for post labels.
    LABEL_SCHEME = 'http://www.blogger.com/atom/ns#'
    # Namespaces of the <control> and <in-reply-to> extension elements.
    APP_NS = 'http://purl.org/atom/app#'
    THREAD_NS = 'http://purl.org/syndication/thread/1.0'

    def __init__(self, atom_file):
        """Remember the path of the Atom export; nothing is read yet."""
        self.atom_file = atom_file

    def _local_name(self, node):
        """Return the node name without its namespace prefix."""
        return node.nodeName.split(':')[-1]

    def parse(self):
        """Read the export file and return the populated ``Blog``.

        If the document has no top-level ``feed`` element the returned
        blog is left empty.
        """
        self.blog = Blog()

        # Binary mode lets the XML parser honour the encoding declaration;
        # the context manager closes the handle (the original leaked it).
        with open(self.atom_file, 'rb') as handle:
            dom = parse(handle)

        feed = None
        for child in dom.childNodes:
            if self._local_name(child) == 'feed':
                feed = child
                break

        if feed is not None:
            self.parse_metadata(feed)
            self.parse_entries(feed)

        return self.blog

    def get_text(self, el):
        """Return the concatenated character data below *el*.

        Text and CDATA nodes contribute their value, elements are
        descended into recursively, and every other node type yields ''.
        """
        if el.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return el.nodeValue
        if el.nodeType == Node.ELEMENT_NODE:
            return ''.join(self.get_text(child) for child in el.childNodes)
        return ''

    def parse_date(self, txt):
        """Parse a timestamp string with ``dateutil.parser.parse``."""
        return dateutil.parser.parse(txt)

    def parse_metadata(self, feed):
        """Copy id, updated, title and author from *feed* onto the blog.

        Scanning stops at the first ``entry`` child, so only elements that
        precede the entries are treated as blog metadata.
        """
        for child in feed.childNodes:
            name = self._local_name(child)

            if name == 'entry':
                break

            if name == 'id':
                self.blog.blog_id = self.get_text(child)
            elif name == 'updated':
                self.blog.updated = self.parse_date(self.get_text(child))
            elif name == 'title':
                self.blog.title = self.get_text(child)
            elif name == 'author':
                self.blog.author = self.parse_author(child)

    def parse_author(self, author):
        """Build a ``Blog.Author`` from an ``author`` element."""
        data = Blog.Author()

        for child in author.childNodes:
            name = self._local_name(child)

            if name == 'name':
                data.name = self.get_text(child)
            elif name == 'uri':
                data.uri = self.get_text(child)
            elif name == 'email':
                data.email = self.get_text(child)

        return data

    def parse_entries(self, feed):
        """Parse every ``entry`` child of *feed* and attach the posts.

        Resets the parser's post/comment collections, links comments to
        their posts, stores the posts on ``self.blog`` and logs a summary
        through ``d``.
        """
        self.posts = []
        self.comments = []
        self.post_ids = {}
        self.comment_ids = {}

        for child in feed.childNodes:
            if self._local_name(child) == 'entry':
                self.parse_entry(child)

        self.assign_comments()

        self.blog.posts = self.posts

        total = len(self.blog.posts)
        d('total posts:', total)
        d('total comments:', len(self.comments))
        for i, post in enumerate(self.blog.posts):
            d('(%d / %d) -> %d: %s' % (i + 1, total, len(post.comments), post.title))

    def assign_comments(self):
        """Append each comment to the post whose id it refers to.

        Comments with no parent reference, or whose parent is not among
        the parsed posts, are skipped.
        """
        assigned = 0
        for comment in self.comments:
            # getattr: a comment without <in-reply-to> never had the
            # attribute set by parse_comment.
            entry_id = getattr(comment, 'post_entry_id', None)
            if entry_id not in self.post_ids:
                continue

            post = self.post_ids[entry_id]
            post.comments.append(comment)

            assigned += 1
            d('%s. comment: %s -> %s' % (assigned, id(comment), id(post)))

    def parse_category(self, category):
        """Return ``(scheme, term)`` of a ``category`` element.

        A missing attribute yields '' instead of raising KeyError.
        """
        return category.getAttribute('scheme'), category.getAttribute('term')

    def get_kind(self, entry):
        """Return the term of the entry's kind category, or None."""
        for child in entry.childNodes:
            if self._local_name(child) == 'category':
                scheme, term = self.parse_category(child)
                if scheme == self.KIND_SCHEME:
                    return term
        return None

    def parse_entry(self, entry):
        """Parse *entry* as a post or a comment and record it by id.

        Entries of any other kind are ignored.
        """
        kind = self.get_kind(entry)
        if kind == self.KIND_POST:
            post = self.parse_post(entry)
            self.posts.append(post)
            self.post_ids[post.entry_id] = post
        elif kind == self.KIND_COMMENT:
            comment = self.parse_comment(entry)
            self.comments.append(comment)
            self.comment_ids[comment.entry_id] = comment

    def get_draft(self, control):
        """Return True when *control* has a ``draft`` child reading 'yes'."""
        for child in control.childNodes:
            if self._local_name(child) == 'draft':
                return self.get_text(child) == 'yes'
        return False

    def parse_entry_common(self, entry, target):
        """Fill the fields shared by posts and comments on *target*.

        Reads id, published, updated, title, content, author and link
        children of *entry*.  A ``self`` link becomes ``permalink`` and an
        ``alternate`` link becomes ``url``.
        """
        for child in entry.childNodes:
            name = self._local_name(child)

            if name == 'id':
                target.entry_id = self.get_text(child)
            elif name == 'published':
                target.published = self.parse_date(self.get_text(child))
            elif name == 'updated':
                target.updated = self.parse_date(self.get_text(child))
            elif name == 'title':
                target.title = self.get_text(child)
                # Atom's "type" attribute is optional and defaults to "text".
                target.title_type = child.getAttribute('type') or 'text'
            elif name == 'content':
                target.content = self.get_text(child)
                target.content_type = child.getAttribute('type') or 'text'
            elif name == 'author':
                target.author = self.parse_author(child)
            elif name == 'link':
                # A link without "rel" is an "alternate" link in Atom.
                rel = child.getAttribute('rel') or 'alternate'
                href = child.getAttribute('href')

                if rel == 'self':
                    target.permalink = href
                elif rel == 'alternate':
                    target.url = href

    def parse_post(self, entry):
        """Build a ``Blog.Post`` from *entry*, with labels and draft flag."""
        post = Blog.Post()
        self.parse_entry_common(entry, post)

        for child in entry.childNodes:
            name = self._local_name(child)

            if name == 'category':
                scheme, term = self.parse_category(child)
                if scheme == self.LABEL_SCHEME:
                    post.labels.append(term)
            elif name == 'control' and child.namespaceURI == self.APP_NS:
                post.draft = self.get_draft(child)

        return post

    def parse_comment(self, entry):
        """Build a ``Blog.Comment`` from *entry*.

        ``post_entry_id`` is set from the ``ref`` attribute of the
        ``in-reply-to`` element when one is present.
        """
        comment = Blog.Comment()
        self.parse_entry_common(entry, comment)

        for child in entry.childNodes:
            if (child.namespaceURI == self.THREAD_NS
                    and self._local_name(child) == 'in-reply-to'):
                comment.post_entry_id = child.getAttribute('ref')

        return comment
|
|
|
|
|
class WXRWriter(object):
    """Serialize a parsed blog as a WordPress eXtended RSS (WXR 1.2) document.

    ``blog`` must provide ``title`` and ``posts``.  Each post provides the
    entry fields read in ``get_post`` plus ``draft``, ``labels`` and
    ``comments``; each comment provides the fields read in ``get_comment``.
    """

    # Value written to <wp:comment_status> for every post.
    comment_status = 'open'

    # (nicename, display name) of the categories attached to every post.
    # Override on a subclass or instance to change them.
    categories = (
        ('id', 'Bahasa Indonesia'),
        ('hacking', 'Hacking'),
    )

    def __init__(self, blog):
        self.blog = blog

    def write(self):
        """Return the whole WXR document as UTF-8 encoded bytes.

        Resets the running post and comment id counters, so repeated
        calls produce the same numbering.
        """
        self.post_id = 0
        self.comment_id = 0

        lines = self.get_header() + self.get_entries() + self.get_footer()
        doc = '\n'.join(line.strip() for line in lines)
        if isinstance(doc, bytes):
            # Python 2 with only byte strings: nothing left to encode.
            return doc
        return doc.encode('utf-8')

    def _text(self, value):
        """Return *value* escaped for use as XML element text ('' for None)."""
        if value is None:
            return ''
        return xml_escape('%s' % (value,))

    def _attr(self, value):
        """Return *value* escaped for a double-quoted XML attribute."""
        if value is None:
            return ''
        return xml_escape('%s' % (value,), {'"': '&quot;'})

    def _meta(self, tag, key, value):
        """Return the lines of one ``<wp:TAG>`` key/value metadata element."""
        return [
            ' <wp:%s>' % tag,
            ' <wp:meta_key>%s</wp:meta_key>' % key,
            ' <wp:meta_value>%s</wp:meta_value>' % self._text(value),
            ' </wp:%s>' % tag,
        ]

    def _to_utc(self, ts):
        """Return *ts* as a naive UTC datetime.

        Naive input has no known offset and is returned unchanged.
        """
        offset = ts.utcoffset()
        if offset is None:
            return ts
        return (ts - offset).replace(tzinfo=None)

    def get_header(self):
        """Return the lines that open the document and the channel."""
        return [
            '<?xml version="1.0" encoding="UTF-8" ?>',
            '<rss version="2.0"',
            ' xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"',
            ' xmlns:content="http://purl.org/rss/1.0/modules/content/"',
            ' xmlns:wfw="http://wellformedweb.org/CommentAPI/"',
            ' xmlns:dc="http://purl.org/dc/elements/1.1/"',
            ' xmlns:wp="http://wordpress.org/export/1.2/">',
            '<channel>',
            '<title>%s</title>' % self._text(getattr(self.blog, 'title', None)),
            '<wp:wxr_version>1.2</wp:wxr_version>',
        ]

    def get_footer(self):
        """Return the lines that close the channel and the document."""
        return ['</channel>', '</rss>']

    def get_entries(self):
        """Return the ``<item>`` lines of every post, in blog order."""
        res = []
        for post in self.blog.posts:
            res.extend(self.get_post(post))
        return res

    def get_date(self, ts):
        """Format *ts* for ``<pubDate>``, converted to UTC ('' for None).

        The format string hard-codes ``+0000``, so the time is shifted to
        UTC first instead of printing the local wall clock.
        """
        if ts is None:
            return ''
        return self._to_utc(ts).strftime("%a, %d %b %Y %H:%M:%S +0000")

    def get_date_wp(self, ts, utc=False):
        """Format *ts* as ``YYYY-MM-DD HH:MM:SS`` ('' for None).

        With ``utc=True`` the time is converted to UTC first; otherwise
        the timestamp's own wall-clock time is used.
        """
        if ts is None:
            return ''
        if utc:
            ts = self._to_utc(ts)
        return ts.strftime("%Y-%m-%d %H:%M:%S")

    def escape(self, s):
        """Return *s* made safe for embedding inside a CDATA section.

        The text itself is kept verbatim; only the ``]]>`` terminator is
        split across two CDATA sections so it cannot end the section
        early.  ``None`` becomes ''.
        """
        if s is None:
            return ''
        return s.replace(']]>', ']]]]><![CDATA[>')

    def get_comment(self, comment):
        """Return the ``<wp:comment>`` lines for *comment*.

        Consumes the next running comment id.  Author fields and the
        permalink/url metadata are only written when present.
        """
        status = 1
        # getattr tolerates a comment whose author is None.
        author = comment.author
        name = getattr(author, 'name', None)
        email = getattr(author, 'email', None)
        uri = getattr(author, 'uri', None)
        published = comment.published or comment.updated

        self.comment_id += 1

        res = []
        res.append(' <wp:comment>')
        res.append(' <wp:comment_id>%s</wp:comment_id>' % self.comment_id)
        if name:
            res.append(' <wp:comment_author>%s</wp:comment_author>' % self._text(name))
        if email:
            res.append(' <wp:comment_author_email>%s</wp:comment_author_email>' % self._text(email))
        if uri:
            res.append(' <wp:comment_author_url>%s</wp:comment_author_url>' % self._text(uri))
        res.append(' <wp:comment_author_IP>%s</wp:comment_author_IP>' % '')
        res.append(' <wp:comment_date>%s</wp:comment_date>' % self.get_date_wp(published))
        res.append(' <wp:comment_date_gmt>%s</wp:comment_date_gmt>' % self.get_date_wp(published, utc=True))
        res.append(' <wp:comment_content><![CDATA[%s]]></wp:comment_content>' % self.escape(comment.content))
        res.append(' <wp:comment_approved>%s</wp:comment_approved>' % status)
        res.extend(self._meta('commentmeta', 'blogger_id', comment.entry_id))
        if comment.permalink:
            res.extend(self._meta('commentmeta', 'blogger_permalink', comment.permalink))
        if comment.url:
            res.extend(self._meta('commentmeta', 'blogger_url', comment.url))
        res.append(' </wp:comment>')

        return res

    def get_post(self, post):
        """Return the ``<item>`` lines for *post*, comments included.

        Posts whose content is missing or blank are skipped (an empty
        list is returned and no post id is consumed).
        """
        if not (post.content or '').strip():
            return []

        slug = None
        if post.url is not None:
            slug = post.url.split('/')[-1]
            # Drop the extension only when it is really there.
            if slug.endswith('.html'):
                slug = slug[:-5]

        status = 'draft' if post.draft else 'publish'
        published = post.published or post.updated
        creator = getattr(post.author, 'name', None)

        self.post_id += 1

        res = []
        res.append('<item>')
        res.append(' <title>%s</title>' % self._text(post.title))
        res.append(' <pubDate>%s</pubDate>' % self.get_date(published))
        res.append(' <dc:creator>%s</dc:creator>' % self._text(creator))
        res.append(' <guid isPermaLink="true">%s</guid>' % self._text(post.permalink))
        res.append(' <description></description>')
        res.append(' <content:encoded><![CDATA[%s]]></content:encoded>' % self.escape(post.content))
        res.append(' <excerpt:encoded><![CDATA[%s]]></excerpt:encoded>' % self.escape(post.content))
        res.append(' <wp:post_id>%s</wp:post_id>' % self.post_id)
        res.append(' <wp:post_date>%s</wp:post_date>' % self.get_date_wp(published))
        res.append(' <wp:post_date_gmt>%s</wp:post_date_gmt>' % self.get_date_wp(published, utc=True))
        res.append(' <wp:comment_status>%s</wp:comment_status>' % self.comment_status)
        res.append(' <wp:ping_status>closed</wp:ping_status>')
        if slug:
            res.append(' <wp:post_name>%s</wp:post_name>' % self._text(slug))
        res.append(' <wp:status>%s</wp:status>' % status)
        res.append(' <wp:post_parent>0</wp:post_parent>')
        res.append(' <wp:menu_order>0</wp:menu_order>')
        res.append(' <wp:post_type>post</wp:post_type>')
        res.append(' <wp:post_password></wp:post_password>')
        res.append(' <wp:is_sticky>0</wp:is_sticky>')
        for nicename, display in self.categories:
            res.append(' <category domain="category" nicename="%s"><![CDATA[%s]]></category>'
                       % (self._attr(nicename), self.escape(display)))
        for label in post.labels:
            res.append(' <category domain="post_tag" nicename="%s"><![CDATA[%s]]></category>'
                       % (self._attr(label), self.escape(label)))
        res.extend(self._meta('postmeta', 'blogger_id', post.entry_id))
        if post.permalink:
            res.extend(self._meta('postmeta', 'blogger_permalink', post.permalink))
        if post.url:
            res.extend(self._meta('postmeta', 'blogger_url', post.url))

        for comment in post.comments:
            res.extend(self.get_comment(comment))

        res.append('</item>')
        return res
|
|
|
|
|
# Convert the Blogger export named on the command line and emit the
# resulting WXR document on standard output.
p = BlogParser(inp)
blog = p.parse()

writer = WXRWriter(blog)
xml = writer.write()

# The document is already encoded, so write it as raw bytes: through
# sys.stdout.buffer on Python 3, directly through sys.stdout on Python 2
# (which has no ``buffer`` attribute).  The trailing newline matches what
# the former ``print xml`` statement produced.
stdout_bytes = getattr(sys.stdout, 'buffer', sys.stdout)
stdout_bytes.write(xml)
stdout_bytes.write(b'\n')
|
|
|
|
|
# f = open(out, 'w') |
|
|
# f.write(xml) |
|
|
# f.close() |