Skip to content

Instantly share code, notes, and snippets.

@AviDuda
Forked from fajran/blogger-to-wordpress.py
Last active November 1, 2023 12:00
Show Gist options
  • Save AviDuda/89031ae65cf1253e49e7 to your computer and use it in GitHub Desktop.
Save AviDuda/89031ae65cf1253e49e7 to your computer and use it in GitHub Desktop.
Blogger's backup file to WordPress' WXR converter for Disqus
# Blogger's backup file to WordPress' WXR converter.
#
# Only tested with posts and comments, and NOT with pages.
# May not be efficient for huge blogs since the script keeps
# all content in memory during conversion.
#
# Released as public domain.
#
# Please note that I converted the labels in Blogspot
# to tags in WordPress. I also hardcoded two categories for the
# WordPress posts. Adjust these first to suit your needs.
import cgi
import sys
from datetime import datetime
from xml.dom import Node
from xml.dom.minidom import parse, parseString
from xml.sax.saxutils import escape as xml_escape

import dateutil.parser
# Path of the Blogger Atom backup file, taken from the first command-line
# argument; raises IndexError when the script is started without one.
inp = sys.argv[1]
def d(*msg):
    """Write a debug line to standard error.

    The positional arguments are converted to text, joined with single
    spaces and terminated with a newline, mirroring what the former
    ``print >>sys.stderr`` statement produced.

    ``sys.stderr.write`` is used instead of the print statement so the
    function works on both Python 2 and Python 3.
    """
    parts = []
    for item in msg:
        try:
            parts.append(str(item))
        except UnicodeError:
            # Python 2 only: str() cannot encode non-ASCII unicode text.
            # Fall back to repr() so a debug message never aborts the run.
            parts.append(repr(item))
    sys.stderr.write(' '.join(parts) + '\n')
class Blog(object):
    """In-memory model of a blog read from a Blogger backup.

    A ``Blog`` carries feed-level metadata (``blog_id``, ``title``,
    ``updated``, ``author``) and the list of ``posts``. The nested classes
    describe the records hanging off it.

    All state is created per instance in ``__init__``; the author and post
    list used to be class attributes and were therefore shared by every
    ``Blog`` object.
    """

    class Author(object):
        """Author details of a blog, post or comment."""
        name = None
        email = None
        uri = None

    class Entry(object):
        """Fields common to posts and comments."""
        entry_id = None
        # Target of the rel="alternate" link.
        url = None
        # Target of the rel="self" link.
        permalink = None
        title = None
        title_type = None
        content = None
        content_type = None
        published = None
        updated = None
        author = None

    class Post(Entry):
        """A blog post with its labels and attached comments."""
        draft = False

        def __init__(self):
            self.labels = []
            self.comments = []

    class Comment(Entry):
        """A comment; ``post_entry_id`` is the entry id of the post it replies to."""
        # Default so comments without a reply reference can still be inspected.
        post_entry_id = None

    def __init__(self):
        # Metadata defaults: a feed lacking one of these elements yields None
        # instead of an AttributeError when the value is read later.
        self.blog_id = None
        self.title = None
        self.updated = None
        self.author = self.Author()
        self.posts = []
class BlogParser(object):
    """Parse a Blogger Atom backup file into a ``Blog`` object.

    Usage: ``BlogParser(path).parse()`` returns the populated ``Blog``.
    Only entries whose kind category marks them as a post or a comment are
    read; every other entry is ignored.
    """

    KIND_SCHEME = 'http://schemas.google.com/g/2005#kind'
    KIND_POST = 'http://schemas.google.com/blogger/2008/kind#post'
    KIND_COMMENT = 'http://schemas.google.com/blogger/2008/kind#comment'
    LABEL_SCHEME = 'http://www.blogger.com/atom/ns#'
    THREAD_NS = 'http://purl.org/syndication/thread/1.0'
    # Legacy draft namespace first, then the RFC 5023 AtomPub namespace.
    APP_NAMESPACES = ('http://purl.org/atom/app#', 'http://www.w3.org/2007/app')

    def __init__(self, atom_file):
        """Remember the path of the Atom backup file to parse."""
        self.atom_file = atom_file

    @staticmethod
    def _local_name(node):
        """Return the node name without any namespace prefix."""
        return node.nodeName.split(':')[-1]

    def parse(self):
        """Parse the backup file and return the resulting ``Blog``.

        The blog is returned without metadata or posts when the document
        has no top-level ``feed`` element.
        """
        self.blog = Blog()
        # Binary mode lets the XML parser honour the encoding declared in the
        # document; the with-block guarantees the handle is closed.
        with open(self.atom_file, 'rb') as source:
            dom = parse(source)
        feed = None
        for child in dom.childNodes:
            if self._local_name(child) == 'feed':
                feed = child
                break
        if feed is not None:
            self.parse_metadata(feed)
            self.parse_entries(feed)
        return self.blog

    def get_text(self, el):
        """Return the concatenated character data found below ``el``.

        Text and CDATA nodes contribute their value, elements contribute the
        text of their children, every other node type contributes nothing.
        """
        if el.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return el.nodeValue
        if el.nodeType == Node.ELEMENT_NODE:
            return ''.join(self.get_text(child) for child in el.childNodes)
        return ''

    def parse_date(self, txt):
        """Convert a timestamp string into a datetime using dateutil."""
        return dateutil.parser.parse(txt)

    def parse_metadata(self, feed):
        """Copy feed-level id, updated, title and author onto ``self.blog``.

        Scanning stops at the first ``entry`` child, so only elements that
        precede the entries are considered.
        """
        for child in feed.childNodes:
            name = self._local_name(child)
            if name == 'entry':
                break
            if name == 'id':
                self.blog.blog_id = self.get_text(child)
            elif name == 'updated':
                self.blog.updated = self.parse_date(self.get_text(child))
            elif name == 'title':
                self.blog.title = self.get_text(child)
            elif name == 'author':
                self.blog.author = self.parse_author(child)

    def parse_author(self, author):
        """Build a ``Blog.Author`` from the name/uri/email children of ``author``."""
        data = Blog.Author()
        for child in author.childNodes:
            name = self._local_name(child)
            if name == 'name':
                data.name = self.get_text(child)
            elif name == 'uri':
                data.uri = self.get_text(child)
            elif name == 'email':
                data.email = self.get_text(child)
        return data

    def parse_entries(self, feed):
        """Parse every ``entry`` child of ``feed`` and attach comments to posts.

        The posts end up in ``self.blog.posts``; a summary of what was read
        is reported through ``d``.
        """
        self.posts = []
        self.comments = []
        self.post_ids = {}
        self.comment_ids = {}
        for child in feed.childNodes:
            # Compare the prefix-less name, as every other method here does.
            if self._local_name(child) != 'entry':
                continue
            self.parse_entry(child)
        self.assign_comments()
        self.blog.posts = self.posts
        total = len(self.blog.posts)
        d('total posts:', total)
        d('total comments:', len(self.comments))
        for i, post in enumerate(self.blog.posts):
            d('(%d / %d) -> %d: %s' % (i + 1, total, len(post.comments), post.title))

    def assign_comments(self):
        """Append each parsed comment to the post it replies to.

        Comments without a reply reference, or whose reference matches no
        parsed post, are skipped.
        """
        assigned = 0
        for comment in self.comments:
            post = self.post_ids.get(getattr(comment, 'post_entry_id', None))
            if post is None:
                continue
            post.comments.append(comment)
            assigned += 1
            d('%s. comment: %s -> %s' % (assigned, id(comment), id(post)))

    def parse_category(self, category):
        """Return the ``(scheme, term)`` attributes of a category element.

        A missing attribute yields an empty string rather than a KeyError.
        """
        return category.getAttribute('scheme'), category.getAttribute('term')

    def get_kind(self, entry):
        """Return the term of the entry's kind category, or None if it has none."""
        for child in entry.childNodes:
            if self._local_name(child) == 'category':
                scheme, term = self.parse_category(child)
                if scheme == self.KIND_SCHEME:
                    return term
        return None

    def parse_entry(self, entry):
        """Parse one entry and record it as a post or a comment by its kind."""
        kind = self.get_kind(entry)
        if kind == self.KIND_POST:
            post = self.parse_post(entry)
            self.posts.append(post)
            self.post_ids[post.entry_id] = post
        elif kind == self.KIND_COMMENT:
            comment = self.parse_comment(entry)
            self.comments.append(comment)
            self.comment_ids[comment.entry_id] = comment

    def get_draft(self, control):
        """Return True when ``control`` has a ``draft`` child whose text is 'yes'."""
        for child in control.childNodes:
            if self._local_name(child) == 'draft':
                return self.get_text(child) == 'yes'
        return False

    def parse_entry_common(self, entry, target):
        """Fill ``target`` with the fields shared by posts and comments.

        Reads id, published, updated, title, content, author and the
        ``self``/``alternate`` links. The ``type`` attribute of title and
        content defaults to 'text' and a link's ``rel`` to 'alternate' when
        absent, which are the defaults defined by the Atom format.
        """
        for child in entry.childNodes:
            name = self._local_name(child)
            if name == 'id':
                target.entry_id = self.get_text(child)
            elif name == 'published':
                target.published = self.parse_date(self.get_text(child))
            elif name == 'updated':
                target.updated = self.parse_date(self.get_text(child))
            elif name == 'title':
                target.title = self.get_text(child)
                target.title_type = child.getAttribute('type') or 'text'
            elif name == 'content':
                target.content = self.get_text(child)
                target.content_type = child.getAttribute('type') or 'text'
            elif name == 'author':
                target.author = self.parse_author(child)
            elif name == 'link':
                rel = child.getAttribute('rel') or 'alternate'
                href = child.getAttribute('href')
                if rel == 'self':
                    target.permalink = href
                elif rel == 'alternate':
                    target.url = href

    def parse_post(self, entry):
        """Build a ``Blog.Post`` from ``entry``, including labels and draft flag."""
        post = Blog.Post()
        self.parse_entry_common(entry, post)
        for child in entry.childNodes:
            name = self._local_name(child)
            if name == 'category':
                scheme, term = self.parse_category(child)
                if scheme == self.LABEL_SCHEME:
                    post.labels.append(term)
            elif name == 'control' and child.namespaceURI in self.APP_NAMESPACES:
                post.draft = self.get_draft(child)
        return post

    def parse_comment(self, entry):
        """Build a ``Blog.Comment`` from ``entry`` and record its parent post id."""
        comment = Blog.Comment()
        # Always define the attribute so assign_comments can rely on it.
        comment.post_entry_id = None
        self.parse_entry_common(entry, comment)
        for child in entry.childNodes:
            if (child.namespaceURI == self.THREAD_NS
                    and self._local_name(child) == 'in-reply-to'):
                comment.post_entry_id = child.getAttribute('ref') or None
        return comment
class WXRWriter(object):
    """Serialise a parsed blog as a WordPress WXR 1.2 document.

    ``write()`` returns the whole document as UTF-8 encoded bytes. Text
    placed in element content or attributes is XML-escaped, and text placed
    in CDATA sections is protected against an embedded ``]]>`` terminator,
    so the output stays well-formed whatever the blog contains.

    Class attributes that may be overridden before writing:

    * ``comment_status`` -- value written to ``wp:comment_status``.
    * ``categories`` -- ``(nicename, display name)`` pairs added to every post.
    """

    comment_status = 'open'
    categories = (
        ('id', 'Bahasa Indonesia'),
        ('hacking', 'Hacking'),
    )

    def __init__(self, blog):
        """Keep the blog whose ``title`` and ``posts`` will be written."""
        self.blog = blog

    def write(self):
        """Return the complete WXR document as UTF-8 encoded bytes.

        Post and comment counters restart at zero on every call; each output
        line is stripped of surrounding whitespace before joining.
        """
        self.post_id = 0
        self.comment_id = 0
        lines = self.get_header() + self.get_entries() + self.get_footer()
        doc = '\n'.join(line.strip() for line in lines)
        # Works on Python 2 (unicode or ASCII str) and Python 3 (str).
        return doc.encode('utf-8')

    @staticmethod
    def _text(value):
        """Return ``value`` escaped for element content; None becomes ''."""
        if value is None:
            return ''
        return xml_escape(value)

    @staticmethod
    def _attr(value):
        """Return ``value`` escaped for a double-quoted attribute; None becomes ''."""
        if value is None:
            return ''
        return xml_escape(value, {'"': '&quot;'})

    @staticmethod
    def _to_utc(ts):
        """Return ``ts`` as a naive UTC datetime; naive input is returned as is."""
        offset = ts.utcoffset()
        if offset is None:
            return ts
        return (ts - offset).replace(tzinfo=None)

    @staticmethod
    def _slug(url):
        """Return the last path segment of ``url`` without a '.html' suffix.

        Returns None when ``url`` is None.
        """
        if url is None:
            return None
        slug = url.split('/')[-1]
        # Only drop the extension when it is really there.
        if slug.endswith('.html'):
            slug = slug[:-5]
        return slug

    def get_header(self):
        """Return the lines that open the document up to the channel title."""
        res = []
        res.append('<?xml version="1.0" encoding="UTF-8" ?>')
        res.append('<rss version="2.0"')
        res.append(' xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"')
        res.append(' xmlns:content="http://purl.org/rss/1.0/modules/content/"')
        res.append(' xmlns:wfw="http://wellformedweb.org/CommentAPI/"')
        res.append(' xmlns:dc="http://purl.org/dc/elements/1.1/"')
        res.append(' xmlns:wp="http://wordpress.org/export/1.2/">')
        res.append('<channel>')
        res.append('<title>%s</title>' % self._text(getattr(self.blog, 'title', None)))
        res.append('<wp:wxr_version>1.2</wp:wxr_version>')
        return res

    def get_footer(self):
        """Return the lines that close the channel and the document."""
        return ['</channel>', '</rss>']

    def get_entries(self):
        """Return the lines of every post of the blog, in order."""
        res = []
        for post in self.blog.posts:
            res += self.get_post(post)
        return res

    def get_date(self, ts):
        """Format ``ts`` for ``pubDate``.

        Timezone-aware values are converted to UTC first so that the literal
        ``+0000`` suffix is truthful.
        """
        return self._to_utc(ts).strftime("%a, %d %b %Y %H:%M:%S +0000")

    def get_date_wp(self, ts, utc=False):
        """Format ``ts`` as ``YYYY-MM-DD HH:MM:SS``.

        With ``utc=True`` a timezone-aware value is converted to UTC first;
        this is used for the ``*_gmt`` fields.
        """
        if utc:
            ts = self._to_utc(ts)
        return ts.strftime("%Y-%m-%d %H:%M:%S")

    def escape(self, s):
        """Return ``s`` made safe for embedding in a CDATA section.

        Markup is left untouched; only the terminator ``]]>`` is split across
        two adjacent CDATA sections. None becomes an empty string.
        """
        if s is None:
            return ''
        return s.replace(']]>', ']]]]><![CDATA[>')

    def get_comment(self, comment):
        """Return the ``wp:comment`` lines for ``comment``.

        Increments the running comment id. Author fields and the optional
        permalink/url metadata are written only when they have a value.
        """
        status = 1
        author = comment.author
        name = getattr(author, 'name', None)
        email = getattr(author, 'email', None)
        uri = getattr(author, 'uri', None)
        self.comment_id += 1
        res = []
        res.append(' <wp:comment>')
        res.append(' <wp:comment_id>%s</wp:comment_id>' % self.comment_id)
        if name:
            res.append(' <wp:comment_author>%s</wp:comment_author>' % self._text(name))
        if email:
            res.append(' <wp:comment_author_email>%s</wp:comment_author_email>' % self._text(email))
        if uri:
            res.append(' <wp:comment_author_url>%s</wp:comment_author_url>' % self._text(uri))
        res.append(' <wp:comment_author_IP>%s</wp:comment_author_IP>' % '')
        res.append(' <wp:comment_date>%s</wp:comment_date>' % self.get_date_wp(comment.published))
        res.append(' <wp:comment_date_gmt>%s</wp:comment_date_gmt>'
                   % self.get_date_wp(comment.published, utc=True))
        res.append(' <wp:comment_content><![CDATA[%s]]></wp:comment_content>'
                   % self.escape(comment.content))
        res.append(' <wp:comment_approved>%s</wp:comment_approved>' % status)
        res.append(' <wp:commentmeta>')
        res.append(' <wp:meta_key>blogger_id</wp:meta_key>')
        res.append(' <wp:meta_value>%s</wp:meta_value>' % self._text(comment.entry_id))
        res.append(' </wp:commentmeta>')
        if comment.permalink:
            res.append(' <wp:commentmeta>')
            res.append(' <wp:meta_key>blogger_permalink</wp:meta_key>')
            res.append(' <wp:meta_value>%s</wp:meta_value>' % self._text(comment.permalink))
            res.append(' </wp:commentmeta>')
        if comment.url:
            res.append(' <wp:commentmeta>')
            res.append(' <wp:meta_key>blogger_url</wp:meta_key>')
            res.append(' <wp:meta_value>%s</wp:meta_value>' % self._text(comment.url))
            res.append(' </wp:commentmeta>')
        res.append(' </wp:comment>')
        return res

    def get_post(self, post):
        """Return the ``item`` lines for ``post`` including its comments.

        Posts whose content is missing or blank produce no output and do not
        consume a post id. Labels are written as ``post_tag`` categories
        after the configured ``categories``.
        """
        content = post.content or ''
        if not content.strip():
            return []
        slug = self._slug(post.url)
        status = 'draft' if post.draft else 'publish'
        creator = getattr(post.author, 'name', None)
        self.post_id += 1
        res = []
        res.append('<item>')
        res.append(' <title>%s</title>' % self._text(post.title))
        res.append(' <pubDate>%s</pubDate>' % self.get_date(post.published))
        res.append(' <dc:creator>%s</dc:creator>' % self._text(creator))
        res.append(' <guid isPermaLink="true">%s</guid>' % self._text(post.permalink))
        res.append(' <description></description>')
        res.append(' <content:encoded><![CDATA[%s]]></content:encoded>' % self.escape(content))
        res.append(' <excerpt:encoded><![CDATA[%s]]></excerpt:encoded>' % self.escape(content))
        res.append(' <wp:post_id>%s</wp:post_id>' % self.post_id)
        res.append(' <wp:post_date>%s</wp:post_date>' % self.get_date_wp(post.published))
        res.append(' <wp:post_date_gmt>%s</wp:post_date_gmt>'
                   % self.get_date_wp(post.published, utc=True))
        res.append(' <wp:comment_status>%s</wp:comment_status>' % self.comment_status)
        res.append(' <wp:ping_status>closed</wp:ping_status>')
        if slug:
            res.append(' <wp:post_name>%s</wp:post_name>' % self._text(slug))
        res.append(' <wp:status>%s</wp:status>' % status)
        res.append(' <wp:post_parent>0</wp:post_parent>')
        res.append(' <wp:menu_order>0</wp:menu_order>')
        res.append(' <wp:post_type>post</wp:post_type>')
        res.append(' <wp:post_password></wp:post_password>')
        res.append(' <wp:is_sticky>0</wp:is_sticky>')
        for nicename, label in self.categories:
            res.append(' <category domain="category" nicename="%s"><![CDATA[%s]]></category>'
                       % (self._attr(nicename), self.escape(label)))
        for label in post.labels:
            res.append(' <category domain="post_tag" nicename="%s"><![CDATA[%s]]></category>'
                       % (self._attr(label), self.escape(label)))
        res.append(' <wp:postmeta>')
        res.append(' <wp:meta_key>blogger_id</wp:meta_key>')
        res.append(' <wp:meta_value>%s</wp:meta_value>' % self._text(post.entry_id))
        res.append(' </wp:postmeta>')
        if post.permalink:
            res.append(' <wp:postmeta>')
            res.append(' <wp:meta_key>blogger_permalink</wp:meta_key>')
            res.append(' <wp:meta_value>%s</wp:meta_value>' % self._text(post.permalink))
            res.append(' </wp:postmeta>')
        if post.url:
            res.append(' <wp:postmeta>')
            res.append(' <wp:meta_key>blogger_url</wp:meta_key>')
            res.append(' <wp:meta_value>%s</wp:meta_value>' % self._text(post.url))
            res.append(' </wp:postmeta>')
        for comment in post.comments:
            res += self.get_comment(comment)
        res.append('</item>')
        return res
# Convert the backup named on the command line and emit the WXR document on
# standard output; redirect stdout to save it, e.g.
#     python blogger-to-wordpress.py backup.xml > wordpress.xml
p = BlogParser(inp)
blog = p.parse()
writer = WXRWriter(blog)
xml = writer.write()
# write() returns UTF-8 encoded bytes. On Python 3 bytes must go to the
# binary buffer behind sys.stdout; Python 2's sys.stdout accepts them directly.
stream = getattr(sys.stdout, 'buffer', sys.stdout)
stream.write(xml)
stream.write(b'\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment