Skip to content

Instantly share code, notes, and snippets.

@AviDuda
Forked from fajran/blogger-to-wordpress.py
Last active November 1, 2023 12:00

Revisions

  1. AviDuda renamed this gist Mar 18, 2016. 1 changed file with 63 additions and 75 deletions.
    138 changes: 63 additions & 75 deletions blogger-to-wordpress.py → blogger-to-disqus.py
    Original file line number Diff line number Diff line change
    @@ -1,27 +1,32 @@
    # Blogger's backup file to WordPress' WXR converter.
    # -*- coding: utf-8 -*-

    # Blogger's backup file to WordPress' WXR converter for Disqus.
    #
    # Original script: https://gist.github.com/fajran/5659455
    #
    # Only tested with posts and comments, and NOT with pages.
    # May not be efficient for huge blogs since the script keep
    # May not be efficient for huge blogs since the script keeps
    # all content in the memory during conversion.
    #
    # Released as public domain.
    #
    # Please note that I converted the labels in Blogspot
    # as tags in WordPress. I also hardcoded two categories for the
    # WordPress posts. Adjust these first to suit your need.
    # Required library: dateutil - https://labix.org/python-dateutil
    #
    # Usage: python blogger-to-disqus.py blogger.xml > output.xml

    import sys
    from datetime import datetime
    from xml.dom.minidom import parse, parseString
    from xml.dom import Node
    import cgi
    from HTMLParser import HTMLParser

    import dateutil.parser

    inp = sys.argv[1]

    def d(*msg):
    print >>sys.stderr, ' '.join(map(str, msg))
    print >> sys.stderr, ' '.join(map(str, msg))

    class Blog(object):
    class Author(object):
    @@ -40,6 +45,7 @@ class Entry(object):
    published = None
    updated = None
    author = None
    related = None

    class Post(Entry):
    draft = False
    @@ -113,6 +119,11 @@ def parse_author(self, author):

    if name == 'name':
    data.name = self.get_text(child)
    if len(data.name) < 3:
    data.name = 'Anonymous'
    else:
    # automatically generated email has a max of 75 characters including @wordpress.disqus.net, so use a sane value
    data.name = data.name[:42]
    elif name == 'uri':
    data.uri = self.get_text(child)
    elif name == 'email':
    @@ -138,8 +149,8 @@ def parse_entries(self, feed):
    total = len(self.blog.posts)
    d('total posts:', total)
    d('total comments:', len(self.comments))
    for i, post in enumerate(self.blog.posts):
    d('(%d / %d) -> %d: %s' % (i+1, total, len(post.comments), post.title))
    # for i, post in enumerate(self.blog.posts):
    # d('(%d / %d) -> %d: %s' % (i+1, total, len(post.comments), post.title))

    def assign_comments(self):
    i = 0
    @@ -151,7 +162,7 @@ def assign_comments(self):
    post = self.post_ids[entry_id]
    post.comments.append(comment)

    d('%s. comment: %s -> %s' % (i+1, id(comment), id(post)))
    # d('%s. comment: %s -> %s' % (i+1, id(comment), id(post)))
    i += 1

    def parse_category(self, category):
    @@ -200,6 +211,7 @@ def parse_entry_common(self, entry, target):
    target.title_type = child.attributes['type'].nodeValue
    elif name == 'content':
    target.content = self.get_text(child)
    target.content = target.content.ljust(3, '.') # Disqus requires 3 characters
    target.content_type = child.attributes['type'].nodeValue
    elif name == 'author':
    target.author = self.parse_author(child)
    @@ -241,6 +253,10 @@ def parse_comment(self, entry):
    ref = child.attributes['ref'].nodeValue
    comment.post_entry_id = ref

    if name == 'link' and child.attributes['rel'].nodeValue == 'related':
    related = child.attributes['href'].nodeValue
    comment.related = related[related.rfind('/') + 1 :]

    return comment

    class WXRWriter(object):
    @@ -262,15 +278,12 @@ def get_header(self):
    res = []
    res.append('<?xml version="1.0" encoding="UTF-8" ?>')
    res.append('<rss version="2.0"')
    res.append(' xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"')
    res.append(' xmlns:content="http://purl.org/rss/1.0/modules/content/"')
    res.append(' xmlns:wfw="http://wellformedweb.org/CommentAPI/"')
    res.append(' xmlns:dsq="http://www.disqus.com/"')
    res.append(' xmlns:dc="http://purl.org/dc/elements/1.1/"')
    res.append(' xmlns:wp="http://wordpress.org/export/1.2/">')
    res.append(' xmlns:wp="http://wordpress.org/export/1.0/">')

    res.append('<channel>')
    res.append('<title>%s</title>' % self.blog.title)
    res.append('<wp:wxr_version>1.2</wp:wxr_version>')

    return res

    @@ -295,9 +308,19 @@ def get_date_wp(self, ts):
    return ts.strftime("%Y-%m-%d %H:%M:%S")

    def escape(self, s):
    return s
    return cgi.escape(s).encode('ascii', 'xmlcharrefreplace')

    def unescape(self, s):
    parser = HTMLParser()
    return parser.unescape(s)

    def get_comment_id(self, comment):
    self.comment_id += 1

    comment_id_blogger = comment.entry_id[comment.entry_id.rfind('-') + 1 :]

    self.post_comment_ids[comment_id_blogger] = self.comment_id

    def get_comment(self, comment):
    status = 1

    @@ -307,31 +330,25 @@ def get_comment(self, comment):

    res.append(' <wp:comment>')
    res.append(' <wp:comment_id>%s</wp:comment_id>' % self.comment_id)
    if comment.author.name:
    res.append(' <wp:comment_author>%s</wp:comment_author>' % comment.author.name)
    if comment.author.email:
    res.append(' <wp:comment_author_email>%s</wp:comment_author_email>' % comment.author.email)

    if not comment.author.name:
    comment.author.name = 'Anonymous'
    res.append(' <wp:comment_author><![CDATA[%s]]></wp:comment_author>' % comment.author.name)
    # Blogger has the [email protected] email by default. Uncomment if you need it.
    # if comment.author.email:
    # res.append(' <wp:comment_author_email>%s</wp:comment_author_email>' % comment.author.email)
    if comment.author.uri:
    res.append(' <wp:comment_author_url>%s</wp:comment_author_url>' % comment.author.uri)
    res.append(' <wp:comment_author_url><![CDATA[%s]]></wp:comment_author_url>' % comment.author.uri)
    res.append(' <wp:comment_author_IP>%s</wp:comment_author_IP>' % '')
    res.append(' <wp:comment_date>%s</wp:comment_date>' % self.get_date_wp(comment.published))
    res.append(' <wp:comment_date_gmt>%s</wp:comment_date_gmt>' % self.get_date_wp(comment.published))
    res.append(' <wp:comment_content><![CDATA[%s]]></wp:comment_content>' % self.escape(comment.content))
    res.append(' <wp:comment_content><![CDATA[%s]]></wp:comment_content>' % self.unescape(comment.content))
    res.append(' <wp:comment_approved>%s</wp:comment_approved>' % status)
    res.append(' <wp:commentmeta>')
    res.append(' <wp:meta_key>blogger_id</wp:meta_key>')
    res.append(' <wp:meta_value>%s</wp:meta_value>' % comment.entry_id)
    res.append(' </wp:commentmeta>')
    if comment.permalink:
    res.append(' <wp:commentmeta>')
    res.append(' <wp:meta_key>blogger_permalink</wp:meta_key>')
    res.append(' <wp:meta_value>%s</wp:meta_value>' % comment.permalink)
    res.append(' </wp:commentmeta>')
    if comment.url:
    res.append(' <wp:commentmeta>')
    res.append(' <wp:meta_key>blogger_url</wp:meta_key>')
    res.append(' <wp:meta_value>%s</wp:meta_value>' % comment.url)
    res.append(' </wp:commentmeta>')
    if comment.related:
    if comment.related in self.post_comment_ids:
    res.append(' <wp:comment_parent>%s</wp:comment_parent>' % self.post_comment_ids[comment.related])
    else:
    d('could not find related comment %s for comment entry %s (comment_id %s)' % (comment.related, comment.entry_id, self.comment_id))

    res.append(' </wp:comment>')

    return res
    @@ -354,44 +371,19 @@ def get_post(self, post):
    self.post_id += 1

    res.append('<item>')
    res.append(' <title>%s</title>' % post.title)
    res.append(' <pubDate>%s</pubDate>' % self.get_date(post.published))
    res.append(' <dc:creator>%s</dc:creator>' % post.author.name)
    res.append(' <guid isPermaLink="true">%s</guid>' % post.permalink)
    res.append(' <description></description/>')
    res.append(' <title><![CDATA[%s]]></title>' % self.escape(post.title))
    res.append(' <link>%s</link>' % post.url)
    res.append(' <content:encoded><![CDATA[%s]]></content:encoded>' % self.escape(post.content))
    res.append(' <excerpt:encoded><![CDATA[%s]]></excerpt:encoded>' % self.escape(post.content))
    res.append(' <wp:post_id>%s</wp:post_id>' % self.post_id)
    res.append(' <wp:post_date>%s</wp:post_date>' % self.get_date_wp(post.published))
    res.append(' <wp:post_date_gmt>%s</wp:post_date_gmt>' % self.get_date_wp(post.published))
    res.append(' <wp:comment_status>%s</wp:comment_status>' % self.comment_status)
    res.append(' <wp:ping_status>closed</wp:ping_status>')
    if slug:
    res.append(' <wp:post_name>%s</wp:post_name>' % slug)
    res.append(' <wp:status>%s</wp:status>' % status)
    res.append(' <wp:post_parent>0</wp:post_parent>')
    res.append(' <wp:menu_order>0</wp:menu_order>')
    res.append(' <wp:post_type>post</wp:post_type>')
    res.append(' <wp:post_password></wp:post_password>')
    res.append(' <wp:is_sticky>0</wp:is_sticky>')
    res.append(' <category domain="category" nicename="id"><![CDATA[Bahasa Indonesia]]></category>')
    res.append(' <category domain="category" nicename="hacking"><![CDATA[Hacking]]></category>')
    for label in post.labels:
    res.append(' <category domain="post_tag" nicename="%s"><![CDATA[%s]]></category>' % (label, label))
    res.append(' <wp:postmeta>')
    res.append(' <wp:meta_key>blogger_id</wp:meta_key>')
    res.append(' <wp:meta_value>%s</wp:meta_value>' % post.entry_id)
    res.append(' </wp:postmeta>')
    if post.permalink:
    res.append(' <wp:postmeta>')
    res.append(' <wp:meta_key>blogger_permalink</wp:meta_key>')
    res.append(' <wp:meta_value>%s</wp:meta_value>' % post.permalink)
    res.append(' </wp:postmeta>')
    if post.url:
    res.append(' <wp:postmeta>')
    res.append(' <wp:meta_key>blogger_url</wp:meta_key>')
    res.append(' <wp:meta_value>%s</wp:meta_value>' % post.url)
    res.append(' </wp:postmeta>')

    self.post_comment_ids = {}
    old_comment_id = self.comment_id

    for comment in post.comments:
    self.get_comment_id(comment)

    self.comment_id = old_comment_id

    for comment in post.comments:
    res += self.get_comment(comment)
    @@ -406,7 +398,3 @@ def get_post(self, post):
    xml = writer.write()

    print xml

    # f = open(out, 'w')
    # f.write(xml)
    # f.close()
  2. @fajran fajran revised this gist May 27, 2013. 1 changed file with 12 additions and 0 deletions.
    12 changes: 12 additions & 0 deletions blogger-to-wordpress.py
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,15 @@
    # Blogger's backup file to WordPress' WXR converter.
    #
    # Only tested with posts and comments, and NOT with pages.
    # May not be efficient for huge blogs since the script keeps
    # all content in memory during conversion.
    #
    # Released as public domain.
    #
    # Please note that I converted the labels in Blogspot
    # to tags in WordPress. I also hardcoded two categories for the
    # WordPress posts. Adjust these first to suit your needs.

    import sys
    from datetime import datetime
    from xml.dom.minidom import parse, parseString
  3. @fajran fajran created this gist May 27, 2013.
    400 changes: 400 additions & 0 deletions blogger-to-wordpress.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,400 @@
    import sys
    from datetime import datetime
    from xml.dom.minidom import parse, parseString
    from xml.dom import Node
    import cgi

    import dateutil.parser

    # Path of the input file, taken from the first command-line argument.
    # It is handed to BlogParser at the bottom of the script. Raises
    # IndexError when the script is started without an argument.
    inp = sys.argv[1]

    def d(*msg):
        """Write a debug line to stderr.

        The arguments are converted to text, joined with single spaces and
        terminated with a newline. stdout is left untouched because the
        script prints the generated XML there.

        On Python 2, ``str()`` of a ``unicode`` object holding non-ASCII
        characters (for example a post title) raises ``UnicodeEncodeError``;
        such values are encoded as UTF-8 instead so that a debug message can
        never abort the conversion.
        """
        parts = []
        for item in msg:
            try:
                parts.append(str(item))
            except UnicodeError:
                # Only reachable on Python 2 with a non-ASCII unicode value.
                parts.append(item.encode('utf-8'))
        sys.stderr.write(' '.join(parts) + '\n')

    class Blog(object):
        """In-memory model of a blog: its author plus a list of posts.

        The nested classes are plain attribute containers that the parser
        fills in and the writer reads back. The parser additionally sets
        ``blog_id``, ``updated`` and ``title`` on the instance when the feed
        provides them.
        """

        class Author(object):
            """Name, e-mail address and URI of an author (all optional)."""
            name = None
            email = None
            uri = None

        class Entry(object):
            """Fields shared by posts and comments; ``None`` means "not set"."""
            entry_id = None
            url = None
            permalink = None
            title = None
            title_type = None
            content = None
            content_type = None
            published = None
            updated = None
            author = None

        class Post(Entry):
            """A blog post with its labels and the comments attached to it."""
            draft = False

            def __init__(self):
                # Created per instance so posts never share these lists.
                self.labels = []
                self.comments = []

        class Comment(Entry):
            """A comment; ``post_entry_id`` is the id of the post it replies to."""
            # Default for comments that carry no reply reference, so readers
            # of this attribute get None instead of an AttributeError.
            post_entry_id = None

        def __init__(self):
            # Per-instance state: class-level mutables would be shared by
            # every Blog object created in the same process.
            self.author = self.Author()
            self.posts = []

    class BlogParser(object):
        """Parse an Atom backup file into a ``Blog`` object tree.

        Entries whose "kind" category marks them as posts or comments are
        collected; every other entry is ignored. Comments are attached to
        the post named by their ``in-reply-to`` reference.
        """

        KIND_SCHEME = 'http://schemas.google.com/g/2005#kind'
        KIND_POST = 'http://schemas.google.com/blogger/2008/kind#post'
        KIND_COMMENT = 'http://schemas.google.com/blogger/2008/kind#comment'
        LABEL_SCHEME = 'http://www.blogger.com/atom/ns#'
        APP_NS = 'http://purl.org/atom/app#'
        THREAD_NS = 'http://purl.org/syndication/thread/1.0'

        def __init__(self, atom_file):
            """Remember the path of the Atom file; nothing is read yet."""
            self.atom_file = atom_file

        def parse(self):
            """Read ``self.atom_file`` and return the populated ``Blog``.

            If the document has no top-level ``feed`` element the returned
            blog is left empty.
            """
            self.blog = Blog()

            # Binary mode lets the XML parser honour the encoding declared in
            # the document; the ``with`` block closes the handle that the
            # previous ``parse(open(...))`` call leaked.
            with open(self.atom_file, 'rb') as source:
                dom = parse(source)

            feed = None
            for child in dom.childNodes:
                if child.nodeName == 'feed':
                    feed = child
                    break

            if feed is not None:
                self.parse_metadata(feed)
                self.parse_entries(feed)

            return self.blog

        def get_text(self, el):
            """Return the concatenated text found below ``el``.

            Text and CDATA nodes contribute their value, elements contribute
            the text of all their descendants, any other node type yields an
            empty string.
            """
            if el.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
                return el.nodeValue

            if el.nodeType != Node.ELEMENT_NODE:
                return ''

            return ''.join(self.get_text(child) for child in el.childNodes)

        def parse_date(self, txt):
            """Convert a timestamp string into a datetime via dateutil."""
            return dateutil.parser.parse(txt)

        def parse_metadata(self, feed):
            """Copy id, updated, title and author of the feed onto the blog.

            Scanning stops at the first ``entry`` element, so only the
            elements that precede the entries are considered.
            """
            for child in feed.childNodes:
                name = child.nodeName.split(':')[-1]

                if name == 'entry':
                    break

                if name == 'id':
                    self.blog.blog_id = self.get_text(child)
                elif name == 'updated':
                    self.blog.updated = self.parse_date(self.get_text(child))
                elif name == 'title':
                    self.blog.title = self.get_text(child)
                elif name == 'author':
                    self.blog.author = self.parse_author(child)

        def parse_author(self, author):
            """Build a ``Blog.Author`` from an ``author`` element."""
            data = Blog.Author()

            for child in author.childNodes:
                name = child.nodeName.split(':')[-1]

                if name == 'name':
                    data.name = self.get_text(child)
                elif name == 'uri':
                    data.uri = self.get_text(child)
                elif name == 'email':
                    data.email = self.get_text(child)

            return data

        def parse_entries(self, feed):
            """Parse every ``entry`` child of ``feed`` and link comments to posts.

            Resets the parser's collections, stores the resulting posts on
            ``self.blog.posts`` and writes a summary to stderr through ``d``.
            """
            self.posts = []
            self.comments = []
            self.post_ids = {}
            self.comment_ids = {}

            for child in feed.childNodes:
                if child.nodeName != 'entry':
                    continue
                self.parse_entry(child)

            self.assign_comments()

            self.blog.posts = self.posts

            total = len(self.blog.posts)
            d('total posts:', total)
            d('total comments:', len(self.comments))
            for i, post in enumerate(self.blog.posts):
                d('(%d / %d) -> %d: %s' % (i + 1, total, len(post.comments), post.title))

        def assign_comments(self):
            """Append each comment to the post it replies to.

            Comments without a reply reference, or whose reference matches no
            parsed post, are skipped.
            """
            assigned = 0
            for comment in self.comments:
                # getattr: a comment object may lack the attribute entirely.
                entry_id = getattr(comment, 'post_entry_id', None)
                if entry_id not in self.post_ids:
                    continue

                post = self.post_ids[entry_id]
                post.comments.append(comment)

                assigned += 1
                d('%s. comment: %s -> %s' % (assigned, id(comment), id(post)))

        def parse_category(self, category):
            """Return ``(scheme, term)`` of a ``category`` element.

            A missing attribute is reported as an empty string instead of
            raising ``KeyError``.
            """
            return category.getAttribute('scheme'), category.getAttribute('term')

        def get_kind(self, entry):
            """Return the term of the entry's "kind" category, or ``None``."""
            for child in entry.childNodes:
                if child.nodeName == 'category':
                    scheme, term = self.parse_category(child)
                    if scheme == self.KIND_SCHEME:
                        return term
            return None

        def parse_entry(self, entry):
            """Parse one entry and register it as a post or a comment."""
            kind = self.get_kind(entry)
            if kind == self.KIND_POST:
                post = self.parse_post(entry)
                self.posts.append(post)
                self.post_ids[post.entry_id] = post
            elif kind == self.KIND_COMMENT:
                comment = self.parse_comment(entry)
                self.comments.append(comment)
                self.comment_ids[comment.entry_id] = comment

        def get_draft(self, control):
            """Return True when ``control`` holds a ``draft`` child saying "yes"."""
            for child in control.childNodes:
                name = child.nodeName.split(':')[-1]
                if name == 'draft':
                    return self.get_text(child) == 'yes'
            return False

        def parse_entry_common(self, entry, target):
            """Fill ``target`` with the fields shared by posts and comments.

            Handles id, published, updated, title, content, author and the
            ``self`` / ``alternate`` links. Missing ``type`` and ``rel``
            attributes fall back to the Atom defaults ("text" and
            "alternate") instead of raising ``KeyError``; links without an
            ``href`` are ignored.
            """
            for child in entry.childNodes:
                if child.nodeType != Node.ELEMENT_NODE:
                    continue

                name = child.nodeName.split(':')[-1]

                if name == 'id':
                    target.entry_id = self.get_text(child)
                elif name == 'published':
                    target.published = self.parse_date(self.get_text(child))
                elif name == 'updated':
                    target.updated = self.parse_date(self.get_text(child))
                elif name == 'title':
                    target.title = self.get_text(child)
                    target.title_type = child.getAttribute('type') or 'text'
                elif name == 'content':
                    target.content = self.get_text(child)
                    target.content_type = child.getAttribute('type') or 'text'
                elif name == 'author':
                    target.author = self.parse_author(child)
                elif name == 'link':
                    rel = child.getAttribute('rel') or 'alternate'
                    href = child.getAttribute('href')
                    if not href:
                        continue

                    if rel == 'self':
                        target.permalink = href
                    elif rel == 'alternate':
                        target.url = href

        def parse_post(self, entry):
            """Build a ``Blog.Post`` including its labels and draft flag."""
            post = Blog.Post()
            self.parse_entry_common(entry, post)

            for child in entry.childNodes:
                name = child.nodeName.split(':')[-1]
                ns = child.namespaceURI

                if name == 'category':
                    scheme, term = self.parse_category(child)
                    if scheme == self.LABEL_SCHEME:
                        post.labels.append(term)
                elif ns == self.APP_NS and name == 'control':
                    post.draft = self.get_draft(child)

            return post

        def parse_comment(self, entry):
            """Build a ``Blog.Comment`` and record the post it replies to."""
            comment = Blog.Comment()
            # Always defined, even when the entry has no in-reply-to element.
            comment.post_entry_id = None
            self.parse_entry_common(entry, comment)

            for child in entry.childNodes:
                name = child.nodeName.split(':')[-1]
                ns = child.namespaceURI

                if ns == self.THREAD_NS and name == 'in-reply-to':
                    comment.post_entry_id = child.getAttribute('ref') or None

            return comment

    class WXRWriter(object):
        """Serialise a parsed blog as a WordPress WXR (RSS 2.0) document.

        Every post becomes an ``<item>`` element that carries its comments.
        Posts whose content is empty or whitespace-only are left out.
        """

        comment_status = 'open'

        # (nicename, display name) pairs written as categories on every post.
        # Override on a subclass or instance to suit the target site.
        categories = (
            ('id', 'Bahasa Indonesia'),
            ('hacking', 'Hacking'),
        )

        def __init__(self, blog):
            """Keep a reference to the blog that ``write`` will serialise."""
            self.blog = blog

        def write(self):
            """Return the complete document as UTF-8 encoded bytes.

            Post and comment counters restart at zero on every call. Each
            generated line is stripped of surrounding whitespace before the
            lines are joined with newlines.
            """
            self.post_id = 0
            self.comment_id = 0

            doc = self.get_header() + self.get_entries() + self.get_footer()
            doc = '\n'.join(line.strip() for line in doc)
            return doc.encode('utf-8')

        def get_header(self):
            """Return the XML declaration, the ``<rss>`` opening tag with its
            namespace declarations, and the channel preamble."""
            return [
                '<?xml version="1.0" encoding="UTF-8" ?>',
                '<rss version="2.0"',
                ' xmlns:excerpt="http://wordpress.org/export/1.2/excerpt/"',
                ' xmlns:content="http://purl.org/rss/1.0/modules/content/"',
                ' xmlns:wfw="http://wellformedweb.org/CommentAPI/"',
                ' xmlns:dc="http://purl.org/dc/elements/1.1/"',
                ' xmlns:wp="http://wordpress.org/export/1.2/">',
                '<channel>',
                # getattr: the title attribute is absent when the feed had none.
                '<title>%s</title>' % self.escape_text(getattr(self.blog, 'title', None)),
                '<wp:wxr_version>1.2</wp:wxr_version>',
            ]

        def get_footer(self):
            """Return the lines closing the channel and the document."""
            return ['</channel>', '</rss>']

        def get_entries(self):
            """Return the concatenated ``<item>`` lines of all posts."""
            res = []
            for post in self.blog.posts:
                res += self.get_post(post)
            return res

        def _to_utc(self, ts):
            """Return ``ts`` as a naive UTC datetime.

            Naive datetimes carry no offset and are returned unchanged.
            """
            offset = ts.utcoffset()
            if offset is None:
                return ts
            return (ts - offset).replace(tzinfo=None)

        def get_date(self, ts):
            """Format ``ts`` for ``<pubDate>``.

            The value is converted to UTC first so that the hard-coded
            ``+0000`` suffix is truthful for offset-aware timestamps.
            """
            return self._to_utc(ts).strftime("%a, %d %b %Y %H:%M:%S +0000")

        def get_date_wp(self, ts):
            """Format ``ts`` as ``YYYY-MM-DD HH:MM:SS`` in its own timezone."""
            return ts.strftime("%Y-%m-%d %H:%M:%S")

        def get_date_wp_gmt(self, ts):
            """Format ``ts`` as ``YYYY-MM-DD HH:MM:SS`` in UTC (``*_date_gmt``)."""
            return self._to_utc(ts).strftime("%Y-%m-%d %H:%M:%S")

        def escape(self, s):
            """Make ``s`` safe for embedding in a ``<![CDATA[...]]>`` section.

            The text is passed through unchanged except that the terminator
            ``]]>`` is split across two CDATA sections. ``None`` becomes an
            empty string.
            """
            if s is None:
                return ''
            return s.replace(']]>', ']]]]><![CDATA[>')

        def escape_text(self, s):
            """Escape ``s`` for use as element text or a quoted attribute value.

            ``None`` becomes an empty string.
            """
            if s is None:
                return ''
            # '&' must be replaced first so the other entities stay intact.
            s = s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
            return s.replace('"', '&quot;')

        def _meta(self, tag, key, value):
            """Return the four lines of one ``<wp:postmeta>``/``<wp:commentmeta>``."""
            return [
                ' <wp:%s>' % tag,
                ' <wp:meta_key>%s</wp:meta_key>' % key,
                ' <wp:meta_value>%s</wp:meta_value>' % self.escape_text(value),
                ' </wp:%s>' % tag,
            ]

        def get_comment(self, comment):
            """Return the ``<wp:comment>`` lines for ``comment``.

            Increments ``self.comment_id`` and uses the new value as the
            comment id. Author fields are written only when they are set.
            ``comment.published`` must be a datetime.
            """
            status = 1
            text = self.escape_text
            author = comment.author

            self.comment_id += 1

            res = []
            res.append(' <wp:comment>')
            res.append(' <wp:comment_id>%s</wp:comment_id>' % self.comment_id)
            if author is not None:
                if author.name:
                    res.append(' <wp:comment_author>%s</wp:comment_author>' % text(author.name))
                if author.email:
                    res.append(' <wp:comment_author_email>%s</wp:comment_author_email>' % text(author.email))
                if author.uri:
                    res.append(' <wp:comment_author_url>%s</wp:comment_author_url>' % text(author.uri))
            res.append(' <wp:comment_author_IP>%s</wp:comment_author_IP>' % '')
            res.append(' <wp:comment_date>%s</wp:comment_date>' % self.get_date_wp(comment.published))
            res.append(' <wp:comment_date_gmt>%s</wp:comment_date_gmt>' % self.get_date_wp_gmt(comment.published))
            res.append(' <wp:comment_content><![CDATA[%s]]></wp:comment_content>' % self.escape(comment.content))
            res.append(' <wp:comment_approved>%s</wp:comment_approved>' % status)
            res += self._meta('commentmeta', 'blogger_id', comment.entry_id)
            if comment.permalink:
                res += self._meta('commentmeta', 'blogger_permalink', comment.permalink)
            if comment.url:
                res += self._meta('commentmeta', 'blogger_url', comment.url)
            res.append(' </wp:comment>')

            return res

        def get_post(self, post):
            """Return the ``<item>`` lines for ``post`` including its comments.

            Returns an empty list, without consuming a post id, when the post
            has no content. The slug is the last path segment of ``post.url``
            with a trailing ``.html`` removed. ``post.published`` must be a
            datetime.
            """
            if not (post.content or '').strip():
                return []

            text = self.escape_text

            slug = None
            if post.url is not None:
                slug = post.url.split('/')[-1]
                # Strip the extension only when it is there; a blind
                # slug[:-5] truncated slugs of URLs without ".html".
                if slug.endswith('.html'):
                    slug = slug[:-len('.html')]

            status = 'draft' if post.draft else 'publish'
            creator = post.author.name if post.author is not None else None

            self.post_id += 1

            res = []
            res.append('<item>')
            res.append(' <title>%s</title>' % text(post.title))
            res.append(' <pubDate>%s</pubDate>' % self.get_date(post.published))
            res.append(' <dc:creator>%s</dc:creator>' % text(creator))
            res.append(' <guid isPermaLink="true">%s</guid>' % text(post.permalink))
            res.append(' <description></description>')
            res.append(' <content:encoded><![CDATA[%s]]></content:encoded>' % self.escape(post.content))
            res.append(' <excerpt:encoded><![CDATA[%s]]></excerpt:encoded>' % self.escape(post.content))
            res.append(' <wp:post_id>%s</wp:post_id>' % self.post_id)
            res.append(' <wp:post_date>%s</wp:post_date>' % self.get_date_wp(post.published))
            res.append(' <wp:post_date_gmt>%s</wp:post_date_gmt>' % self.get_date_wp_gmt(post.published))
            res.append(' <wp:comment_status>%s</wp:comment_status>' % self.comment_status)
            res.append(' <wp:ping_status>closed</wp:ping_status>')
            if slug:
                res.append(' <wp:post_name>%s</wp:post_name>' % text(slug))
            res.append(' <wp:status>%s</wp:status>' % status)
            res.append(' <wp:post_parent>0</wp:post_parent>')
            res.append(' <wp:menu_order>0</wp:menu_order>')
            res.append(' <wp:post_type>post</wp:post_type>')
            res.append(' <wp:post_password></wp:post_password>')
            res.append(' <wp:is_sticky>0</wp:is_sticky>')
            for nicename, label in self.categories:
                res.append(' <category domain="category" nicename="%s"><![CDATA[%s]]></category>'
                           % (text(nicename), self.escape(label)))
            for label in post.labels:
                res.append(' <category domain="post_tag" nicename="%s"><![CDATA[%s]]></category>'
                           % (text(label), self.escape(label)))
            res += self._meta('postmeta', 'blogger_id', post.entry_id)
            if post.permalink:
                res += self._meta('postmeta', 'blogger_permalink', post.permalink)
            if post.url:
                res += self._meta('postmeta', 'blogger_url', post.url)

            for comment in post.comments:
                res += self.get_comment(comment)

            res.append('</item>')
            return res

    # Convert the input file and print the resulting WXR document on stdout
    # (debug output goes to stderr, so stdout can be redirected to a file).
    p = BlogParser(inp)
    blog = p.parse()

    writer = WXRWriter(blog)
    xml = writer.write()

    # ``xml`` is a UTF-8 byte string. Write it through the binary buffer when
    # the stream has one (Python 3); otherwise write to the stream itself,
    # which is what the former ``print xml`` statement did on Python 2.
    # The trailing newline matches what ``print`` emitted.
    getattr(sys.stdout, 'buffer', sys.stdout).write(xml + b'\n')