-
-
Save poppingtonic/e3d0df392b90c1403b29017a336678d2 to your computer and use it in GitHub Desktop.
Python script that imports Wikipedia XML dump data into PostgreSQL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import psycopg2 | |
import xml.sax | |
from xml.sax.handler import ContentHandler | |
from dicttoxml import dicttoxml | |
# Parameterized statement: the database driver quotes and escapes the bound
# values itself, so the placeholders must NOT be wrapped in quotes and the
# statement must NOT be filled in with the ``%`` string operator.
INSERT_STMT = "INSERT INTO pages (id, page) VALUES (%s, %s)"
# Number of inserted pages between two intermediate commits.
COMMIT_WINDOW = 10000


class WikidataHandler(ContentHandler):
    """SAX handler that inserts every non-redirect ``<page>`` into ``pages``.

    While the parser is inside a ``<page>`` element the handler rebuilds that
    element as a nested dict (``self.doc``).  When ``</page>`` is reached the
    dict is serialized with ``dicttoxml`` and inserted through ``conn`` using
    ``INSERT_STMT``.  Only the first occurrence of a tag at a given path is
    kept; later siblings with the same name are ignored.

    Args:
        conn: DB-API connection providing ``cursor()`` and ``commit()``.
        stop: optional maximum number of pages to insert.  Once reached, an
            ``xml.sax.SAXException`` is raised to abort parsing.  ``None`` or
            ``0`` means no limit.
    """

    def __init__(self, conn, stop=None):
        # Explicit base-class call (instead of super()) so the base handler
        # state is initialized regardless of how the base class is declared.
        ContentHandler.__init__(self)
        self.in_page = False
        self.buff = []  # raw character chunks of the element being read
        self.doc = {}  # dict holding xml tag and data
        self.tags = []  # list holding xml path (root to leaf node)
        self.conn = conn
        self.stop = stop
        self.processed = 0

    def startElement(self, name, attrs):
        """Track the element path once a ``<page>`` element has been opened."""
        if name == "page":
            self.in_page = True
        if self.in_page:
            # dig xml path hierarchy
            self.tags.append(name)

    def endElement(self, name):
        """Store the text of the closed element; flush the page on ``</page>``.

        Raises:
            xml.sax.SAXException: when ``stop`` pages have been inserted.
        """
        if not self.in_page:
            return
        # The parser may deliver one text node in several chunks (typically
        # one per line), so whitespace is trimmed only once, on the joined
        # text; trimming every chunk would drop line breaks inside the text.
        text = "".join(self.buff).strip()
        self.buff = []

        # Walk (and create where missing) the dict path for the current
        # element.  Only the last path component may receive the text.
        node = self.doc.setdefault(self.tags[0], {})
        last = len(self.tags) - 1
        for depth, tag in enumerate(self.tags[1:], 1):
            if not isinstance(node, dict):
                # An earlier sibling with this name was stored as plain text;
                # there is nowhere to attach children, so leave it alone.
                break
            if tag not in node:
                node[tag] = text if (depth == last and text) else {}
            node = node[tag]

        # up xml path hierarchy
        self.tags.pop()
        if name == "page":
            self._finish_page()

    def characters(self, content):
        """Buffer raw character data received while inside a ``<page>``."""
        if self.in_page:
            self.buff.append(content)

    @staticmethod
    def _is_redirect(page):
        """Return True if the page's revision text starts with ``#REDIRECT``.

        The comparison ignores case, so ``#redirect`` is detected as well.
        Pages without a revision or without text are not redirects.
        """
        revision = page.get("revision")
        if not isinstance(revision, dict):
            return False
        text = revision.get("text")
        if not text or isinstance(text, dict):
            return False
        return text[:9].upper() == "#REDIRECT"

    def _finish_page(self):
        """Insert the collected page, then enforce commit window and stop."""
        doc = self.doc
        # Reset the per-page state first so every exit path below (skip,
        # error, stop) leaves the handler ready for the next page.
        self.in_page = False
        self.doc = {}
        self.tags = []

        page = doc.get("page")
        if not isinstance(page, dict) or self._is_redirect(page):
            # skip redirect page
            return
        page_id = page.get("id")
        if not page_id or isinstance(page_id, dict):
            # Without a usable id the row cannot be inserted; skip the page.
            return
        try:
            # dict to xml string
            xmlstr = dicttoxml(doc, root=False)
        except Exception:
            # Conversion failed: skip this page (best effort), but let
            # KeyboardInterrupt / SystemExit propagate.
            return
        if isinstance(xmlstr, bytes):
            xmlstr = xmlstr.decode("utf-8")

        # insert to db -- values are bound as parameters, never interpolated
        cur = self.conn.cursor()
        try:
            cur.execute(INSERT_STMT, (page_id, xmlstr))
        finally:
            cur.close()

        self.processed += 1
        if self.processed % COMMIT_WINDOW == 0:
            print("%d pages were processed." % self.processed)
            self.conn.commit()
        if self.stop and self.processed >= self.stop:
            raise xml.sax.SAXException("%d docs already processed." % self.processed)
def main(argv=None):
    """Import a Wikipedia XML dump into the local ``wikipedia`` database.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  ``argv[0]`` is the dump file to parse and the
            optional ``argv[1]`` is the maximum number of pages to insert.
    """
    import sys

    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit("usage: %s DUMP_XML [MAX_PAGES]" % sys.argv[0])
    fname = args[0]
    stop = int(args[1]) if len(args) > 1 else None

    conn = psycopg2.connect(host="localhost", port=5432, database="wikipedia")
    handler = WikidataHandler(conn, stop)
    try:
        xml.sax.parse(fname, handler)
    except xml.sax.SAXException as exc:
        # The handler aborts parsing with a SAXException once ``stop`` pages
        # were inserted; that is a normal end of the run.  Anything else
        # (e.g. a malformed dump) is a real error and is re-raised.
        if not stop or handler.processed < stop:
            raise
        print(exc.getMessage())
    finally:
        # Pages inserted so far are committed even when parsing failed, and
        # the connection is closed even if that final commit raises.
        try:
            conn.commit()
        finally:
            conn.close()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment