Last active
August 29, 2015 13:56
-
-
Save ishahid/9245232 to your computer and use it in GitHub Desktop.
Utility to replace variables enclosed in square brackets with a given value in Microsoft Word docx files. Based on the following blog post: http://virantha.com/2013/08/16/reading-and-writing-microsoft-word-docx-files-with-python/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import shutil
import tempfile
import zipfile

from lxml import etree
class docx(object):
    """Fill ``[placeholder]`` variables in a Microsoft Word ``.docx`` file.

    The main document part (``word/document.xml``) is parsed when the object
    is created.  A placeholder whose text is spread over several ``w:t``
    elements is merged into the element that holds its ``[`` so that
    :meth:`replace` can match it as one string.  :meth:`save_as` writes a copy
    of the original archive with the modified document part.
    """

    # Namespace URI used by WordprocessingML elements such as ``w:t``.
    _WORD_SCHEMA = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    # Archive member that holds the main document body.
    _DOCUMENT_PART = 'word/document.xml'

    def __init__(self, docx_filename):
        """Load *docx_filename* and normalise split placeholders.

        Raises whatever ``zipfile`` raises for a missing or invalid archive
        and whatever ``etree.fromstring`` raises for malformed XML.
        """
        self.filename = docx_filename
        # Let ZipFile open the path itself: it always reads in binary mode,
        # which a plain ``open(filename)`` handle does not do on Python 3.
        with zipfile.ZipFile(self.filename) as archive:
            self.xml_content = archive.read(self._DOCUMENT_PART)
        # Attribute kept because earlier versions exposed it; the archive
        # itself is closed once loading finishes.
        self.zipfile = archive
        self.xml_tree = self._get_xml_tree(self.xml_content)
        self._join_tags(self.xml_tree)

    def _get_xml_tree(self, xml_string):
        """Parse *xml_string* and return the root element."""
        return etree.fromstring(xml_string)

    def _check_element_is(self, element, type_char):
        """Return True if *element* is the WordprocessingML tag *type_char*.

        *type_char* is the local name without prefix, e.g. ``'t'`` for ``w:t``.
        """
        return element.tag == '{%s}%s' % (self._WORD_SCHEMA, type_char)

    def _itertext(self, tree):
        """Yield ``(node, text)`` for every ``w:t`` element below *tree*.

        An element without text yields ``''`` rather than ``None`` so callers
        can always treat the second item as a string.
        """
        for node in tree.iter('{%s}t' % self._WORD_SCHEMA):
            yield (node, node.text or '')

    def _join_tags(self, tree):
        """Merge placeholders that are split across several text nodes.

        When ``[`` and its matching ``]`` sit in different ``w:t`` elements,
        the characters in between are appended to the element containing the
        ``[`` and removed from the elements they came from.  Nothing is
        changed for a ``[`` that is never closed, and a ``]`` without a
        pending ``[`` is treated as ordinary text.
        """
        open_node = None     # node holding the currently unmatched '['
        opened_here = False  # True while still scanning that same node
        carried = []         # characters seen in later nodes since the '['
        spanned = []         # later nodes lying wholly inside the placeholder

        for node, text in self._itertext(tree):
            for i, c in enumerate(text):
                if c == '[':
                    # A new '[' always restarts the search from this node.
                    open_node, opened_here = node, True
                    carried, spanned = [], []
                elif open_node is None:
                    # Ordinary text outside any placeholder, or a stray ']'.
                    continue
                elif c == ']':
                    if not opened_here:
                        # Move everything seen since the '[' into its node ...
                        open_node.text += ''.join(carried) + ']'
                        # ... blank the nodes that were entirely inside ...
                        for inner in spanned:
                            inner.text = ''
                        # ... and keep only what follows ']' in this node.
                        node.text = text[i + 1:]
                    open_node, opened_here = None, False
                elif not opened_here:
                    carried.append(c)
            if open_node is not None and not opened_here:
                # Defer blanking until the matching ']' is actually found, so
                # an unmatched '[' cannot erase the rest of the document.
                spanned.append(node)
            opened_here = False

    def replace(self, variable, value):
        """Replace every ``[variable]`` placeholder with *value*.

        Matching ignores case.  *value* is inserted literally: backslashes
        and group references in it are not interpreted.
        """
        pattern = re.compile(re.escape('[%s]' % variable), re.IGNORECASE)
        for node, text in self._itertext(self.xml_tree):
            # A callable replacement bypasses re's template escape handling.
            new_text, count = pattern.subn(lambda match: value, text)
            if count:
                node.text = new_text

    def save_as(self, output_filename):
        """Write the modified document to *output_filename*.

        Every member of the original archive is copied unchanged except the
        main document part, which is replaced by the current XML tree.
        """
        xml_bytes = etree.tostring(self.xml_tree, xml_declaration=True,
                                   encoding='UTF-8', standalone=True)
        # Read all members before opening the output so that saving over the
        # source file does not truncate it mid-copy.
        with zipfile.ZipFile(self.filename) as source:
            members = [(info, source.read(info.filename))
                       for info in source.infolist()]
        with zipfile.ZipFile(output_filename, 'w') as target:
            for info, data in members:
                if info.filename == self._DOCUMENT_PART:
                    data = xml_bytes
                # Passing the ZipInfo reuses the member's original name,
                # timestamp and compression method.
                target.writestr(info, data)
if __name__ == "__main__":
    # Demo run: fill two placeholders in test.docx and save the result
    # under a new name next to it.
    doc = docx('test.docx')
    for placeholder, replacement in (
        ('variable', 'value'),
        ('another_variable', 'another value'),
    ):
        doc.replace(placeholder, replacement)
    doc.save_as('test_result.docx')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Search for the required text the same way I am searching/replacing variables with values in the function replace.