Last active
November 7, 2023 09:19
-
-
Save phillipkent/fcd8276d3984089cddd2f72a52fd00eb to your computer and use it in GitHub Desktop.
Python code using the python-docx module to convert a DOCX file to another DOCX file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Converts a docx file with tables and images to a new docx file
# The new file is based on a 'stub' document which contains preamble text and styles
#
# Requires the Python module 'python-docx' <https://python-docx.readthedocs.io>
# Written for Python 3
#
# Source documents are taken from the directory 'source' and converted documents are saved
# to the directory 'converted'
#
# Two types of source documents are handled: 'Fiscal Guide' or 'Economics Regime'. Each one
# has its own stub document and different conversion options.
# ** The stub documents are not included here! The code is offered as an example for adaptation
# ** to your own uses.
#
# Thanks to David Ssali for his code posted at https://medium.com/@dvdssali/docx-to-html-1374eb6491a1
#
import datetime
import re
import xml.etree.ElementTree as ET
import zipfile
from copy import copy
from copy import deepcopy
from io import BytesIO
from os import listdir
from os.path import isfile, join

from docx import Document
from docx.document import Document as _Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.shared import RGBColor
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph


def get_docx_text(filepath):
    """
    Take the path of a docx file as argument, return the text in array of strings.

    The archive member 'word/document.xml' is parsed directly. Every paragraph
    element (w:p) that contains at least one non-empty text node (w:t)
    contributes one string: the concatenation of those text nodes in document
    order. Paragraphs without any text are omitted from the result.
    """
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    # The context manager closes the archive even when reading the member fails.
    with zipfile.ZipFile(filepath) as document:
        xml_content = document.read('word/document.xml')
    tree = ET.fromstring(xml_content)
    paragraphs = []
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return paragraphs
def iter_block_items(parent):
    """
    Yield every paragraph and table that is a direct child of *parent*, in
    document order, wrapped as a Paragraph or a Table object respectively.

    *parent* must be either a Document (its body is walked) or a table _Cell
    (its w:tc element is walked); any other type raises ValueError. Children
    that are neither paragraphs nor tables are skipped.
    """
    if not isinstance(parent, (_Document, _Cell)):
        raise ValueError("ERROR: something not right")
    container = parent.element.body if isinstance(parent, _Document) else parent._tc
    # Pair each XML element class with the proxy class that wraps it.
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
    for element in container.iterchildren():
        for element_class, wrapper in wrappers:
            if isinstance(element, element_class):
                yield wrapper(element, parent)
                break
def get_heading_type(block):
    """Return the name of the style applied to *block*."""
    style = block.style
    return style.name
def get_image_Ids(paragraph):
    """
    Return the relationship ids of the images embedded in *paragraph*.

    The paragraph's XML (``paragraph._p.xml``) is searched for ``a:blip``
    elements, first inside every ``wp:inline`` drawing and then inside every
    ``wp:anchor`` drawing. The value of each blip's ``r:embed`` attribute is
    collected in that order. Blips without an ``r:embed`` attribute (for
    example ones that only carry ``r:link``) are skipped instead of raising
    KeyError.
    """
    namespace = {
        'a': "http://schemas.openxmlformats.org/drawingml/2006/main",
        'r': "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        'wp': "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"}
    embed_attr = '{{{0}}}embed'.format(namespace['r'])
    root = ET.fromstring(paragraph._p.xml)
    ids = []
    # Inline drawings are reported before anchored (floating) drawings.
    for drawing_path in ('.//wp:inline', './/wp:anchor'):
        for drawing in root.findall(drawing_path, namespace):
            for blip in drawing.findall('.//a:blip', namespace):
                embed_id = blip.attrib.get(embed_attr)
                if embed_id is not None:
                    ids.append(embed_id)
    return ids
def _replace_placeholder(doc, placeholder, replacement):
    """Set the text of the first paragraph of *doc* whose text equals *placeholder*."""
    for paragraph in doc.paragraphs:
        if paragraph.text == placeholder:
            paragraph.text = replacement
            return
    raise IndexError(
        "placeholder paragraph '{0}' not found in the stub document".format(placeholder))


def _copy_runs(source_paragraph, target_paragraph, copy_style=False):
    """Append a copy of every run of *source_paragraph* to *target_paragraph*.

    Text, bold, italic, underline and RGB font colour are copied; the run
    style is copied only when *copy_style* is true.
    """
    for run in source_paragraph.runs:
        if copy_style:
            new_run = target_paragraph.add_run(run.text, style=run.style)
        else:
            new_run = target_paragraph.add_run(run.text)
        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        new_run.font.color.rgb = run.font.color.rgb


def _inline_picture_sizes(source_doc):
    """Map each embedded-image relationship id to (width, height) of the first inline shape using it."""
    sizes = {}
    for shape in source_doc.inline_shapes:
        try:
            embed_id = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
        except AttributeError:
            # Inline shape without a picture/blip: nothing to match an image id against.
            continue
        # setdefault keeps the first match, like the original "[...][0]" lookup.
        sizes.setdefault(embed_id, (shape.width, shape.height))
    return sizes


def _apply_size(inline_shape, size):
    """Give *inline_shape* the (width, height) in *size*; do nothing when *size* is None."""
    if size is not None:
        inline_shape.width, inline_shape.height = size


def _append_table_element(doc, tbl):
    """Move the table element *tbl* to the end of *doc*'s body content."""
    body = doc.element.body
    sect_pr = body.find(
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sectPr')
    if sect_pr is None:
        body.append(tbl)
    else:
        # Body content must stay in front of the section properties element.
        sect_pr.addprevious(tbl)


def _copy_body_paragraph(doc, source_doc, block, picture_sizes):
    """Append the body paragraph *block* of *source_doc* to *doc*, as pictures or as text."""
    image_ids = get_image_Ids(block)
    if image_ids:
        # NOTE: a paragraph that holds images is copied as images only; any
        # text in the same paragraph is not copied.
        for image_id in image_ids:
            image_part = source_doc.part.related_parts[image_id]
            inline_shape = doc.add_picture(BytesIO(image_part.blob))
            # Reuse the size of the source inline shape when one exists
            # (anchored images have none and keep their native size).
            _apply_size(inline_shape, picture_sizes.get(image_id))
            # the image will be centered
            doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
        return
    # Process the paragraph as text; 'List Paragraph' styles become bullets.
    if re.match(r"List\sParagraph", get_heading_type(block)):
        new_paragraph = doc.add_paragraph('', style='List Bullet')
    else:
        new_paragraph = doc.add_paragraph('', style=block.style)
    new_paragraph.alignment = block.alignment
    _copy_runs(block, new_paragraph, copy_style=True)


def _rebuild_three_column_table(doc, source_doc, block, picture_sizes):
    """Append to *doc* a new 'Table Grid' table rebuilt from the 3-column table *block*.

    Columns 1 and 2 receive the runs of the first paragraph of each source
    cell. Column 3 is copied paragraph by paragraph, including its images.
    (Modifying or adding images inside a copied table misbehaves, hence the
    rebuild.)
    """
    new_table = doc.add_table(rows=len(block.rows), cols=len(block.columns))
    new_table.style = doc.styles['Table Grid']
    # Columns 1 and 2: text of the first paragraph only.
    for column_index in (0, 1):
        target_cells = new_table.columns[column_index].cells
        for cell_index, cell in enumerate(block.columns[column_index].cells):
            _copy_runs(cell.paragraphs[0], target_cells[cell_index].paragraphs[0])
    # Column 3: text paragraphs and images.
    target_cells = new_table.columns[2].cells
    for cell_index, cell in enumerate(block.columns[2].cells):
        target_cell = target_cells[cell_index]
        first_para = True
        for cblock in iter_block_items(cell):
            if not isinstance(cblock, Paragraph):
                # Nested tables have no paragraph XML to scan and are not copied.
                continue
            image_ids = get_image_Ids(cblock)
            if image_ids:
                for image_id in image_ids:
                    image_part = source_doc.part.related_parts[image_id]
                    image_paragraph = target_cell.add_paragraph()
                    image_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
                    inline_shape = image_paragraph.add_run().add_picture(
                        BytesIO(image_part.blob))
                    _apply_size(inline_shape, picture_sizes.get(image_id))
            else:
                # The first text paragraph reuses the paragraph every new cell
                # already contains; later ones are appended.
                if first_para:
                    new_paragraph = target_cell.paragraphs[0]
                    first_para = False
                else:
                    new_paragraph = target_cell.add_paragraph()
                new_paragraph.alignment = cblock.alignment
                _copy_runs(cblock, new_paragraph)


def convertPRL(sourceDocx, stubDocx):
    """
    Convert './source/<sourceDocx>' and save the result as './converted/<sourceDocx>'.

    The converted document starts from the stub document *stubDocx* (preamble
    content and styles). The title-page placeholders 'TITLE_REGIME' and
    'SUB_REGIME_TYPE' (and 'SUB_VERSION' for an Economics Regime stub) are
    replaced with text taken from the source document. Every paragraph and
    table that follows the start heading in the source document is then
    appended to the new document.

    sourceDocx -- file name of the source document inside './source/'
    stubDocx   -- path of the stub document; its name must contain
                  'EconomicsRegime' or 'FiscalGuide'

    Raises ValueError for an unrecognised stub name and IndexError when a
    placeholder paragraph is missing from the stub or the source document has
    too few text paragraphs.
    """
    is_economics = 'EconomicsRegime' in stubDocx
    if is_economics:
        startHeading = 'Economics Regime'
    elif 'FiscalGuide' in stubDocx:
        startHeading = 'Fiscal Terms'
    else:
        raise ValueError(
            "stub document name must contain 'EconomicsRegime' or 'FiscalGuide': "
            "{0}".format(stubDocx))
    source_path = './source/' + sourceDocx
    # The converted document begins using preamble content and styles from the 'stub' document
    doc = Document(stubDocx)
    source_doc = Document(source_path)
    # Set the new Core Properties
    doc.core_properties.author = 'Documentation team'
    doc.core_properties.last_modified_by = 'Automatic conversion script'
    doc.core_properties.created = source_doc.core_properties.created
    doc.core_properties.revision = 1 + source_doc.core_properties.revision
    # Naive UTC timestamp, obtained without the deprecated datetime.utcnow().
    doc.core_properties.modified = datetime.datetime.now(
        datetime.timezone.utc).replace(tzinfo=None)
    # Title page: fill the placeholders from the text of the source document
    # ('SUB_VERSION' exists only in the Economics Regime stub).
    sourceDocText = get_docx_text(source_path)
    _replace_placeholder(doc, 'TITLE_REGIME', sourceDocText[1])
    _replace_placeholder(doc, 'SUB_REGIME_TYPE', sourceDocText[2])
    if is_economics:
        _replace_placeholder(doc, 'SUB_VERSION', sourceDocText[5])
    # Collected once, before any table element is moved out of source_doc.
    picture_sizes = _inline_picture_sizes(source_doc)
    # Add the contents of source doc to the new doc.
    # started is False until after startHeading is found in source_doc
    started = False
    for block in iter_block_items(source_doc):
        if not started:
            if isinstance(block, Paragraph) and block.text == startHeading:
                started = True
            continue
        if isinstance(block, Paragraph):
            _copy_body_paragraph(doc, source_doc, block, picture_sizes)
        elif isinstance(block, Table):
            # For Economics documents: the 3-column tables with images need special handling
            if is_economics and len(block.columns) == 3:
                _rebuild_three_column_table(doc, source_doc, block, picture_sizes)
            else:
                # FOR ALL OTHER TABLES: move the source table to the end of the new document
                _append_table_element(doc, block._tbl)
                block.style = doc.styles['Table Grid']
    doc.save('./converted/' + sourceDocx)
if __name__ == '__main__':
    sourceDir = './source/'
    # Batch convert all the Word documents in directory 'source'.
    # Only regular files ending in '.docx' are converted; Word's '~$' lock
    # files and any other stray files would make Document() fail.
    sourceFilelist = sorted(
        f for f in listdir(sourceDir)
        if isfile(join(sourceDir, f))
        and f.lower().endswith('.docx')
        and not f.startswith('~$')
    )
    for file in sourceFilelist:
        # The stub document is chosen from the source file name.
        if 'Economics' in file:
            convertPRL(file, 'STUB-EconomicsRegime.docx')
        else:
            convertPRL(file, 'STUB-FiscalGuide.docx')
        print('Converted: {0}'.format(file))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment