Skip to content

Instantly share code, notes, and snippets.

@i026e
Created December 30, 2020 15:54
Show Gist options
  • Save i026e/83ab5d4e35a064d2a1b60476531c825d to your computer and use it in GitHub Desktop.
Save i026e/83ab5d4e35a064d2a1b60476531c825d to your computer and use it in GitHub Desktop.
Remove images and disable links in MS Word files using LibreOffice
#!/usr/bin/env python3
# Remove links and images from Word files
# Requirements:
# pip install unotools Wand Pillow
import io
import atexit
import time
import logging
import subprocess
import mimetypes
import argparse
from pathlib import Path
from typing import Generator, Any, Tuple
from unotools import Socket, connect_with_socket
from unotools.component.writer import Writer
from unotools.unohelper import convert_path_to_url
from wand.image import Image as WandImage
from PIL import Image as PILImage, ImageFile
def rgb_color(r=0, g=0, b=0) -> int:
    """
    Pack three colour channels into a single 24-bit integer (0xRRGGBB).

    Each channel is converted with ``int()`` and masked to its low 8 bits,
    so values outside 0..255 wrap instead of raising.
    """
    red, green, blue = (int(channel) & 0xFF for channel in (r, g, b))
    return (red << 16) | (green << 8) | blue
class LinkMatcher:
    """Predicate object deciding whether a hyperlink URL should be processed."""

    def match(self, link: str):
        """Accept every link: the result is always ``True``, whatever *link* is."""
        return True
class ImageMatcher:
    """Predicate for embedded images plus a helper that decodes raw image bytes."""

    def match(self, pil_image: PILImage.Image, img_name: str):
        """Accept every image: the result is always ``True``."""
        return True

    @staticmethod
    def get_pil_image(data, *, fmt: str = None, mime_type: str = None) -> Tuple[PILImage.Image, str]:
        """
        Open raw image bytes as a PIL image.

        :param data: encoded image bytes.
        :param fmt: optional format hint such as ``"png"`` or ``".JPG"``.
        :param mime_type: optional MIME type; when it maps to a known
            extension, that extension takes precedence over *fmt*.
        :return: ``(image, fmt)`` where *fmt* is the normalised format name,
            ``"png"`` after a successful conversion, or ``None`` when the
            format is unknown and conversion failed.

        Any format other than bmp/jpg/jpeg/png is first converted to PNG
        through Wand. A failed conversion is logged and the original bytes are
        passed to PIL unchanged, so ``PILImage.open`` may still raise.
        """
        if mime_type is not None:
            guessed = mimetypes.guess_extension(mime_type)
            # guess_extension() returns None for unknown MIME types; keep the
            # caller-supplied fmt in that case instead of discarding it.
            if guessed is not None:
                fmt = guessed
        if fmt is not None:
            fmt = fmt.lower().strip(".")
        if fmt not in ("bmp", "jpg", "jpeg", "png"):
            try:
                with WandImage(file=io.BytesIO(data)) as img:
                    data = img.make_blob('png')
                fmt = 'png'
            except Exception as e:
                # Best effort: fall through and let PIL try the original bytes.
                logging.exception(f"Error converting image file {e}")
        return PILImage.open(io.BytesIO(data)), fmt
class DocFileCleaner:
    """
    Remove hyperlinks and images from Word documents through a LibreOffice
    instance reached over a UNO socket connection.
    """

    # Colour assigned to text that used to be a hyperlink.
    BLACK_COLOR = rgb_color(0, 0, 0)
    # Lower-cased file suffix -> LibreOffice export filter name.
    LIBRE_OFFICE_FILTERS = {
        ".doc": "MS Word 97",
        ".docx": "MS Word 2007 XML"
    }

    def __init__(self, office_bin='soffice', host='localhost', port=2002):
        """
        Start the office process (when *office_bin* is truthy) and connect to it.

        :param office_bin: office executable to launch; a falsy value skips
            the launch.
        :param host: host used for both the ``--accept`` option and the connection.
        :param port: port used for both the ``--accept`` option and the connection.
        """
        self._office_bin = office_bin
        self._host = host
        self._port = port
        self._link_matcher = LinkMatcher()
        self._image_matcher = ImageMatcher()
        self._server = self._start_server()
        self.context = connect_with_socket(Socket(self._host, self._port))

    def _start_server(self):
        """
        Launch a headless office process that accepts socket connections.

        :return: the ``subprocess.Popen`` handle, or ``None`` when no office
            binary is configured.
        """
        if not self._office_bin:
            return None
        proc = subprocess.Popen(
            [
                self._office_bin,
                f"--accept=socket,host={self._host},port={self._port};urp;StarOffice.Service",
                "--norestore",
                "--nologo",
                "--nodefault",
                "--headless"
            ],
        )
        # Fixed pause before the caller connects; presumably gives the office
        # process time to start listening -- TODO confirm it is long enough.
        time.sleep(5)
        # Terminate the child process when the interpreter exits.
        atexit.register(proc.terminate)
        return proc

    def _iter_links_recursion(self, obj) -> Generator[Any, None, None]:
        """
        Yield *obj* if it exposes ``HyperLinkURL``, then recurse into every
        element of its enumeration when it offers ``createEnumeration``.
        """
        if hasattr(obj, "HyperLinkURL"):
            yield obj
        if hasattr(obj, "createEnumeration"):
            for sub_obj in obj.createEnumeration():
                yield from self._iter_links_recursion(sub_obj)

    def iter_links(self, doc):
        """
        Yield link-capable elements from the main text, all table cells,
        footnotes and endnotes of *doc*, in that order.
        """
        # process text
        yield from self._iter_links_recursion(doc.getText())
        # process tables
        for table in doc.getTextTables():
            for cell_name in table.getCellNames():
                yield from self._iter_links_recursion(table.getCellByName(cell_name))
        # process footnotes
        for note in doc.getFootnotes():
            yield from self._iter_links_recursion(note)
        # process endnotes
        for note in doc.getEndnotes():
            yield from self._iter_links_recursion(note)

    def remove_links(self, doc, input_path: Path):
        """
        Clear every non-empty hyperlink accepted by the link matcher and reset
        its character styling.

        :param doc: document object to modify in place.
        :param input_path: source path, used only in log messages.
        """
        for elem in self.iter_links(doc):
            if elem.HyperLinkURL and self._link_matcher.match(elem.HyperLinkURL):
                logging.info(f"Found link '{elem.HyperLinkURL}' '{elem.getString()}' in document '{input_path}'")
                elem.HyperLinkURL = ""
                # Re-assigns the element's own text; presumably needed so the
                # cleared link takes effect -- TODO confirm.
                elem.setString(elem.getString())
                elem.setPropertyToDefault("CharStyleName")
                elem.setPropertyToDefault("UnvisitedCharStyleName")
                elem.setPropertyToDefault("VisitedCharStyleName")
                elem.CharColor = self.BLACK_COLOR
                elem.CharUnderline = False

    def remove_images(self, doc, input_path: Path):
        """
        Dispose every graphic object accepted by the image matcher.

        :param doc: document object to modify in place.
        :param input_path: source path, used for the image name and log messages.
        """
        # Snapshot the collection first: dispose() removes the image from the
        # document, and mutating a container while iterating it can skip items.
        for img in list(doc.getGraphicObjects()):
            pil_image, fmt = self._image_matcher.get_pil_image(
                bytes(img.Graphic.getDIB()),
                mime_type=img.Graphic.MimeType,
            )
            img_name = f"{input_path.name}_{img.Name}.{fmt}"
            if self._image_matcher.match(pil_image, img_name):
                logging.info(f"Found matched image '{img_name}' in document '{input_path}'")
                # delete image
                img.dispose()

    def clean(self, input_path: Path, output_path: Path):
        """
        Load *input_path*, remove links and images, and store the result at
        *output_path* using the filter matching the input suffix.

        :raises KeyError: if the input suffix is not in ``LIBRE_OFFICE_FILTERS``.
        """
        extension = input_path.suffix.lower()
        # Looked up before opening the document so an unsupported suffix fails
        # without leaving a document open.
        output_filter = self.LIBRE_OFFICE_FILTERS[extension]
        writer = Writer(self.context, convert_path_to_url(str(input_path)))
        try:
            document = writer.raw
            self.remove_links(document, input_path)
            self.remove_images(document, input_path)
            writer.store_to_url(convert_path_to_url(str(output_path)), 'FilterName', output_filter)
        finally:
            # Always close the document, even when processing or export fails.
            writer.close(True)
if __name__ == "__main__":
    # Command-line entry point: two positional paths, the input document and
    # the destination for the cleaned copy.
    cli = argparse.ArgumentParser(description='Remove links and images from Word files.')
    for positional in ('input', 'output'):
        cli.add_argument(positional, type=Path)
    options = cli.parse_args()
    DocFileCleaner().clean(options.input, options.output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment