Created
December 30, 2020 15:54
-
-
Save i026e/83ab5d4e35a064d2a1b60476531c825d to your computer and use it in GitHub Desktop.
Remove images and disable links in MS Word files using LibreOffice
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Remove links and images from Word files
# Requirements:
# pip install unotools Wand Pillow
import io | |
import atexit | |
import time | |
import logging | |
import subprocess | |
import mimetypes | |
import argparse | |
from pathlib import Path | |
from typing import Generator, Any, Tuple | |
from unotools import Socket, connect_with_socket | |
from unotools.component.writer import Writer | |
from unotools.unohelper import convert_path_to_url | |
from wand.image import Image as WandImage | |
from PIL import Image as PILImage, ImageFile | |
def rgb_color(r=0, g=0, b=0) -> int:
    """
    Pack three colour channels into one ``0xRRGGBB`` integer.

    Each of r, g and b is passed through ``int()`` and reduced to its low
    eight bits, so the intended range for every channel is 0 to 255.
    """
    red, green, blue = (int(channel) & 0xFF for channel in (r, g, b))
    return (red << 16) | (green << 8) | blue
class LinkMatcher:
    """Predicate applied to hyperlink URLs; this default accepts all of them."""

    def match(self, link: str):
        """Return True for any ``link`` value."""
        return True
class ImageMatcher:
    """Decode raw image bytes into PIL images and decide which ones match."""

    # Formats opened by PIL without conversion; anything else (including an
    # unknown format) is first converted to PNG through Wand.
    _NATIVE_FORMATS = ("bmp", "jpg", "jpeg", "png")

    def match(self, pil_image: "PILImage.Image", img_name: str):
        """Return True when the image matches; this default matches everything."""
        return True

    @staticmethod
    def _normalize_format(fmt, mime_type):
        """Return a lower-case, dot-less format name, or None when unknown.

        An extension guessed from ``mime_type`` takes precedence over ``fmt``.
        When the MIME type is not recognised, the caller-supplied ``fmt`` is
        kept instead of being overwritten with None.
        """
        if mime_type is not None:
            guessed = mimetypes.guess_extension(mime_type)
            if guessed is not None:
                fmt = guessed
        if fmt is None:
            return None
        return fmt.lower().strip(".")

    @staticmethod
    def get_pil_image(data, *, fmt: str = None, mime_type: str = None) -> "Tuple[PILImage.Image, str]":
        """Open ``data`` as a PIL image and report the format it was opened as.

        Args:
            data: Encoded image bytes.
            fmt: Optional format name or file extension (case and a leading
                dot are ignored).
            mime_type: Optional MIME type; when recognised it overrides ``fmt``.

        Returns:
            A ``(image, fmt)`` tuple. ``fmt`` is ``'png'`` whenever the data
            had to be converted through Wand.

        Raises:
            Whatever ``PILImage.open`` raises if the (possibly unconverted)
            data cannot be opened.
        """
        fmt = ImageMatcher._normalize_format(fmt, mime_type)
        if fmt not in ImageMatcher._NATIVE_FORMATS:
            try:
                with WandImage(file=io.BytesIO(data)) as img:
                    data = img.make_blob('png')
            except Exception as e:
                # Best effort: keep the original bytes and let PIL try them.
                logging.exception("Error converting image file %s", e)
            else:
                fmt = 'png'
        return PILImage.open(io.BytesIO(data)), fmt
class DocFileCleaner:
    """Remove hyperlinks and images from Word documents through LibreOffice.

    Constructing an instance launches a headless ``soffice`` process (unless
    ``office_bin`` is falsy) and opens a UNO socket connection to it.
    """

    BLACK_COLOR = rgb_color(0, 0, 0)
    # Maps a lower-case file suffix to the LibreOffice export filter name.
    LIBRE_OFFICE_FILTERS = {
        ".doc": "MS Word 97",
        ".docx": "MS Word 2007 XML"
    }
    # Seconds to wait after launching soffice before connecting to it.
    SERVER_STARTUP_DELAY = 5

    def __init__(self, office_bin='soffice', host='localhost', port=2002):
        """Start the office server (if configured) and connect to it.

        Args:
            office_bin: Executable used to launch LibreOffice; a falsy value
                skips the launch, and the connection is still attempted.
            host: Host name the office process listens on.
            port: TCP port the office process listens on.
        """
        self._office_bin = office_bin
        self._host = host
        self._port = port
        self._link_matcher = LinkMatcher()
        self._image_matcher = ImageMatcher()
        self._server = self._start_server()
        self.context = connect_with_socket(Socket(self._host, self._port))

    def _start_server(self):
        """Launch headless LibreOffice listening on the configured socket.

        Returns:
            The ``subprocess.Popen`` handle, or None when no binary is set.
        """
        if not self._office_bin:
            return None
        proc = subprocess.Popen(
            [
                self._office_bin,
                f"--accept=socket,host={self._host},port={self._port};urp;StarOffice.Service",
                "--norestore",
                "--nologo",
                "--nodefault",
                "--headless"
            ],
        )
        # Register cleanup before waiting so an interrupted wait does not
        # leave the child process running.
        atexit.register(proc.terminate)
        # Fixed grace period for the server to begin accepting connections.
        time.sleep(self.SERVER_STARTUP_DELAY)
        return proc

    def _iter_links_recursion(self, obj) -> Generator[Any, None, None]:
        """Yield ``obj`` and every nested element exposing ``HyperLinkURL``."""
        if hasattr(obj, "HyperLinkURL"):
            yield obj
        if hasattr(obj, "createEnumeration"):
            # NOTE(review): children are presumably paragraphs and their text
            # portions; the service type is not checked here.
            for sub_obj in obj.createEnumeration():
                yield from self._iter_links_recursion(sub_obj)

    def iter_links(self, doc):
        """Yield link-capable elements from body text, tables and notes."""
        # process text
        yield from self._iter_links_recursion(doc.getText())
        # process tables
        for table in doc.getTextTables():
            for cell_name in table.getCellNames():
                yield from self._iter_links_recursion(table.getCellByName(cell_name))
        # process footnotes
        for note in doc.getFootnotes():
            yield from self._iter_links_recursion(note)
        # process endnotes
        for note in doc.getEndnotes():
            yield from self._iter_links_recursion(note)

    def remove_links(self, doc, input_path: Path):
        """Clear every matching hyperlink in ``doc`` and reset its styling.

        Args:
            doc: The opened document object.
            input_path: Source path, used only in log messages.
        """
        for elem in self.iter_links(doc):
            url = elem.HyperLinkURL
            if not url or not self._link_matcher.match(url):
                continue
            logging.info("Found link '%s' '%s' in document '%s'", url, elem.getString(), input_path)
            elem.HyperLinkURL = ""
            # NOTE(review): re-setting the same text presumably drops the
            # link field while keeping the visible text — confirm.
            elem.setString(elem.getString())
            elem.setPropertyToDefault("CharStyleName")
            elem.setPropertyToDefault("UnvisitedCharStyleName")
            elem.setPropertyToDefault("VisitedCharStyleName")
            elem.CharColor = self.BLACK_COLOR
            elem.CharUnderline = False

    def remove_images(self, doc, input_path: Path):
        """Dispose of every graphic object in ``doc`` that the matcher accepts.

        Args:
            doc: The opened document object.
            input_path: Source path, used for image names and log messages.
        """
        # Iterate over a snapshot: disposing an element while walking the
        # live collection could cause later elements to be skipped.
        for img in list(doc.getGraphicObjects()):
            graphic = img.Graphic
            pil_image, fmt = self._image_matcher.get_pil_image(bytes(graphic.getDIB()), mime_type=graphic.MimeType)
            img_name = f"{input_path.name}_{img.Name}.{fmt}"
            if self._image_matcher.match(pil_image, img_name):
                logging.info("Found matched image '%s' in document '%s'", img_name, input_path)
                # delete image
                img.dispose()

    def clean(self, input_path: Path, output_path: Path):
        """Open ``input_path``, strip links and images, save to ``output_path``.

        The export filter follows the output suffix when it is a supported
        one, otherwise the input's filter is used.

        Raises:
            KeyError: If the input suffix is not in ``LIBRE_OFFICE_FILTERS``.
        """
        # Looked up before opening anything so unsupported inputs fail early.
        input_filter = self.LIBRE_OFFICE_FILTERS[input_path.suffix.lower()]
        output_filter = self.LIBRE_OFFICE_FILTERS.get(output_path.suffix.lower(), input_filter)
        writer = Writer(self.context, convert_path_to_url(str(input_path)))
        try:
            document = writer.raw
            self.remove_links(document, input_path)
            self.remove_images(document, input_path)
            writer.store_to_url(convert_path_to_url(str(output_path)), 'FilterName', output_filter)
        finally:
            # Always release the document, even when cleaning or saving fails.
            writer.close(True)
def main():
    """Parse the command line and clean a single Word file."""
    parser = argparse.ArgumentParser(description='Remove links and images from Word files.')
    parser.add_argument('input', type=Path)
    parser.add_argument('output', type=Path)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='show INFO-level log messages')
    args = parser.parse_args()
    if args.verbose:
        # Without this the INFO messages emitted while cleaning are dropped.
        logging.basicConfig(level=logging.INFO)
    doc_file_cleaner = DocFileCleaner()
    doc_file_cleaner.clean(args.input, args.output)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment