i026e · December 30, 2020 15:54
diff --git a/doc_cleaner.py b/doc_cleaner.py
 #!/usr/bin/env python3

 # Remove links and images from Word files
 # Requirements:
 # pip install unotools Wand Pillow

 import io
 import atexit
 import time
 import logging
 import subprocess
 import mimetypes
 import argparse

 from pathlib import Path
 from typing import Generator, Any, Tuple


 from unotools import Socket, connect_with_socket
 from unotools.component.writer import Writer
 from unotools.unohelper import convert_path_to_url

 from wand.image import Image as WandImage
 from PIL import Image as PILImage, ImageFile


 def rgb_color(r=0, g=0, b=0) -> int:
    """
    Pack three colour channels into one integer laid out as 0xRRGGBB.

    Each of r, g and b is converted with int() and only its lowest 8 bits
    are kept, so the values are expected to be numbers from 0 to 255.
    """
    red = int(r) & 0xFF
    green = int(g) & 0xFF
    blue = int(b) & 0xFF
    return (red << 16) | (green << 8) | blue



 class LinkMatcher:
    """Predicate object for hyperlinks; this default accepts every link."""

    def match(self, link: str):
        """Return True for any link, i.e. every hyperlink is a match."""
        return True
    
    

 class ImageMatcher:
    """
    Predicate object for images plus a helper that decodes raw image bytes.

    The default match() accepts every image.
    """

    def match(self, pil_image: PILImage.Image, img_name: str):
        """Return True for any image, i.e. every image is a match."""
        return True

    @staticmethod
    def get_pil_image(data, *, fmt: str = None, mime_type: str = None) -> Tuple[PILImage.Image, str]:
        """
        Open image bytes with PIL and return the pair (image, format).

        fmt is an optional format name or extension; case and leading or
        trailing dots are ignored. mime_type, when it maps to a known
        extension, replaces fmt.

        Data whose format is not bmp/jpg/jpeg/png is first converted to PNG
        with Wand. If that conversion fails the error is logged and the
        original bytes are handed to PIL unchanged.
        """
        if mime_type is not None:
            guessed = mimetypes.guess_extension(mime_type)
            # guess_extension() returns None for an unknown MIME type; keep
            # the caller's fmt hint in that case instead of discarding it.
            if guessed is not None:
                fmt = guessed

        if fmt is not None:
            fmt = fmt.lower().strip(".")

        if fmt not in ("bmp", "jpg", "jpeg", "png"):
            try:
                with WandImage(file=io.BytesIO(data)) as img:
                    data = img.make_blob('png')
                    fmt = 'png'
            except Exception as e:
                # Best effort: log and let PIL try the unconverted bytes.
                logging.exception(f"Error converting image file {e}")

        return PILImage.open(io.BytesIO(data)), fmt



 class DocFileCleaner:
    """
    Removes hyperlinks and images from Word documents using an office
    process that is reached over a socket connection.
    """

    # Colour applied to the text of a removed link.
    BLACK_COLOR = rgb_color(0, 0, 0)
    # Filter name passed when storing, keyed by the lower-cased input suffix.
    LIBRE_OFFICE_FILTERS = {
        ".doc": "MS Word 97",
        ".docx": "MS Word 2007 XML"
    }

    def __init__(self, office_bin='soffice', host='localhost', port=2002):
        """
        Start the office process (unless office_bin is falsy) and connect to it.

        office_bin is the executable that is launched; host and port are used
        for both the launched listener and the client connection.
        """
        self._office_bin = office_bin
        self._host = host
        self._port = port

        self._link_matcher = LinkMatcher()
        self._image_matcher = ImageMatcher()

        self._server = self._start_server()

        self.context = connect_with_socket(Socket(self._host, self._port))

    def _start_server(self):
        """
        Launch the office binary listening on host:port and return the process.

        Returns None without launching anything when office_bin is falsy.
        """
        if not self._office_bin:
            return None

        proc = subprocess.Popen(
            [
                self._office_bin,
                f"--accept=socket,host={self._host},port={self._port};urp;StarOffice.Service",
                "--norestore",
                "--nologo",
                "--nodefault",
                "--headless"
            ],
        )
        # NOTE(review): fixed delay, presumably to give the process time to
        # open its socket before __init__ connects - confirm it is sufficient.
        time.sleep(5)
        # Terminate the launched process when the interpreter exits.
        atexit.register(proc.terminate)
        return proc

    def _iter_links_recursion(self, obj) -> Generator[Any, None, None]:
        """
        Yield obj if it has a HyperLinkURL attribute, then recurse into every
        element of obj.createEnumeration() when obj offers that method.
        """
        if hasattr(obj, "HyperLinkURL"):
            yield obj

        if hasattr(obj, "createEnumeration"):
            for sub_obj in obj.createEnumeration():
                yield from self._iter_links_recursion(sub_obj)

    def iter_links(self, doc):
        """
        Yield link-capable elements from the document's main text, table
        cells, footnotes and endnotes, in that order.
        """
        # process text
        yield from self._iter_links_recursion(doc.getText())

        # process tables
        for table in doc.getTextTables():
            for cell_name in table.getCellNames():
                yield from self._iter_links_recursion(table.getCellByName(cell_name))

        # process footnotes
        for note in doc.getFootnotes():
            yield from self._iter_links_recursion(note)

        # process endnotes
        for note in doc.getEndnotes():
            yield from self._iter_links_recursion(note)

    def remove_links(self, doc, input_path: Path):
        """
        Clear every matching hyperlink in doc and reset its link formatting.

        input_path is only used in the log message.
        """
        for elem in self.iter_links(doc):
            if elem.HyperLinkURL and self._link_matcher.match(elem.HyperLinkURL):
                logging.info(f"Found link '{elem.HyperLinkURL}' '{elem.getString()}' in document '{input_path}'")

                elem.HyperLinkURL = ""
                # NOTE(review): re-setting the unchanged text presumably makes
                # the office drop the remaining link attributes - confirm.
                elem.setString(elem.getString())

                elem.setPropertyToDefault("CharStyleName")
                elem.setPropertyToDefault("UnvisitedCharStyleName")
                elem.setPropertyToDefault("VisitedCharStyleName")

                # Make the former link look like ordinary text.
                elem.CharColor = self.BLACK_COLOR
                elem.CharUnderline = False

    def remove_images(self, doc, input_path: Path):
        """
        Dispose every graphic object in doc that the image matcher accepts.

        input_path is used to build the image name given to the matcher and
        in the log message.
        """
        # Snapshot the collection first: disposing an object while the live
        # collection is still being iterated could disturb the iteration.
        for img in list(doc.getGraphicObjects()):

            pil_image, fmt = self._image_matcher.get_pil_image(bytes(img.Graphic.getDIB()), mime_type=img.Graphic.MimeType)
            img_name = f"{input_path.name}_{img.Name}.{fmt}"
            if self._image_matcher.match(pil_image, img_name):
                logging.info(f"Found matched image '{img_name}' in document '{input_path}'")

                # delete image
                img.dispose()

    def clean(self, input_path: Path, output_path: Path):
        """
        Load input_path, remove links and images, and store to output_path.

        The output filter is chosen from the suffix of input_path.
        Raises KeyError when that suffix is not in LIBRE_OFFICE_FILTERS.
        """
        extension = input_path.suffix.lower()
        try:
            # NOTE(review): the filter follows the input suffix, not the
            # output suffix - confirm this is intended.
            output_filter = self.LIBRE_OFFICE_FILTERS[extension]
        except KeyError:
            supported = ", ".join(sorted(self.LIBRE_OFFICE_FILTERS))
            raise KeyError(
                f"Unsupported file extension '{extension}' (supported: {supported})"
            ) from None

        writer = Writer(self.context, convert_path_to_url(str(input_path)))
        try:
            document = writer.raw

            self.remove_links(document, input_path)
            self.remove_images(document, input_path)

            writer.store_to_url(convert_path_to_url(str(output_path)), 'FilterName', output_filter)
        finally:
            # Close even on failure so the document is not left open.
            writer.close(True)


 if __name__ == "__main__":
    # The cleaner reports what it finds through logging.info(); the root
    # logger only emits WARNING and above unless configured, so enable INFO.
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(description='Remove links and images from Word files.')
    parser.add_argument('input', type=Path, help='Word file to clean (.doc or .docx)')
    parser.add_argument('output', type=Path, help='path the cleaned file is stored to')

    args = parser.parse_args()

    doc_file_cleaner = DocFileCleaner()
    doc_file_cleaner.clean(args.input, args.output)
	#!/usr/bin/env python3

	# Remove links and images from Word files
	# Requirements:
	# pip install unotools Wand Pillow

	import io
	import atexit
	import time
	import logging
	import subprocess
	import mimetypes
	import argparse

	from pathlib import Path
	from typing import Generator, Any, Tuple


	from unotools import Socket, connect_with_socket
	from unotools.component.writer import Writer
	from unotools.unohelper import convert_path_to_url

	from wand.image import Image as WandImage
	from PIL import Image as PILImage, ImageFile


	def rgb_color(r=0, g=0, b=0) -> int:
	    """
	    Pack three colour channels into one integer laid out as 0xRRGGBB.

	    Each of r, g and b is converted with int() and only its lowest 8 bits
	    are kept, so the values are expected to be numbers from 0 to 255.
	    """
	    red = int(r) & 0xFF
	    green = int(g) & 0xFF
	    blue = int(b) & 0xFF
	    return (red << 16) | (green << 8) | blue



	class LinkMatcher:
	    """Predicate object for hyperlinks; this default accepts every link."""

	    def match(self, link: str):
	        """Return True for any link, i.e. every hyperlink is a match."""
	        return True



	class ImageMatcher:
	    """
	    Predicate object for images plus a helper that decodes raw image bytes.

	    The default match() accepts every image.
	    """

	    def match(self, pil_image: PILImage.Image, img_name: str):
	        """Return True for any image, i.e. every image is a match."""
	        return True

	    @staticmethod
	    def get_pil_image(data, *, fmt: str = None, mime_type: str = None) -> Tuple[PILImage.Image, str]:
	        """
	        Open image bytes with PIL and return the pair (image, format).

	        fmt is an optional format name or extension; case and leading or
	        trailing dots are ignored. mime_type, when it maps to a known
	        extension, replaces fmt.

	        Data whose format is not bmp/jpg/jpeg/png is first converted to PNG
	        with Wand. If that conversion fails the error is logged and the
	        original bytes are handed to PIL unchanged.
	        """
	        if mime_type is not None:
	            guessed = mimetypes.guess_extension(mime_type)
	            # guess_extension() returns None for an unknown MIME type; keep
	            # the caller's fmt hint in that case instead of discarding it.
	            if guessed is not None:
	                fmt = guessed

	        if fmt is not None:
	            fmt = fmt.lower().strip(".")

	        if fmt not in ("bmp", "jpg", "jpeg", "png"):
	            try:
	                with WandImage(file=io.BytesIO(data)) as img:
	                    data = img.make_blob('png')
	                    fmt = 'png'
	            except Exception as e:
	                # Best effort: log and let PIL try the unconverted bytes.
	                logging.exception(f"Error converting image file {e}")

	        return PILImage.open(io.BytesIO(data)), fmt



	class DocFileCleaner:
	    """
	    Removes hyperlinks and images from Word documents using an office
	    process that is reached over a socket connection.
	    """

	    # Colour applied to the text of a removed link.
	    BLACK_COLOR = rgb_color(0, 0, 0)
	    # Filter name passed when storing, keyed by the lower-cased input suffix.
	    LIBRE_OFFICE_FILTERS = {
	        ".doc": "MS Word 97",
	        ".docx": "MS Word 2007 XML"
	    }

	    def __init__(self, office_bin='soffice', host='localhost', port=2002):
	        """
	        Start the office process (unless office_bin is falsy) and connect to it.

	        office_bin is the executable that is launched; host and port are used
	        for both the launched listener and the client connection.
	        """
	        self._office_bin = office_bin
	        self._host = host
	        self._port = port

	        self._link_matcher = LinkMatcher()
	        self._image_matcher = ImageMatcher()

	        self._server = self._start_server()

	        self.context = connect_with_socket(Socket(self._host, self._port))

	    def _start_server(self):
	        """
	        Launch the office binary listening on host:port and return the process.

	        Returns None without launching anything when office_bin is falsy.
	        """
	        if not self._office_bin:
	            return None

	        proc = subprocess.Popen(
	            [
	                self._office_bin,
	                f"--accept=socket,host={self._host},port={self._port};urp;StarOffice.Service",
	                "--norestore",
	                "--nologo",
	                "--nodefault",
	                "--headless"
	            ],
	        )
	        # NOTE(review): fixed delay, presumably to give the process time to
	        # open its socket before __init__ connects - confirm it is sufficient.
	        time.sleep(5)
	        # Terminate the launched process when the interpreter exits.
	        atexit.register(proc.terminate)
	        return proc

	    def _iter_links_recursion(self, obj) -> Generator[Any, None, None]:
	        """
	        Yield obj if it has a HyperLinkURL attribute, then recurse into every
	        element of obj.createEnumeration() when obj offers that method.
	        """
	        if hasattr(obj, "HyperLinkURL"):
	            yield obj

	        if hasattr(obj, "createEnumeration"):
	            for sub_obj in obj.createEnumeration():
	                yield from self._iter_links_recursion(sub_obj)

	    def iter_links(self, doc):
	        """
	        Yield link-capable elements from the document's main text, table
	        cells, footnotes and endnotes, in that order.
	        """
	        # process text
	        yield from self._iter_links_recursion(doc.getText())

	        # process tables
	        for table in doc.getTextTables():
	            for cell_name in table.getCellNames():
	                yield from self._iter_links_recursion(table.getCellByName(cell_name))

	        # process footnotes
	        for note in doc.getFootnotes():
	            yield from self._iter_links_recursion(note)

	        # process endnotes
	        for note in doc.getEndnotes():
	            yield from self._iter_links_recursion(note)

	    def remove_links(self, doc, input_path: Path):
	        """
	        Clear every matching hyperlink in doc and reset its link formatting.

	        input_path is only used in the log message.
	        """
	        for elem in self.iter_links(doc):
	            if elem.HyperLinkURL and self._link_matcher.match(elem.HyperLinkURL):
	                logging.info(f"Found link '{elem.HyperLinkURL}' '{elem.getString()}' in document '{input_path}'")

	                elem.HyperLinkURL = ""
	                # NOTE(review): re-setting the unchanged text presumably makes
	                # the office drop the remaining link attributes - confirm.
	                elem.setString(elem.getString())

	                elem.setPropertyToDefault("CharStyleName")
	                elem.setPropertyToDefault("UnvisitedCharStyleName")
	                elem.setPropertyToDefault("VisitedCharStyleName")

	                # Make the former link look like ordinary text.
	                elem.CharColor = self.BLACK_COLOR
	                elem.CharUnderline = False

	    def remove_images(self, doc, input_path: Path):
	        """
	        Dispose every graphic object in doc that the image matcher accepts.

	        input_path is used to build the image name given to the matcher and
	        in the log message.
	        """
	        # Snapshot the collection first: disposing an object while the live
	        # collection is still being iterated could disturb the iteration.
	        for img in list(doc.getGraphicObjects()):

	            pil_image, fmt = self._image_matcher.get_pil_image(bytes(img.Graphic.getDIB()), mime_type=img.Graphic.MimeType)
	            img_name = f"{input_path.name}_{img.Name}.{fmt}"
	            if self._image_matcher.match(pil_image, img_name):
	                logging.info(f"Found matched image '{img_name}' in document '{input_path}'")

	                # delete image
	                img.dispose()

	    def clean(self, input_path: Path, output_path: Path):
	        """
	        Load input_path, remove links and images, and store to output_path.

	        The output filter is chosen from the suffix of input_path.
	        Raises KeyError when that suffix is not in LIBRE_OFFICE_FILTERS.
	        """
	        extension = input_path.suffix.lower()
	        try:
	            # NOTE(review): the filter follows the input suffix, not the
	            # output suffix - confirm this is intended.
	            output_filter = self.LIBRE_OFFICE_FILTERS[extension]
	        except KeyError:
	            supported = ", ".join(sorted(self.LIBRE_OFFICE_FILTERS))
	            raise KeyError(
	                f"Unsupported file extension '{extension}' (supported: {supported})"
	            ) from None

	        writer = Writer(self.context, convert_path_to_url(str(input_path)))
	        try:
	            document = writer.raw

	            self.remove_links(document, input_path)
	            self.remove_images(document, input_path)

	            writer.store_to_url(convert_path_to_url(str(output_path)), 'FilterName', output_filter)
	        finally:
	            # Close even on failure so the document is not left open.
	            writer.close(True)


	if __name__ == "__main__":
	    # The cleaner reports what it finds through logging.info(); the root
	    # logger only emits WARNING and above unless configured, so enable INFO.
	    logging.basicConfig(level=logging.INFO)

	    parser = argparse.ArgumentParser(description='Remove links and images from Word files.')
	    parser.add_argument('input', type=Path, help='Word file to clean (.doc or .docx)')
	    parser.add_argument('output', type=Path, help='path the cleaned file is stored to')

	    args = parser.parse_args()

	    doc_file_cleaner = DocFileCleaner()
	    doc_file_cleaner.clean(args.input, args.output)