dchaplinsky · December 17, 2024 21:11
diff --git a/tmx_to_paraconc.py b/tmx_to_paraconc.py
 #!/usr/bin/env python3
 """Convert TMX files to ParaConc format.

 This script converts TMX (Translation Memory eXchange) files to ParaConc format,
 which consists of three separate XML files: source language, target language,
 and alignment information. It supports complex alignment patterns, HTML tag preservation,
 and includes input validation.

 Example usage:
    python tmx_to_paraconc.py input.tmx -o output_prefix
 """

 import argparse
 import xml.etree.ElementTree as ET
 import xml.dom.minidom
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Tuple
 from tqdm import tqdm


 @dataclass
 class AlignmentGroup:
    """One alignment link: the source and target sentence IDs it joins."""

    # IDs of the source-side sentences in this group.
    source_ids: List[str]
    # IDs of the target-side sentences in this group.
    target_ids: List[str]

    @property
    def alignment_type(self) -> str:
        """Return '<source count>-<target count>', e.g. '1-1', '2-1', '1-2'."""
        source_count = len(self.source_ids)
        target_count = len(self.target_ids)
        return "-".join((str(source_count), str(target_count)))


 class TMXValidationError(Exception):
    """Signals that a TMX file failed structural or content validation."""


 def write_xml_file(tree: ET.ElementTree, output_file: str) -> None:
    """Serialize an XML tree to disk in pretty-printed form.

    The tree is first rendered to a string, then re-parsed with minidom so
    that its pretty-printer can add line breaks and two-space indentation.

    Args:
        tree: ElementTree object to write
        output_file: Path to output file (written as UTF-8)
    """
    serialized = ET.tostring(tree.getroot(), encoding="unicode", method="xml")
    document = xml.dom.minidom.parseString(serialized)
    with open(output_file, "w", encoding="utf-8") as handle:
        pretty = document.toprettyxml(indent="  ")
        handle.write(pretty)


 class TMXDocument:
    """Load a TMX file and split its translation units into aligned sentences.

    After ``parse()`` has run, ``source_sentences`` and ``target_sentences``
    hold ``(sentence_id, text)`` tuples with IDs of the form ``"1:<n>"``, and
    ``alignment_groups`` holds one ``AlignmentGroup`` per translation unit that
    produced at least one sentence.
    """

    # Structural TMX elements: only their text is copied, never their markup.
    _STRUCTURAL_TAGS = ("seg", "tuv", "tu")
    # ElementTree spelling of the ``xml:lang`` attribute.
    _XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"

    def __init__(self, file_path: Path):
        """Validate and load a TMX file.

        Args:
            file_path: Path to the TMX file

        Raises:
            TMXValidationError: If the TMX file is invalid
        """
        self.file_path = file_path
        self._validate_tmx()
        self.tree = ET.parse(file_path)
        self.root = self.tree.getroot()
        self._source_lang = self._get_source_language()
        # Set by parse() to the first non-source language it encounters.
        self._target_lang = None
        # Sentences of the translation unit currently being processed.
        self.current_source_group: List[Tuple[str, str]] = []
        self.current_target_group: List[Tuple[str, str]] = []
        self.alignment_groups: List[AlignmentGroup] = []
        self.source_sentences: List[Tuple[str, str]] = []
        self.target_sentences: List[Tuple[str, str]] = []

    def _validate_tmx(self) -> None:
        """Validate TMX file structure and content.

        Checks that the file parses as XML, that the root is ``tmx``, that a
        ``header`` with a non-empty ``srclang`` exists, and that ``body``
        contains at least one ``tu``.

        Raises:
            TMXValidationError: If validation fails
        """
        # Only loading the file is guarded, so the specific structural
        # messages below reach the caller unchanged instead of being re-wrapped.
        try:
            root = ET.parse(self.file_path).getroot()
        except ET.ParseError as e:
            raise TMXValidationError(f"Invalid XML format: {e}") from e
        except Exception as e:
            # Missing or unreadable files are still reported as validation errors.
            raise TMXValidationError(f"Validation failed: {e}") from e

        if root.tag != "tmx":
            raise TMXValidationError("Root element must be 'tmx'")

        header = root.find("header")
        if header is None:
            raise TMXValidationError("Missing required 'header' element")

        if not header.get("srclang"):
            raise TMXValidationError("Missing required 'srclang' attribute in header")

        body = root.find("body")
        if body is None:
            raise TMXValidationError("Missing required 'body' element")

        if body.find("tu") is None:
            raise TMXValidationError("No translation units found in the TMX file")

    def _get_source_language(self) -> str:
        """Extract source language from TMX header.

        Returns:
            Source language code (the header's ``srclang`` attribute)
        """
        header = self.root.find("header")
        return header.get("srclang")

    def _preserve_html(self, element: ET.Element) -> str:
        """Render an element's content as text, keeping inline tags as markup.

        Structural elements (``seg``, ``tuv``, ``tu``) contribute their text
        followed by their rendered children. Any other element is serialized
        exactly once together with its whole subtree and its tail text, so
        nested inline tags are not emitted a second time.

        Args:
            element: XML element containing text and possible HTML tags

        Returns:
            String with preserved HTML tags
        """
        if element.tag not in self._STRUCTURAL_TAGS:
            return ET.tostring(element, encoding="unicode", method="xml")
        parts = [element.text or ""]
        parts.extend(self._preserve_html(child) for child in element)
        return "".join(parts)

    @staticmethod
    def _split_sentences(text: str, sentence_break: str = "") -> List[str]:
        """Split segment text into stripped, non-empty sentences.

        Args:
            text: Segment text to split
            sentence_break: Separator string; when empty the whole text is
                treated as a single sentence

        Returns:
            List of sentences in their original order
        """
        # str.split() without a separator would split on every whitespace run,
        # so the no-separator case is handled explicitly.
        pieces = text.split(sentence_break) if sentence_break else [text]
        stripped = (piece.strip() for piece in pieces)
        return [piece for piece in stripped if piece]

    def _flush_current_groups(self) -> None:
        """Create alignment group from current source and target sentences."""
        if self.current_source_group or self.current_target_group:
            source_ids = [id_ for id_, _ in self.current_source_group]
            target_ids = [id_ for id_, _ in self.current_target_group]

            self.source_sentences += self.current_source_group
            self.target_sentences += self.current_target_group
            self.alignment_groups.append(AlignmentGroup(source_ids, target_ids))
            self.current_source_group = []
            self.current_target_group = []

    def parse(self) -> None:
        """Parse TMX file and extract sentence pairs.

        Each ``tu`` becomes one alignment group. Its source and target
        segments are split on the TU's ``x-sentbreak`` prop when present;
        without that prop each segment is kept as one sentence.
        """
        source_sentence_id = 1
        target_sentence_id = 1

        for tu in tqdm(self.root.findall(".//tu"), desc="Parsing TMX"):
            # Get sentence break information for this specific TU
            prop = tu.find('prop[@type="x-sentbreak"]')
            sentence_break = (prop.text or "") if prop is not None else ""

            source_seg = None
            target_seg = None

            for tuv in tu.findall("tuv"):
                seg = tuv.find("seg")
                if seg is None:
                    # A variant without a segment carries no text to align.
                    continue
                lang = tuv.get(self._XML_LANG)

                if lang == self._source_lang:
                    source_seg = self._preserve_html(seg)
                elif not self._target_lang:
                    # The first non-source language seen becomes the target.
                    self._target_lang = lang
                    target_seg = self._preserve_html(seg)
                elif lang == self._target_lang:
                    target_seg = self._preserve_html(seg)

            # Handle source segment if present
            if source_seg:
                for sent in self._split_sentences(source_seg, sentence_break):
                    self.current_source_group.append(
                        (f"1:{source_sentence_id}", sent)
                    )
                    source_sentence_id += 1

            # Handle target segment if present
            if target_seg:
                for sent in self._split_sentences(target_seg, sentence_break):
                    self.current_target_group.append(
                        (f"1:{target_sentence_id}", sent)
                    )
                    target_sentence_id += 1

            self._flush_current_groups()


 class ParaConcConverter:
    """Write a parsed TMX document out as ParaConc files.

    Three files are produced in the same directory: one per language
    (``<prefix>.<lang>-00.xml``) and one alignment file
    (``<prefix>.<src>-00.<tgt>-00.alignment.xml``).
    """

    def __init__(self, tmx_doc: TMXDocument, output_path_and_prefix: Path):
        """Initialize ParaConc converter.

        The parent directory of the prefix is created if it does not exist.

        Args:
            tmx_doc: Parsed TMX document
            output_path_and_prefix: Prefix for output files
        """
        self.tmx_doc = tmx_doc
        self.output_dir = output_path_and_prefix.parent
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.output_prefix = output_path_and_prefix.name

    def _language_file_name(self, lang: str) -> str:
        """Return the file name (without directory) of a language file."""
        return f"{self.output_prefix}.{lang}-00.xml"

    def _create_language_file(self, lang: str, is_source: bool, desc: str) -> None:
        """Create a language-specific XML file in ParaConc format.

        Args:
            lang: Language code
            is_source: True if creating source language file
            desc: Description for progress bar
        """
        root = ET.Element("doc")
        root.set("id", Path(self.output_prefix).stem)
        root.set("language", lang)
        root.set("version", "00")

        # Create a single paragraph for all sentences
        para = ET.SubElement(root, "p")
        para.set("id", "1")

        sentences = (
            self.tmx_doc.source_sentences
            if is_source
            else self.tmx_doc.target_sentences
        )
        # Sort by the numeric part of the "1:<n>" ID just in case
        sorted_sentences = sorted(
            sentences,
            key=lambda x: int(x[0].split(":")[1]),
        )

        for sent_id, text in tqdm(sorted_sentences, desc=desc):
            sent = ET.SubElement(para, "s")
            sent.set("id", sent_id)
            sent.text = text

        tree = ET.ElementTree(root)
        output_file = self.output_dir / self._language_file_name(lang)
        write_xml_file(tree, output_file)

    def _create_alignment_file(self) -> None:
        """Create alignment XML file with support for complex alignments."""
        source_lang = self.tmx_doc._source_lang
        target_lang = self.tmx_doc._target_lang

        root = ET.Element("linkGrp")
        root.set("fromDoc", self._language_file_name(source_lang))
        root.set("toDoc", self._language_file_name(target_lang))

        for group in tqdm(
            self.tmx_doc.alignment_groups, desc="Creating alignment file"
        ):
            link = ET.SubElement(root, "link")
            link.set("type", group.alignment_type)
            # NOTE(review): xtargets lists target IDs before source IDs while
            # "type" counts source first; presumably the consumer expects
            # this order -- confirm against the format specification.
            link.set(
                "xtargets", f"{' '.join(group.target_ids)};{' '.join(group.source_ids)}"
            )
            link.set("status", "man")

        tree = ET.ElementTree(root)
        alignment_name = (
            f"{self.output_prefix}.{source_lang}-00.{target_lang}-00.alignment.xml"
        )
        write_xml_file(tree, self.output_dir / alignment_name)

    def convert(self) -> None:
        """Convert TMX data to ParaConc format files.

        Raises:
            TMXValidationError: If the document has no target language, which
                would otherwise fail during serialization after the source
                file had already been written
        """
        if self.tmx_doc._target_lang is None:
            raise TMXValidationError(
                "No target language segments found in the TMX file"
            )

        # Create source language file
        self._create_language_file(
            self.tmx_doc._source_lang, True, "Creating source language file"
        )

        # Create target language file
        self._create_language_file(
            self.tmx_doc._target_lang, False, "Creating target language file"
        )

        # Create alignment file
        self._create_alignment_file()


 def main():
    """Parse command-line arguments and run the TMX to ParaConc conversion.

    Exits with status 1 after printing an error message when the input
    fails TMX validation.
    """
    parser = argparse.ArgumentParser(
        description="Convert TMX files to ParaConc format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("input_file", type=Path, help="Input TMX file")
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        required=True,
        help="Output file prefix (without extension)",
    )

    args = parser.parse_args()

    print(f"Processing {args.input_file}")
    try:
        tmx_doc = TMXDocument(args.input_file)
        tmx_doc.parse()

        converter = ParaConcConverter(tmx_doc, args.output)
        converter.convert()
    except TMXValidationError as e:
        print(f"Error: Invalid TMX file - {e}")
        # The exit() builtin is injected by the site module and is not
        # guaranteed to exist; raising SystemExit works everywhere.
        raise SystemExit(1)

    print("Conversion completed successfully!")

 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""Convert TMX files to ParaConc format.

	This script converts TMX (Translation Memory eXchange) files to ParaConc format,
	which consists of three separate XML files: source language, target language,
	and alignment information. It supports complex alignment patterns, HTML tag preservation,
	and includes input validation.

	Example usage:
	python tmx_to_paraconc.py input.tmx -o output_prefix
	"""

	import argparse
	import xml.etree.ElementTree as ET
	import xml.dom.minidom
	from dataclasses import dataclass
	from pathlib import Path
	from typing import List, Tuple
	from tqdm import tqdm


	@dataclass
	class AlignmentGroup:
	    """Represents a group of aligned sentences."""

	    # IDs of the source-side sentences in this group.
	    source_ids: List[str]
	    # IDs of the target-side sentences in this group.
	    target_ids: List[str]

	    @property
	    def alignment_type(self) -> str:
	        """Get the alignment type (e.g., '1-1', '2-1', '1-2')."""
	        return f"{len(self.source_ids)}-{len(self.target_ids)}"


	class TMXValidationError(Exception):
	    """Raised when TMX file validation fails."""


	def write_xml_file(tree: ET.ElementTree, output_file: str) -> None:
	    """Write XML tree to file with proper indentation.

	    The tree is rendered to a string and re-parsed with minidom so that
	    its pretty-printer can add line breaks and indentation.

	    Args:
	        tree: ElementTree object to write
	        output_file: Path to output file (written as UTF-8)
	    """
	    xml_str = ET.tostring(tree.getroot(), encoding="unicode", method="xml")
	    dom = xml.dom.minidom.parseString(xml_str)
	    with open(output_file, "w", encoding="utf-8") as f:
	        f.write(dom.toprettyxml(indent=" "))


	class TMXDocument:
	    """Represents a TMX document and handles its parsing.

	    After ``parse()`` has run, ``source_sentences`` and ``target_sentences``
	    hold ``(sentence_id, text)`` tuples with IDs of the form ``"1:<n>"``, and
	    ``alignment_groups`` holds one ``AlignmentGroup`` per translation unit
	    that produced at least one sentence.
	    """

	    # Structural TMX elements: only their text is copied, never their markup.
	    _STRUCTURAL_TAGS = ("seg", "tuv", "tu")
	    # ElementTree spelling of the ``xml:lang`` attribute.
	    _XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"

	    def __init__(self, file_path: Path):
	        """Initialize TMX document parser.

	        Args:
	            file_path: Path to the TMX file

	        Raises:
	            TMXValidationError: If the TMX file is invalid
	        """
	        self.file_path = file_path
	        self._validate_tmx()
	        self.tree = ET.parse(file_path)
	        self.root = self.tree.getroot()
	        self._source_lang = self._get_source_language()
	        # Set by parse() to the first non-source language it encounters.
	        self._target_lang = None
	        # Sentences of the translation unit currently being processed.
	        self.current_source_group: List[Tuple[str, str]] = []
	        self.current_target_group: List[Tuple[str, str]] = []
	        self.alignment_groups: List[AlignmentGroup] = []
	        self.source_sentences: List[Tuple[str, str]] = []
	        self.target_sentences: List[Tuple[str, str]] = []

	    def _validate_tmx(self) -> None:
	        """Validate TMX file structure and content.

	        Checks that the file parses as XML, that the root is ``tmx``, that a
	        ``header`` with a non-empty ``srclang`` exists, and that ``body``
	        contains at least one ``tu``.

	        Raises:
	            TMXValidationError: If validation fails
	        """
	        # Only loading the file is guarded, so the specific structural
	        # messages below reach the caller unchanged instead of being
	        # re-wrapped.
	        try:
	            root = ET.parse(self.file_path).getroot()
	        except ET.ParseError as e:
	            raise TMXValidationError(f"Invalid XML format: {e}") from e
	        except Exception as e:
	            # Missing or unreadable files are still reported as validation
	            # errors.
	            raise TMXValidationError(f"Validation failed: {e}") from e

	        if root.tag != "tmx":
	            raise TMXValidationError("Root element must be 'tmx'")

	        header = root.find("header")
	        if header is None:
	            raise TMXValidationError("Missing required 'header' element")

	        if not header.get("srclang"):
	            raise TMXValidationError(
	                "Missing required 'srclang' attribute in header"
	            )

	        body = root.find("body")
	        if body is None:
	            raise TMXValidationError("Missing required 'body' element")

	        if body.find("tu") is None:
	            raise TMXValidationError("No translation units found in the TMX file")

	    def _get_source_language(self) -> str:
	        """Extract source language from TMX header.

	        Returns:
	            Source language code (the header's ``srclang`` attribute)
	        """
	        header = self.root.find("header")
	        return header.get("srclang")

	    def _preserve_html(self, element: ET.Element) -> str:
	        """Preserve HTML tags in text content.

	        Structural elements (``seg``, ``tuv``, ``tu``) contribute their text
	        followed by their rendered children. Any other element is serialized
	        exactly once together with its whole subtree and its tail text, so
	        nested inline tags are not emitted a second time.

	        Args:
	            element: XML element containing text and possible HTML tags

	        Returns:
	            String with preserved HTML tags
	        """
	        if element.tag not in self._STRUCTURAL_TAGS:
	            return ET.tostring(element, encoding="unicode", method="xml")
	        parts = [element.text or ""]
	        parts.extend(self._preserve_html(child) for child in element)
	        return "".join(parts)

	    @staticmethod
	    def _split_sentences(text: str, sentence_break: str = "") -> List[str]:
	        """Split segment text into stripped, non-empty sentences.

	        Args:
	            text: Segment text to split
	            sentence_break: Separator string; when empty the whole text is
	                treated as a single sentence

	        Returns:
	            List of sentences in their original order
	        """
	        # str.split() without a separator would split on every whitespace
	        # run, so the no-separator case is handled explicitly.
	        pieces = text.split(sentence_break) if sentence_break else [text]
	        stripped = (piece.strip() for piece in pieces)
	        return [piece for piece in stripped if piece]

	    def _flush_current_groups(self) -> None:
	        """Create alignment group from current source and target sentences."""
	        if self.current_source_group or self.current_target_group:
	            source_ids = [id_ for id_, _ in self.current_source_group]
	            target_ids = [id_ for id_, _ in self.current_target_group]

	            self.source_sentences += self.current_source_group
	            self.target_sentences += self.current_target_group
	            self.alignment_groups.append(AlignmentGroup(source_ids, target_ids))
	            self.current_source_group = []
	            self.current_target_group = []

	    def parse(self) -> None:
	        """Parse TMX file and extract sentence pairs.

	        Each ``tu`` becomes one alignment group. Its source and target
	        segments are split on the TU's ``x-sentbreak`` prop when present;
	        without that prop each segment is kept as one sentence.
	        """
	        source_sentence_id = 1
	        target_sentence_id = 1

	        for tu in tqdm(self.root.findall(".//tu"), desc="Parsing TMX"):
	            # Get sentence break information for this specific TU
	            prop = tu.find('prop[@type="x-sentbreak"]')
	            sentence_break = (prop.text or "") if prop is not None else ""

	            source_seg = None
	            target_seg = None

	            for tuv in tu.findall("tuv"):
	                seg = tuv.find("seg")
	                if seg is None:
	                    # A variant without a segment carries no text to align.
	                    continue
	                lang = tuv.get(self._XML_LANG)

	                if lang == self._source_lang:
	                    source_seg = self._preserve_html(seg)
	                elif not self._target_lang:
	                    # The first non-source language seen becomes the target.
	                    self._target_lang = lang
	                    target_seg = self._preserve_html(seg)
	                elif lang == self._target_lang:
	                    target_seg = self._preserve_html(seg)

	            # Handle source segment if present
	            if source_seg:
	                for sent in self._split_sentences(source_seg, sentence_break):
	                    self.current_source_group.append(
	                        (f"1:{source_sentence_id}", sent)
	                    )
	                    source_sentence_id += 1

	            # Handle target segment if present
	            if target_seg:
	                for sent in self._split_sentences(target_seg, sentence_break):
	                    self.current_target_group.append(
	                        (f"1:{target_sentence_id}", sent)
	                    )
	                    target_sentence_id += 1

	            self._flush_current_groups()


	class ParaConcConverter:
	    """Converts TMX document data to ParaConc format.

	    Three files are produced in the same directory: one per language
	    (``<prefix>.<lang>-00.xml``) and one alignment file
	    (``<prefix>.<src>-00.<tgt>-00.alignment.xml``).
	    """

	    def __init__(self, tmx_doc: TMXDocument, output_path_and_prefix: Path):
	        """Initialize ParaConc converter.

	        The parent directory of the prefix is created if it does not exist.

	        Args:
	            tmx_doc: Parsed TMX document
	            output_path_and_prefix: Prefix for output files
	        """
	        self.tmx_doc = tmx_doc
	        self.output_dir = output_path_and_prefix.parent
	        self.output_dir.mkdir(parents=True, exist_ok=True)
	        self.output_prefix = output_path_and_prefix.name

	    def _language_file_name(self, lang: str) -> str:
	        """Return the file name (without directory) of a language file."""
	        return f"{self.output_prefix}.{lang}-00.xml"

	    def _create_language_file(self, lang: str, is_source: bool, desc: str) -> None:
	        """Create a language-specific XML file in ParaConc format.

	        Args:
	            lang: Language code
	            is_source: True if creating source language file
	            desc: Description for progress bar
	        """
	        root = ET.Element("doc")
	        root.set("id", Path(self.output_prefix).stem)
	        root.set("language", lang)
	        root.set("version", "00")

	        # Create a single paragraph for all sentences
	        para = ET.SubElement(root, "p")
	        para.set("id", "1")

	        # Sort sentences by the numeric part of the "1:<n>" ID just in case
	        sorted_sentences = sorted(
	            (
	                self.tmx_doc.source_sentences
	                if is_source
	                else self.tmx_doc.target_sentences
	            ),
	            key=lambda x: int(x[0].split(":")[1]),
	        )

	        for sent_id, text in tqdm(sorted_sentences, desc=desc):
	            sent = ET.SubElement(para, "s")
	            sent.set("id", sent_id)
	            sent.text = text

	        tree = ET.ElementTree(root)
	        output_file = self.output_dir / self._language_file_name(lang)
	        write_xml_file(tree, output_file)

	    def _create_alignment_file(self) -> None:
	        """Create alignment XML file with support for complex alignments."""
	        source_lang = self.tmx_doc._source_lang
	        target_lang = self.tmx_doc._target_lang

	        root = ET.Element("linkGrp")
	        root.set("fromDoc", self._language_file_name(source_lang))
	        root.set("toDoc", self._language_file_name(target_lang))

	        for group in tqdm(
	            self.tmx_doc.alignment_groups, desc="Creating alignment file"
	        ):
	            link = ET.SubElement(root, "link")
	            link.set("type", group.alignment_type)
	            # NOTE(review): xtargets lists target IDs before source IDs
	            # while "type" counts source first; presumably the consumer
	            # expects this order -- confirm against the format specification.
	            link.set(
	                "xtargets", f"{' '.join(group.target_ids)};{' '.join(group.source_ids)}"
	            )
	            link.set("status", "man")

	        tree = ET.ElementTree(root)
	        alignment_name = (
	            f"{self.output_prefix}.{source_lang}-00.{target_lang}-00.alignment.xml"
	        )
	        write_xml_file(tree, self.output_dir / alignment_name)

	    def convert(self) -> None:
	        """Convert TMX data to ParaConc format files.

	        Raises:
	            TMXValidationError: If the document has no target language,
	                which would otherwise fail during serialization after the
	                source file had already been written
	        """
	        if self.tmx_doc._target_lang is None:
	            raise TMXValidationError(
	                "No target language segments found in the TMX file"
	            )

	        # Create source language file
	        self._create_language_file(
	            self.tmx_doc._source_lang, True, "Creating source language file"
	        )

	        # Create target language file
	        self._create_language_file(
	            self.tmx_doc._target_lang, False, "Creating target language file"
	        )

	        # Create alignment file
	        self._create_alignment_file()


	def main():
	    """Main entry point for the script.

	    Parses command-line arguments and runs the conversion. Exits with
	    status 1 after printing an error message when the input fails TMX
	    validation.
	    """
	    parser = argparse.ArgumentParser(
	        description="Convert TMX files to ParaConc format",
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	    )
	    parser.add_argument("input_file", type=Path, help="Input TMX file")
	    parser.add_argument(
	        "-o",
	        "--output",
	        type=Path,
	        required=True,
	        help="Output file prefix (without extension)",
	    )

	    args = parser.parse_args()

	    print(f"Processing {args.input_file}")
	    try:
	        tmx_doc = TMXDocument(args.input_file)
	        tmx_doc.parse()

	        converter = ParaConcConverter(tmx_doc, args.output)
	        converter.convert()
	    except TMXValidationError as e:
	        print(f"Error: Invalid TMX file - {e}")
	        # The exit() builtin is injected by the site module and is not
	        # guaranteed to exist; raising SystemExit works everywhere.
	        raise SystemExit(1)

	    print("Conversion completed successfully!")

	if __name__ == "__main__":
	    main()