Last active
December 17, 2024 21:11
-
-
Save dchaplinsky/25d6621febb64deebc1c34471ccfe5d6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Convert TMX files to ParaConc format.

This script converts TMX (Translation Memory eXchange) files to ParaConc format,
which consists of three separate XML files: source language, target language,
and alignment information. It supports complex alignment patterns, HTML tag preservation,
and includes input validation.

Example usage:
    python tmx_to_paraconc.py input.tmx -o output_prefix
"""
import argparse
import sys
import xml.dom.minidom
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple

from tqdm import tqdm
@dataclass
class AlignmentGroup:
    """One alignment link: the source sentence ids paired with the target ones."""

    # Sentence ids on the source side of the link.
    source_ids: List[str]
    # Sentence ids on the target side of the link.
    target_ids: List[str]

    @property
    def alignment_type(self) -> str:
        """Return "<source count>-<target count>", e.g. '1-1', '2-1', '1-2'."""
        source_count = len(self.source_ids)
        target_count = len(self.target_ids)
        return f"{source_count}-{target_count}"
class TMXValidationError(Exception):
    """Signals that a TMX file did not pass validation."""
def write_xml_file(tree: ET.ElementTree, output_file: str) -> None:
    """Pretty-print an ElementTree and save it as UTF-8 text.

    The tree is serialised to a string, re-parsed with minidom and written
    out through ``toprettyxml`` so that nested elements are indented.

    Args:
        tree: ElementTree object to write
        output_file: Path to output file
    """
    serialized = ET.tostring(tree.getroot(), encoding="unicode", method="xml")
    document = xml.dom.minidom.parseString(serialized)
    with open(output_file, "w", encoding="utf-8") as handle:
        pretty = document.toprettyxml(indent=" ")
        handle.write(pretty)
class TMXDocument:
    """Represents a TMX document and handles its parsing.

    After :meth:`parse` has run, the instance exposes:

    * ``source_sentences`` / ``target_sentences``: lists of
      ``(sentence_id, text)`` tuples, with ids of the form ``"1:<n>"``.
    * ``alignment_groups``: one :class:`AlignmentGroup` per translation unit
      that produced at least one sentence.
    """

    # Attribute name under which ElementTree exposes ``xml:lang``.
    _XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"

    def __init__(self, file_path: Path):
        """Initialize TMX document parser.

        Args:
            file_path: Path to the TMX file

        Raises:
            TMXValidationError: If the TMX file is invalid
        """
        self.file_path = file_path
        # Validation returns the parsed tree so the file is read only once.
        self.tree = self._validate_tmx()
        self.root = self.tree.getroot()
        self._source_lang = self._get_source_language()
        # Set by parse() to the first non-source language it encounters.
        self._target_lang: Optional[str] = None
        # Sentences of the translation unit currently being processed.
        self.current_source_group: List[Tuple[str, str]] = []
        self.current_target_group: List[Tuple[str, str]] = []
        self.alignment_groups: List[AlignmentGroup] = []
        self.source_sentences: List[Tuple[str, str]] = []
        self.target_sentences: List[Tuple[str, str]] = []

    def _validate_tmx(self) -> ET.ElementTree:
        """Parse the TMX file and validate its structure.

        Returns:
            The parsed ElementTree of the validated file

        Raises:
            TMXValidationError: If the file cannot be read or parsed, or if
                the root element, header, ``srclang`` attribute, body or
                translation units are missing
        """
        try:
            tree = ET.parse(self.file_path)
        except ET.ParseError as e:
            raise TMXValidationError(f"Invalid XML format: {e}") from e
        except (OSError, ValueError, LookupError) as e:
            # Unreadable file or unsupported encoding.
            raise TMXValidationError(f"Validation failed: {e}") from e

        # Structural checks live outside the try block so their specific
        # messages are not caught and re-wrapped.
        root = tree.getroot()
        if root.tag != "tmx":
            raise TMXValidationError("Root element must be 'tmx'")
        header = root.find("header")
        if header is None:
            raise TMXValidationError("Missing required 'header' element")
        if not header.get("srclang"):
            raise TMXValidationError(
                "Missing required 'srclang' attribute in header"
            )
        body = root.find("body")
        if body is None:
            raise TMXValidationError("Missing required 'body' element")
        if body.find("tu") is None:
            raise TMXValidationError("No translation units found in the TMX file")
        return tree

    def _get_source_language(self) -> str:
        """Extract source language from TMX header.

        Returns:
            Source language code
        """
        header = self.root.find("header")
        return header.get("srclang")

    def _preserve_html(self, element: Optional[ET.Element]) -> str:
        """Return the content of an element with inline markup kept as text.

        The result is the element's leading text followed by the XML
        serialisation of each direct child. Serialising a child already
        covers its descendants and its tail text, so only direct children
        are visited; walking every descendant would emit nested markup twice.

        Args:
            element: XML element containing text and possible HTML tags, or
                None when the element is missing

        Returns:
            String with preserved HTML tags; empty string for None
        """
        if element is None:
            return ""
        parts = [element.text or ""]
        parts.extend(
            ET.tostring(child, encoding="unicode", method="xml")
            for child in element
        )
        return "".join(parts)

    @staticmethod
    def _split_sentences(text: str, sentence_break: Optional[str]) -> List[str]:
        """Split segment text into stripped, non-empty sentences.

        Args:
            text: Segment text to split
            sentence_break: Separator string; when None or empty the whole
                text is treated as a single sentence

        Returns:
            List of sentences with surrounding whitespace removed
        """
        # str.split(None) would split on whitespace and str.split("") raises
        # ValueError, so a missing separator must be handled explicitly.
        pieces = text.split(sentence_break) if sentence_break else [text]
        stripped = (piece.strip() for piece in pieces)
        return [piece for piece in stripped if piece]

    def _flush_current_groups(self) -> None:
        """Create alignment group from current source and target sentences."""
        if self.current_source_group or self.current_target_group:
            source_ids = [id_ for id_, _ in self.current_source_group]
            target_ids = [id_ for id_, _ in self.current_target_group]
            self.source_sentences += self.current_source_group
            self.target_sentences += self.current_target_group
            self.alignment_groups.append(AlignmentGroup(source_ids, target_ids))
            self.current_source_group = []
            self.current_target_group = []

    def parse(self) -> None:
        """Parse TMX file and extract sentence pairs.

        Every ``<tu>`` contributes one alignment group. Its source and target
        segments are split on the text of the unit's ``x-sentbreak`` prop when
        one is present. Sentence ids are ``"1:<n>"``, numbered independently
        for the source and target sides.
        """
        source_sentence_id = 1
        target_sentence_id = 1
        for tu in tqdm(self.root.findall(".//tu"), desc="Parsing TMX"):
            # Get sentence break information for this specific TU
            prop = tu.find('prop[@type="x-sentbreak"]')
            sentence_break = prop.text if prop is not None else None

            source_seg = None
            target_seg = None
            for tuv in tu.findall("tuv"):
                lang = tuv.get(self._XML_LANG)
                seg = tuv.find("seg")
                if lang == self._source_lang:
                    source_seg = self._preserve_html(seg)
                elif not self._target_lang:
                    # The first non-source language seen becomes the target.
                    self._target_lang = lang
                    target_seg = self._preserve_html(seg)
                elif lang == self._target_lang:
                    target_seg = self._preserve_html(seg)

            # Handle source segment if present
            if source_seg:
                for sent in self._split_sentences(source_seg, sentence_break):
                    self.current_source_group.append(
                        (f"1:{source_sentence_id}", sent)
                    )
                    source_sentence_id += 1

            # Handle target segment if present
            if target_seg:
                for sent in self._split_sentences(target_seg, sentence_break):
                    self.current_target_group.append(
                        (f"1:{target_sentence_id}", sent)
                    )
                    target_sentence_id += 1

            self._flush_current_groups()
class ParaConcConverter:
    """Writes the data of a parsed TMX document as ParaConc XML files."""

    def __init__(self, tmx_doc: TMXDocument, output_path_and_prefix: Path):
        """Remember the document and prepare the output location.

        The parent directory of ``output_path_and_prefix`` is created when it
        does not exist; its final component is used as the file name prefix.

        Args:
            tmx_doc: Parsed TMX document
            output_path_and_prefix: Prefix for output files
        """
        self.tmx_doc = tmx_doc
        self.output_dir = output_path_and_prefix.parent
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.output_prefix = output_path_and_prefix.name

    def _create_language_file(self, lang: str, is_source: bool, desc: str) -> None:
        """Write ``<prefix>.<lang>-00.xml`` holding one side of the corpus.

        All sentences are placed in a single ``<p id="1">`` paragraph, ordered
        by the numeric part of their ids.

        Args:
            lang: Language code
            is_source: True if creating source language file
            desc: Description for progress bar
        """

        def sentence_number(entry):
            # Ids look like "1:<n>"; order by the number after the colon.
            return int(entry[0].split(":")[1])

        doc = ET.Element(
            "doc",
            {
                "id": Path(self.output_prefix).stem,
                "language": lang,
                "version": "00",
            },
        )
        # Create a single paragraph for all sentences
        paragraph = ET.SubElement(doc, "p", {"id": "1"})

        if is_source:
            sentences = self.tmx_doc.source_sentences
        else:
            sentences = self.tmx_doc.target_sentences

        # Sort sentences by ID just in case
        for sent_id, text in tqdm(sorted(sentences, key=sentence_number), desc=desc):
            node = ET.SubElement(paragraph, "s", {"id": sent_id})
            node.text = text

        destination = self.output_dir / f"{self.output_prefix}.{lang}-00.xml"
        write_xml_file(ET.ElementTree(doc), destination)

    def _create_alignment_file(self) -> None:
        """Write the alignment file linking source and target sentences.

        Each alignment group becomes a ``<link>`` whose ``xtargets`` attribute
        lists the target ids, a semicolon, then the source ids.
        """
        src_lang = self.tmx_doc._source_lang
        tgt_lang = self.tmx_doc._target_lang

        link_group = ET.Element(
            "linkGrp",
            {
                "fromDoc": f"{self.output_prefix}.{src_lang}-00.xml",
                "toDoc": f"{self.output_prefix}.{tgt_lang}-00.xml",
            },
        )

        groups = tqdm(self.tmx_doc.alignment_groups, desc="Creating alignment file")
        for group in groups:
            targets = " ".join(group.target_ids)
            sources = " ".join(group.source_ids)
            ET.SubElement(
                link_group,
                "link",
                {
                    "type": group.alignment_type,
                    "xtargets": f"{targets};{sources}",
                    "status": "man",
                },
            )

        file_name = f"{self.output_prefix}.{src_lang}-00.{tgt_lang}-00.alignment.xml"
        write_xml_file(ET.ElementTree(link_group), self.output_dir / file_name)

    def convert(self) -> None:
        """Write the source file, the target file and then the alignment file."""
        language_jobs = (
            (self.tmx_doc._source_lang, True, "Creating source language file"),
            (self.tmx_doc._target_lang, False, "Creating target language file"),
        )
        for lang, is_source, desc in language_jobs:
            self._create_language_file(lang, is_source, desc)

        self._create_alignment_file()
def main():
    """Command-line entry point: convert one TMX file to ParaConc files.

    Parses the command line, loads and parses the TMX file, then writes the
    ParaConc output. If the TMX file fails validation, the error is printed
    to stderr and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Convert TMX files to ParaConc format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("input_file", type=Path, help="Input TMX file")
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        required=True,
        help="Output file prefix (without extension)",
    )
    args = parser.parse_args()

    try:
        print(f"Processing {args.input_file}")
        tmx_doc = TMXDocument(args.input_file)
        tmx_doc.parse()
        converter = ParaConcConverter(tmx_doc, args.output)
        converter.convert()
        print("Conversion completed successfully!")
    except TMXValidationError as e:
        # Diagnostics go to stderr so stdout stays clean; sys.exit is used
        # because the bare exit() helper is only installed by the site module.
        print(f"Error: Invalid TMX file - {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment