Skip to content

Instantly share code, notes, and snippets.

@dchaplinsky
Last active December 17, 2024 21:11
Show Gist options
  • Save dchaplinsky/25d6621febb64deebc1c34471ccfe5d6 to your computer and use it in GitHub Desktop.
Save dchaplinsky/25d6621febb64deebc1c34471ccfe5d6 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""Convert TMX files to ParaConc format.
This script converts TMX (Translation Memory eXchange) files to ParaConc format,
which consists of three separate XML files: source language, target language,
and alignment information. It supports complex alignment patterns, HTML tag preservation,
and includes input validation.
Example usage:
python tmx_to_paraconc.py input.tmx -o output_prefix
"""
import argparse
import xml.etree.ElementTree as ET
import xml.dom.minidom
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple
from tqdm import tqdm
@dataclass
class AlignmentGroup:
    """A single alignment link between source and target sentences.

    Holds the sentence IDs on each side of the link; either list may
    contain zero, one or several IDs.
    """

    # IDs of the sentences on the source-language side of the link.
    source_ids: List[str]
    # IDs of the sentences on the target-language side of the link.
    target_ids: List[str]

    @property
    def alignment_type(self) -> str:
        """Return '<source count>-<target count>', e.g. '1-1', '2-1', '1-2'."""
        counts = (len(self.source_ids), len(self.target_ids))
        return "-".join(map(str, counts))
class TMXValidationError(Exception):
    """Signals that a TMX file failed validation.

    Carries no extra state; the human-readable reason is the exception
    message.
    """
def write_xml_file(tree: ET.ElementTree, output_file: str) -> None:
    """Serialize an XML tree to a pretty-printed UTF-8 file.

    The tree is first rendered to a string, then re-parsed with minidom
    so that its pretty-printer can add line breaks and indentation.

    Args:
        tree: ElementTree object to write
        output_file: Path to output file
    """
    raw_xml = ET.tostring(tree.getroot(), encoding="unicode", method="xml")
    pretty_xml = xml.dom.minidom.parseString(raw_xml).toprettyxml(indent=" ")
    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write(pretty_xml)
class TMXDocument:
    """Represents a TMX document and handles its parsing.

    After :meth:`parse` has run, ``source_sentences`` and
    ``target_sentences`` hold ``(sentence_id, text)`` tuples and
    ``alignment_groups`` holds one ``AlignmentGroup`` for every translation
    unit that produced at least one sentence.
    """

    # Name under which ElementTree exposes the ``xml:lang`` attribute.
    _XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"

    def __init__(self, file_path: Path):
        """Initialize TMX document parser.

        Args:
            file_path: Path to the TMX file

        Raises:
            TMXValidationError: If the TMX file is invalid
        """
        self.file_path = file_path
        # Validation has to parse the file anyway, so reuse its tree instead
        # of reading and parsing the (possibly large) file a second time.
        self.tree = self._validate_tmx()
        self.root = self.tree.getroot()
        self._source_lang = self._get_source_language()
        # Set by parse() to the first non-source language it encounters.
        self._target_lang = None
        # Sentences of the translation unit currently being processed.
        self.current_source_group: List[Tuple[str, str]] = []
        self.current_target_group: List[Tuple[str, str]] = []
        self.alignment_groups: List[AlignmentGroup] = []
        self.source_sentences: List[Tuple[str, str]] = []
        self.target_sentences: List[Tuple[str, str]] = []

    def _validate_tmx(self) -> ET.ElementTree:
        """Validate TMX file structure and content.

        Checks that the root element is ``tmx``, that it has a ``header``
        with a non-empty ``srclang`` attribute, and a ``body`` with at least
        one direct ``tu`` child.

        Returns:
            The parsed tree, so the caller does not need to parse again.

        Raises:
            TMXValidationError: If the file cannot be read or parsed, or if
                any of the structural checks fails.
        """
        # Only the load step is wrapped: the structural checks below raise
        # TMXValidationError themselves and must not be re-wrapped into a
        # generic "Validation failed: ..." message.
        try:
            tree = ET.parse(self.file_path)
        except ET.ParseError as e:
            raise TMXValidationError(f"Invalid XML format: {str(e)}") from e
        except Exception as e:
            # Missing/unreadable file and similar problems are reported as
            # validation errors so the caller has one exception type to handle.
            raise TMXValidationError(f"Validation failed: {str(e)}") from e

        root = tree.getroot()
        if root.tag != "tmx":
            raise TMXValidationError("Root element must be 'tmx'")

        header = root.find("header")
        if header is None:
            raise TMXValidationError("Missing required 'header' element")
        if not header.get("srclang"):
            raise TMXValidationError(
                "Missing required 'srclang' attribute in header"
            )

        body = root.find("body")
        if body is None:
            raise TMXValidationError("Missing required 'body' element")
        if body.find("tu") is None:
            raise TMXValidationError("No translation units found in the TMX file")

        return tree

    def _get_source_language(self) -> str:
        """Extract source language from TMX header.

        Returns:
            Value of the header's ``srclang`` attribute
        """
        header = self.root.find("header")
        return header.get("srclang")

    def _preserve_html(self, element: ET.Element) -> str:
        """Return the content of *element* with inline child tags kept as markup.

        The element's own leading text is returned as-is; every direct child
        is serialized as XML. Serializing a child already includes its nested
        descendants and its tail text, so only direct children are visited
        (walking all descendants would emit nested tags twice).

        Args:
            element: XML element containing text and possible HTML tags

        Returns:
            String with preserved HTML tags
        """
        pieces = [element.text or ""]
        pieces.extend(
            ET.tostring(child, encoding="unicode", method="xml")
            for child in element
        )
        return "".join(pieces)

    @staticmethod
    def _split_sentences(text: str, sentence_break: "str | None") -> List[str]:
        """Split a segment into stripped, non-empty sentences.

        Args:
            text: Segment text
            sentence_break: Separator string, or None/empty when the
                translation unit declares no sentence break

        Returns:
            List of sentences; the whole segment as one sentence when there
            is no separator.
        """
        # str.split(None) would split on whitespace (one "sentence" per word)
        # and str.split("") raises ValueError, so only split on a real separator.
        parts = text.split(sentence_break) if sentence_break else [text]
        stripped = (part.strip() for part in parts)
        return [part for part in stripped if part]

    def _flush_current_groups(self) -> None:
        """Create alignment group from current source and target sentences.

        Does nothing when both current groups are empty. Otherwise records
        the sentences, appends one AlignmentGroup and resets both groups.
        """
        if self.current_source_group or self.current_target_group:
            source_ids = [id_ for id_, _ in self.current_source_group]
            target_ids = [id_ for id_, _ in self.current_target_group]
            self.source_sentences += self.current_source_group
            self.target_sentences += self.current_target_group
            self.alignment_groups.append(AlignmentGroup(source_ids, target_ids))
            self.current_source_group = []
            self.current_target_group = []

    def parse(self) -> None:
        """Parse TMX file and extract sentence pairs.

        For every ``tu`` element the source and target segments are split on
        the unit's ``x-sentbreak`` property (if present); each resulting
        sentence gets a sequential ID of the form ``1:<n>``, numbered
        independently for source and target. One alignment group is flushed
        per translation unit.

        The target language is the first non-source ``xml:lang`` value seen;
        segments in any other language are ignored.
        """
        source_sentence_id = 1
        target_sentence_id = 1
        for tu in tqdm(self.root.findall(".//tu"), desc="Parsing TMX"):
            # Get sentence break information for this specific TU
            prop = tu.find('prop[@type="x-sentbreak"]')
            sentence_break = prop.text if prop is not None else None

            source_seg = None
            target_seg = None
            for tuv in tu.findall("tuv"):
                seg = tuv.find("seg")
                if seg is None:
                    # A variant without a segment carries no text to align.
                    continue
                lang = tuv.get(self._XML_LANG)
                if lang == self._source_lang:
                    source_seg = self._preserve_html(seg)
                elif not self._target_lang:
                    self._target_lang = lang
                    target_seg = self._preserve_html(seg)
                elif lang == self._target_lang:
                    target_seg = self._preserve_html(seg)

            # Handle source segment if present
            if source_seg:
                for sent in self._split_sentences(source_seg, sentence_break):
                    self.current_source_group.append(
                        (f"1:{source_sentence_id}", sent)
                    )
                    source_sentence_id += 1

            # Handle target segment if present
            if target_seg:
                for sent in self._split_sentences(target_seg, sentence_break):
                    self.current_target_group.append(
                        (f"1:{target_sentence_id}", sent)
                    )
                    target_sentence_id += 1

            self._flush_current_groups()
class ParaConcConverter:
    """Writes the data of a parsed TMX document as ParaConc files.

    Three files are produced next to each other: one XML file per language
    and one alignment file linking their sentence IDs.
    """

    def __init__(self, tmx_doc: TMXDocument, output_path_and_prefix: Path):
        """Remember the document and prepare the output location.

        The parent directory of the prefix is created if it does not exist.

        Args:
            tmx_doc: Parsed TMX document
            output_path_and_prefix: Prefix for output files
        """
        self.tmx_doc = tmx_doc
        self.output_prefix = output_path_and_prefix.name
        self.output_dir = output_path_and_prefix.parent
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _create_language_file(self, lang: str, is_source: bool, desc: str) -> None:
        """Write the XML file holding all sentences of one language.

        All sentences are placed in a single ``<p id="1">`` paragraph, in
        ascending order of the numeric part of their ID.

        Args:
            lang: Language code
            is_source: True if creating source language file
            desc: Description for progress bar
        """
        doc = ET.Element(
            "doc",
            {
                "id": Path(self.output_prefix).stem,
                "language": lang,
                "version": "00",
            },
        )
        # One paragraph carries every sentence of the document.
        paragraph = ET.SubElement(doc, "p", {"id": "1"})

        if is_source:
            sentences = self.tmx_doc.source_sentences
        else:
            sentences = self.tmx_doc.target_sentences

        def numeric_part(entry):
            # IDs look like "<paragraph>:<number>"; order by the number.
            return int(entry[0].split(":")[1])

        for sent_id, text in tqdm(sorted(sentences, key=numeric_part), desc=desc):
            node = ET.SubElement(paragraph, "s", {"id": sent_id})
            node.text = text

        target_path = self.output_dir / f"{self.output_prefix}.{lang}-00.xml"
        write_xml_file(ET.ElementTree(doc), target_path)

    def _create_alignment_file(self) -> None:
        """Write the alignment XML file, one ``<link>`` per alignment group.

        NOTE(review): ``xtargets`` lists the target IDs before the source
        IDs, whereas ``type`` (from ``group.alignment_type``) counts source
        before target -- confirm this is the order the consuming tool expects.
        """
        src_lang = self.tmx_doc._source_lang
        tgt_lang = self.tmx_doc._target_lang
        prefix = self.output_prefix

        link_group = ET.Element(
            "linkGrp",
            {
                "fromDoc": f"{prefix}.{src_lang}-00.xml",
                "toDoc": f"{prefix}.{tgt_lang}-00.xml",
            },
        )
        for group in tqdm(
            self.tmx_doc.alignment_groups, desc="Creating alignment file"
        ):
            target_part = " ".join(group.target_ids)
            source_part = " ".join(group.source_ids)
            ET.SubElement(
                link_group,
                "link",
                {
                    "type": group.alignment_type,
                    "xtargets": f"{target_part};{source_part}",
                    "status": "man",
                },
            )

        file_name = f"{prefix}.{src_lang}-00.{tgt_lang}-00.alignment.xml"
        write_xml_file(ET.ElementTree(link_group), self.output_dir / file_name)

    def convert(self) -> None:
        """Produce the source file, the target file and the alignment file."""
        doc = self.tmx_doc
        self._create_language_file(
            doc._source_lang, True, "Creating source language file"
        )
        self._create_language_file(
            doc._target_lang, False, "Creating target language file"
        )
        self._create_alignment_file()
def main():
    """Main entry point for the script.

    Parses the command line, converts the given TMX file and writes the
    output files under the given prefix. If the TMX file fails validation,
    the error is reported on stderr and the process exits with status 1.
    """
    # Local import keeps this function self-contained; ``sys.exit`` is used
    # because the bare ``exit`` builtin is injected by the ``site`` module
    # for interactive use and is not guaranteed to exist in every run mode.
    import sys

    parser = argparse.ArgumentParser(
        description="Convert TMX files to ParaConc format",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("input_file", type=Path, help="Input TMX file")
    parser.add_argument(
        "-o",
        "--output",
        type=Path,
        required=True,
        help="Output file prefix (without extension)",
    )
    args = parser.parse_args()

    try:
        print(f"Processing {args.input_file}")
        tmx_doc = TMXDocument(args.input_file)
        tmx_doc.parse()
        converter = ParaConcConverter(tmx_doc, args.output)
        converter.convert()
        print("Conversion completed successfully!")
    except TMXValidationError as e:
        # Diagnostics go to stderr so they do not mix with normal output.
        print(f"Error: Invalid TMX file - {str(e)}", file=sys.stderr)
        sys.exit(1)


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment