Created
April 25, 2025 23:17
-
-
Save dmd/85105e5a6e90381267f96aa341f61eb3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Convert Amicas-style JPEG-2000 images (*.jp2) into fully tagged | |
DICOM Part-10 files. | |
The companion XML contains one *Demographic_AmicasImage* element per | |
image as well as study-, series- and patient-level elements. The | |
converter | |
1. parses the XML once, | |
2. associates every *.jp2 file with the *correct* AmicasImage element, | |
3. merges patient / study / series / image attributes, | |
4. wraps the JPEG-2000 codestream into Pixel Data (no recompression), | |
5. writes the resulting DICOM file. | |
Example | |
------- | |
python jp2_to_dicom.py \ | |
--xml 1487_273074.XML \ | |
--in-dir 1487_273074 \ | |
--out-dir dicom_out | |
Dependencies (install once) | |
-------------------------- | |
pip install pydicom pylibjpeg pylibjpeg-openjpeg | |
`pylibjpeg` is required only when you also want to *read* the output | |
DICOMs; the converter itself never decompresses the pixel data. | |
""" | |
from __future__ import annotations | |
import argparse | |
import os | |
import sys | |
import xml.etree.ElementTree as ET | |
from datetime import datetime | |
from pathlib import Path | |
from typing import Dict, List | |
import pydicom | |
from pydicom.dataset import Dataset, FileMetaDataset | |
from pydicom.encaps import encapsulate | |
from pydicom.uid import JPEG2000, ExplicitVRLittleEndian, generate_uid | |
# --------------------------------------------------------------------- | |
# XML → plain python --------------------------------------------------- | |
# --------------------------------------------------------------------- | |
def _parse_xml(xml_path: str) -> Dict[str, dict]: | |
"""Return four dictionaries: patient, study, series_map, image_map. | |
*series_map*: key = SeriesInstanceUID | |
*image_map*: key = jp2 file stem (e.g. 'S799660_I25827012') | |
""" | |
root = ET.parse(xml_path).getroot() | |
# ---------------- Patient + Study (singletons) ------------------- | |
patient_elem = root.find("Demographic_AmicasPatient") | |
study_elem = root.find("Demographic_AmicasStudy") | |
patient: dict = patient_elem.attrib if patient_elem is not None else {} | |
study: dict = study_elem.attrib if study_elem is not None else {} | |
# ---------------- Series (zero-to-many) --------------------------- | |
series_map: Dict[str, dict] = {} | |
for selem in root.findall("Demographic_AmicasSeries"): | |
uid = selem.attrib.get("SeriesInstanceUID") | |
if uid: | |
series_map[uid] = selem.attrib | |
# ---------------- Image (zero-to-many) ---------------------------- | |
image_map: Dict[str, dict] = {} | |
for ielem in root.findall("Demographic_AmicasImage"): | |
obj_file = ielem.attrib.get("ObjectFile", "") | |
# Path may contain *backslashes* even on POSIX; we only want the | |
# filename stem without directory and extension. | |
if obj_file: | |
filename = os.path.basename(obj_file.replace("\\", "/")) | |
stem = os.path.splitext(filename)[0] | |
else: | |
stem = "" | |
if not stem: | |
# fallback: build from AmicasImageID, will still match | |
stem = f"{ielem.attrib.get('AmicasImageID', '')}" | |
image_map[stem] = ielem.attrib | |
return { | |
"patient": patient, | |
"study": study, | |
"series_map": series_map, | |
"image_map": image_map, | |
} | |
# --------------------------------------------------------------------- | |
# Mapping helpers ------------------------------------------------------ | |
# --------------------------------------------------------------------- | |
# Patient level: XML attr → (DICOM keyword, callable for conversion) | |
# Each table maps an XML attribute name to a pair
# (DICOM keyword, callable that converts the raw XML string).
# Several XML attributes may target the same DICOM keyword; the primary
# spelling is listed first, alternative spellings after it.
PATIENT_MAP = dict(
    PatientID=("PatientID", str),
    N_PatientID=("PatientID", str),  # fallback
    NormPatientID=("PatientID", str),  # another variant seen in patient element
    PatientName=("PatientName", str),
    N_PatientName=("PatientName", str),  # fallback (no ^)
    PatientBirthDate=("PatientBirthDate", str),
    PatientSex=("PatientSex", str),
    PatientAge=("PatientAge", str),
)

# Study level
STUDY_MAP = dict(
    StudyInstanceUID=("StudyInstanceUID", str),
    StudyDate=("StudyDate", str),
    StudyTime=("StudyTime", str),
    StudyID=("StudyID", str),
    AccessionNumber=("AccessionNumber", str),
    StudyDescription=("StudyDescription", str),
    ReferPhysician=("ReferringPhysicianName", str),
    N_ReferPhysician=("ReferringPhysicianName", str),
)

# Series level
SERIES_MAP = dict(
    SeriesInstanceUID=("SeriesInstanceUID", str),
    SeriesNumber=("SeriesNumber", int),
    SeriesDescription=("SeriesDescription", str),
    SeriesDate=("SeriesDate", str),
    SeriesTime=("SeriesTime", str),
    Modality=("Modality", str),
    Manufacturer=("Manufacturer", str),
    InstitutionName=("InstitutionName", str),
    PatientPosition=("PatientPosition", str),
)

# Image level
IMAGE_MAP = dict(
    SOPInstanceUID=("SOPInstanceUID", str),
    SOPClassUID=("SOPClassUID", str),
    ImageNumber=("InstanceNumber", int),
    ImageDate=("ContentDate", str),  # build_dataset substitutes "now" when blank
    ImageTime=("ContentTime", str),
    NRows=("Rows", int),
    NColumns=("Columns", int),
    ImagePositionPt=("ImagePositionPatient", str),
    ImageOrientPt=("ImageOrientationPatient", str),
    SliceLocation=("SliceLocation", float),
    PixelSpacing=("PixelSpacing", str),
    SliceThickness=("SliceThickness", float),
    KVP=("KVP", str),
    RepetitionTime=("RepetitionTime", str),
    EchoTime=("EchoTime", str),
    EchoNumbers=("EchoNumbers", str),
    WindowCenter=("WindowCenter", str),
    WindowWidth=("WindowWidth", str),
    BitsAlloc=("BitsAllocated", int),
    BitsStored=("BitsStored", int),
    PixelRep=("PixelRepresentation", int),
    PhotometricI=("PhotometricInterpretation", str),
    NumberOfFrames=("NumberOfFrames", int),
)
def _copy_mapped(attrs: dict, mapping: dict, ds: Dataset) -> None: | |
"""Apply *mapping* (xml-name → (dcm keyword, cast)) to dataset.""" | |
for xml_key, (dcm_kw, caster) in mapping.items(): | |
val = attrs.get(xml_key) | |
if val is None or val == "": | |
continue | |
try: | |
setattr(ds, dcm_kw, caster(val)) | |
except Exception: | |
# Ignore malformed value – continue with other attributes | |
continue | |
# --------------------------------------------------------------------- | |
# Dataset factory ------------------------------------------------------ | |
# --------------------------------------------------------------------- | |
def _person_name_from_normalised(norm_name: str) -> str: | |
"""Very naive *LASTFIRST* → *LAST^FIRST* conversion.""" | |
if "^" in norm_name: | |
return norm_name # already proper | |
# split the string in two halves (best guess) when no caret present | |
half = len(norm_name) // 2 | |
return f"{norm_name[:half]}^{norm_name[half:]}" | |
def build_dataset(
    patient: dict,
    study: dict,
    series: dict,
    image: dict,
) -> Dataset:
    """Merge attributes and return a populated pydicom Dataset.

    The four dictionaries hold the raw XML attributes of the patient, study,
    series and image elements.  They are copied through ``PATIENT_MAP``,
    ``STUDY_MAP``, ``SERIES_MAP`` and ``IMAGE_MAP`` in that order, after which
    missing identifiers and pixel-description attributes receive defaults:

    * Study/Series/SOP Instance UIDs are generated when absent,
    * SOP Class defaults to Secondary Capture Image Storage,
    * Content Date/Time default to the current local time,
    * Bits Allocated defaults to 8, Bits Stored to Bits Allocated, High Bit
      to Bits Stored - 1, Pixel Representation to 0 and Photometric
      Interpretation to MONOCHROME2,
    * Samples per Pixel is derived from the Photometric Interpretation
      (3 for RGB / YBR_* images, 1 otherwise),
    * Planar Configuration is set to 0 for multi-sample images.

    Pixel Data and file meta information are *not* added here.
    """
    # Photometric interpretations that carry three samples per pixel.
    colour_interpretations = {
        "RGB",
        "YBR_FULL",
        "YBR_FULL_422",
        "YBR_PARTIAL_422",
        "YBR_PARTIAL_420",
        "YBR_ICT",
        "YBR_RCT",
    }

    ds = Dataset()

    # Patient ---------------------------------------------------------
    _copy_mapped(patient, PATIENT_MAP, ds)
    # fallbacks / fixes for patient
    if "PatientName" not in ds:
        # try exact PatientName from study first
        pname = study.get("PatientName") or patient.get("PatientName")
        if pname:
            ds.PatientName = pname
        else:
            n_name = patient.get("N_PatientName") or study.get("N_PatientName")
            if n_name:
                ds.PatientName = _person_name_from_normalised(n_name)

    # Study -----------------------------------------------------------
    _copy_mapped(study, STUDY_MAP, ds)
    # If PatientID or BirthDate still missing, try study-level values
    if "PatientID" not in ds:
        for key in ("PatientID", "N_PatientID"):
            if study.get(key):
                ds.PatientID = study[key]
                break
    if "PatientBirthDate" not in ds and study.get("PatientBirthDate"):
        ds.PatientBirthDate = study["PatientBirthDate"]

    # Series ----------------------------------------------------------
    _copy_mapped(series, SERIES_MAP, ds)

    # Image -----------------------------------------------------------
    _copy_mapped(image, IMAGE_MAP, ds)

    # ----- Mandatory fallbacks --------------------------------------
    if "StudyInstanceUID" not in ds:
        ds.StudyInstanceUID = generate_uid()
    if "SeriesInstanceUID" not in ds:
        ds.SeriesInstanceUID = generate_uid()
    if "SOPInstanceUID" not in ds:
        ds.SOPInstanceUID = generate_uid()
    # SOP Class: if missing default to Secondary Capture
    if "SOPClassUID" not in ds:
        ds.SOPClassUID = "1.2.840.10008.5.1.4.1.1.7"  # Secondary Capture

    # ContentDate/Time fallback to now when blank --------------------
    now = datetime.now()
    if not getattr(ds, "ContentDate", None):
        ds.ContentDate = now.strftime("%Y%m%d")
    if not getattr(ds, "ContentTime", None):
        ds.ContentTime = now.strftime("%H%M%S")

    # Pixel module minimal defaults when absent ----------------------
    if "BitsAllocated" not in ds:
        ds.BitsAllocated = 8
    if "BitsStored" not in ds:
        ds.BitsStored = ds.BitsAllocated
    if "HighBit" not in ds:
        ds.HighBit = ds.BitsStored - 1
    if "PixelRepresentation" not in ds:
        ds.PixelRepresentation = 0
    if "PhotometricInterpretation" not in ds:
        ds.PhotometricInterpretation = "MONOCHROME2"
    if "SamplesPerPixel" not in ds:
        # Keep Samples per Pixel consistent with the photometric
        # interpretation instead of blindly assuming a grayscale image.
        photometric = str(ds.PhotometricInterpretation).strip().upper()
        ds.SamplesPerPixel = 3 if photometric in colour_interpretations else 1
    if "PlanarConfiguration" not in ds and ds.SamplesPerPixel > 1:
        ds.PlanarConfiguration = 0
    return ds
# --------------------------------------------------------------------- | |
# Main conversion loop ------------------------------------------------ | |
# --------------------------------------------------------------------- | |
# Signature box that opens every JP2 *container* file (ISO/IEC 15444-1 Annex I).
_JP2_SIGNATURE = b"\x00\x00\x00\x0cjP  \r\n\x87\n"


def _extract_j2k_codestream(data: bytes) -> bytes:
    """Return the bare JPEG 2000 codestream contained in *data*.

    The JPEG 2000 DICOM transfer syntax expects only the codestream in each
    Pixel Data fragment, without the JP2 file-format boxes.  When *data*
    starts with the JP2 signature box, the boxes are walked and the payload
    of the first ``jp2c`` (contiguous codestream) box is returned.  Anything
    else - a raw codestream, or a container without a usable ``jp2c`` box -
    is returned unchanged.
    """
    if not data.startswith(_JP2_SIGNATURE):
        return data

    pos, end = 0, len(data)
    while pos + 8 <= end:
        box_len = int.from_bytes(data[pos:pos + 4], "big")
        box_type = data[pos + 4:pos + 8]
        header_len = 8
        if box_len == 1:
            # Extended 64-bit length follows the box type.
            if pos + 16 > end:
                break
            box_len = int.from_bytes(data[pos + 8:pos + 16], "big")
            header_len = 16
        elif box_len == 0:
            # Box extends to the end of the file.
            box_len = end - pos
        if box_len < header_len:
            break  # corrupt box header - stop rather than loop forever
        if box_type == b"jp2c":
            return data[pos + header_len:min(pos + box_len, end)]
        pos += box_len
    return data


def convert_all(jp2_dir: str, xml_path: str, out_dir: str) -> None:
    """Convert every *.jp2 directly inside *jp2_dir* into a DICOM file.

    The XML at *xml_path* is parsed once.  Each ``.jp2`` file is matched to
    its image element by file stem; files without a matching element are
    skipped with a warning.  The compressed data is encapsulated as-is (no
    recompression; a JP2 container is reduced to its codestream) and written
    to ``<out_dir>/<stem>.dcm`` with the JPEG 2000 transfer syntax.
    *out_dir* is created when necessary.

    Exits the process via ``sys.exit`` when *jp2_dir* holds no ``.jp2`` file.
    """
    data = _parse_xml(xml_path)
    patient = data["patient"]
    study = data["study"]
    series_map = data["series_map"]
    image_map = data["image_map"]

    jp2_pathlist: List[Path] = sorted(Path(jp2_dir).glob("*.jp2"))
    if not jp2_pathlist:
        sys.exit(f"[error] No .jp2 files found in {jp2_dir}")
    os.makedirs(out_dir, exist_ok=True)

    for jp2_path in jp2_pathlist:
        stem = jp2_path.stem  # S799660_I25827012
        image_attrs = image_map.get(stem)
        if image_attrs is None:
            print(f"[warning] Skipping {stem}: not present in XML")
            continue
        series_uid = image_attrs.get("SeriesInstanceUID", "")
        series_attrs = series_map.get(series_uid, {})
        ds = build_dataset(patient, study, series_attrs, image_attrs)

        # Encapsulate JPEG-2000 codestream ---------------------------
        with open(jp2_path, "rb") as fp:
            ds.PixelData = encapsulate([_extract_j2k_codestream(fp.read())])

        ds.file_meta = FileMetaDataset()
        ds.file_meta.FileMetaInformationVersion = b"\x00\x01"
        ds.file_meta.TransferSyntaxUID = JPEG2000
        ds.file_meta.MediaStorageSOPClassUID = ds.SOPClassUID
        ds.file_meta.MediaStorageSOPInstanceUID = ds.SOPInstanceUID
        ds.file_meta.ImplementationClassUID = "1.2.826.0.1.3680043.10.511.1"
        ds.is_little_endian = True
        ds.is_implicit_VR = False  # explicit VR as required for encapsulated TS

        out_path = Path(out_dir) / f"{stem}.dcm"
        ds.save_as(out_path, write_like_original=False)
        print("✓", out_path)
# --------------------------------------------------------------------- | |
# CLI ----------------------------------------------------------------- | |
# --------------------------------------------------------------------- | |
def _cli() -> argparse.Namespace: | |
ap = argparse.ArgumentParser(description="Wrap Amicas JPEG-2000 files into DICOM") | |
ap.add_argument("--xml", required=True, help="Amicas metadata XML, e.g. 1487_273074.XML") | |
ap.add_argument("--in-dir", default=".", help="Directory containing *.jp2 files (default: .)") | |
ap.add_argument("--out-dir", default="dicom_out", help="Output directory (default: dicom_out)") | |
return ap.parse_args() | |
def main() -> None:
    """Script entry point: read the CLI options and run the conversion."""
    options = _cli()
    convert_all(
        jp2_dir=options.in_dir,
        xml_path=options.xml,
        out_dir=options.out_dir,
    )


# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment