Skip to content

Instantly share code, notes, and snippets.

@dmd
Created April 25, 2025 23:17
Show Gist options
  • Save dmd/85105e5a6e90381267f96aa341f61eb3 to your computer and use it in GitHub Desktop.
Save dmd/85105e5a6e90381267f96aa341f61eb3 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""Convert Amicas-style JPEG-2000 images (*.jp2) into fully tagged
DICOM Part-10 files.

The companion XML contains one *Demographic_AmicasImage* element per
image as well as study-, series- and patient-level elements. The
converter

1. parses the XML once,
2. associates every *.jp2 file with the *correct* AmicasImage element,
3. merges patient / study / series / image attributes,
4. wraps the JPEG-2000 codestream into Pixel Data (no recompression),
5. writes the resulting DICOM file.

Example
-------
    python jp2_to_dicom.py \\
        --xml 1487_273074.XML \\
        --in-dir 1487_273074 \\
        --out-dir dicom_out

Dependencies (install once)
---------------------------
    pip install pydicom pylibjpeg pylibjpeg-openjpeg

`pylibjpeg` is required only when you also want to *read* the output
DICOMs; the converter itself never decompresses the pixel data.
"""
from __future__ import annotations
import argparse
import os
import sys
import xml.etree.ElementTree as ET
from datetime import datetime
from pathlib import Path
from typing import Dict, List
import pydicom
from pydicom.dataset import Dataset, FileMetaDataset
from pydicom.encaps import encapsulate
from pydicom.uid import JPEG2000, ExplicitVRLittleEndian, generate_uid
# ---------------------------------------------------------------------
# XML → plain python ---------------------------------------------------
# ---------------------------------------------------------------------
def _parse_xml(xml_path: str) -> Dict[str, dict]:
"""Return four dictionaries: patient, study, series_map, image_map.
*series_map*: key = SeriesInstanceUID
*image_map*: key = jp2 file stem (e.g. 'S799660_I25827012')
"""
root = ET.parse(xml_path).getroot()
# ---------------- Patient + Study (singletons) -------------------
patient_elem = root.find("Demographic_AmicasPatient")
study_elem = root.find("Demographic_AmicasStudy")
patient: dict = patient_elem.attrib if patient_elem is not None else {}
study: dict = study_elem.attrib if study_elem is not None else {}
# ---------------- Series (zero-to-many) ---------------------------
series_map: Dict[str, dict] = {}
for selem in root.findall("Demographic_AmicasSeries"):
uid = selem.attrib.get("SeriesInstanceUID")
if uid:
series_map[uid] = selem.attrib
# ---------------- Image (zero-to-many) ----------------------------
image_map: Dict[str, dict] = {}
for ielem in root.findall("Demographic_AmicasImage"):
obj_file = ielem.attrib.get("ObjectFile", "")
# Path may contain *backslashes* even on POSIX; we only want the
# filename stem without directory and extension.
if obj_file:
filename = os.path.basename(obj_file.replace("\\", "/"))
stem = os.path.splitext(filename)[0]
else:
stem = ""
if not stem:
# fallback: build from AmicasImageID, will still match
stem = f"{ielem.attrib.get('AmicasImageID', '')}"
image_map[stem] = ielem.attrib
return {
"patient": patient,
"study": study,
"series_map": series_map,
"image_map": image_map,
}
# ---------------------------------------------------------------------
# Mapping helpers ------------------------------------------------------
# ---------------------------------------------------------------------
# Patient level. Each row below is (XML attribute, DICOM keyword, cast);
# the resulting dict maps XML attribute -> (DICOM keyword, cast) and keeps
# the rows in the order listed. Several XML attributes feed the same keyword.
PATIENT_MAP = {
    xml_attr: (keyword, cast)
    for xml_attr, keyword, cast in (
        ("PatientID", "PatientID", str),
        ("N_PatientID", "PatientID", str),  # alternate source for PatientID
        ("NormPatientID", "PatientID", str),  # another variant seen in patient element
        ("PatientName", "PatientName", str),
        ("N_PatientName", "PatientName", str),  # alternate source (no ^)
        ("PatientBirthDate", "PatientBirthDate", str),
        ("PatientSex", "PatientSex", str),
        ("PatientAge", "PatientAge", str),
    )
}
# Study level. Rows are (XML attribute, DICOM keyword, cast), stored in the
# order listed as XML attribute -> (DICOM keyword, cast).
STUDY_MAP = {
    xml_attr: (keyword, cast)
    for xml_attr, keyword, cast in (
        ("StudyInstanceUID", "StudyInstanceUID", str),
        ("StudyDate", "StudyDate", str),
        ("StudyTime", "StudyTime", str),
        ("StudyID", "StudyID", str),
        ("AccessionNumber", "AccessionNumber", str),
        ("StudyDescription", "StudyDescription", str),
        # Both referring-physician attributes target the same keyword.
        ("ReferPhysician", "ReferringPhysicianName", str),
        ("N_ReferPhysician", "ReferringPhysicianName", str),
    )
}
# Series level. Rows are (XML attribute, DICOM keyword, cast), stored in the
# order listed as XML attribute -> (DICOM keyword, cast).
SERIES_MAP = {
    xml_attr: (keyword, cast)
    for xml_attr, keyword, cast in (
        ("SeriesInstanceUID", "SeriesInstanceUID", str),
        ("SeriesNumber", "SeriesNumber", int),
        ("SeriesDescription", "SeriesDescription", str),
        ("SeriesDate", "SeriesDate", str),
        ("SeriesTime", "SeriesTime", str),
        ("Modality", "Modality", str),
        ("Manufacturer", "Manufacturer", str),
        ("InstitutionName", "InstitutionName", str),
        ("PatientPosition", "PatientPosition", str),
    )
}
# Image level. Rows are (XML attribute, DICOM keyword, cast), stored in the
# order listed as XML attribute -> (DICOM keyword, cast). The XML uses
# abbreviated names for several attributes (NRows, BitsAlloc, PixelRep, ...).
IMAGE_MAP = {
    xml_attr: (keyword, cast)
    for xml_attr, keyword, cast in (
        ("SOPInstanceUID", "SOPInstanceUID", str),
        ("SOPClassUID", "SOPClassUID", str),
        ("ImageNumber", "InstanceNumber", int),
        ("ImageDate", "ContentDate", str),
        ("ImageTime", "ContentTime", str),
        ("NRows", "Rows", int),
        ("NColumns", "Columns", int),
        ("ImagePositionPt", "ImagePositionPatient", str),
        ("ImageOrientPt", "ImageOrientationPatient", str),
        ("SliceLocation", "SliceLocation", float),
        ("PixelSpacing", "PixelSpacing", str),
        ("SliceThickness", "SliceThickness", float),
        ("KVP", "KVP", str),
        ("RepetitionTime", "RepetitionTime", str),
        ("EchoTime", "EchoTime", str),
        ("EchoNumbers", "EchoNumbers", str),
        ("WindowCenter", "WindowCenter", str),
        ("WindowWidth", "WindowWidth", str),
        ("BitsAlloc", "BitsAllocated", int),
        ("BitsStored", "BitsStored", int),
        ("PixelRep", "PixelRepresentation", int),
        ("PhotometricI", "PhotometricInterpretation", str),
        ("NumberOfFrames", "NumberOfFrames", int),
    )
}
def _copy_mapped(attrs: dict, mapping: dict, ds: Dataset) -> None:
"""Apply *mapping* (xml-name → (dcm keyword, cast)) to dataset."""
for xml_key, (dcm_kw, caster) in mapping.items():
val = attrs.get(xml_key)
if val is None or val == "":
continue
try:
setattr(ds, dcm_kw, caster(val))
except Exception:
# Ignore malformed value – continue with other attributes
continue
# ---------------------------------------------------------------------
# Dataset factory ------------------------------------------------------
# ---------------------------------------------------------------------
def _person_name_from_normalised(norm_name: str) -> str:
"""Very naive *LASTFIRST* → *LAST^FIRST* conversion."""
if "^" in norm_name:
return norm_name # already proper
# split the string in two halves (best guess) when no caret present
half = len(norm_name) // 2
return f"{norm_name[:half]}^{norm_name[half:]}"
def build_dataset(
    patient: dict,
    study: dict,
    series: dict,
    image: dict,
) -> Dataset:
    """Merge patient, study, series and image attributes into one Dataset.

    Each argument is the attribute dictionary of the corresponding XML
    element. Attributes are copied through the module's mapping tables;
    afterwards defaults are filled in for elements that are still missing:
    generated UIDs, Secondary Capture as SOP Class, the current date/time
    as ContentDate/ContentTime, and a minimal set of pixel-module values.

    Samples per Pixel is derived from Photometric Interpretation (3 for
    RGB and YBR_* images, 1 otherwise) because it is not read from the XML.

    Returns the populated dataset; Pixel Data and file meta information
    are not set here.
    """
    ds = Dataset()

    # Patient ---------------------------------------------------------
    _copy_mapped(patient, PATIENT_MAP, ds)
    if "PatientName" not in ds:
        # Prefer an exact PatientName (study element first); only then
        # fall back to a normalised name, which needs a caret inserted.
        pname = study.get("PatientName") or patient.get("PatientName")
        if pname:
            ds.PatientName = pname
        else:
            n_name = patient.get("N_PatientName") or study.get("N_PatientName")
            if n_name:
                ds.PatientName = _person_name_from_normalised(n_name)

    # Study -----------------------------------------------------------
    _copy_mapped(study, STUDY_MAP, ds)
    # The study element may repeat patient identifiers; use them when the
    # patient element did not provide any.
    if "PatientID" not in ds:
        for key in ("PatientID", "N_PatientID"):
            if study.get(key):
                ds.PatientID = study[key]
                break
    if "PatientBirthDate" not in ds and study.get("PatientBirthDate"):
        ds.PatientBirthDate = study["PatientBirthDate"]

    # Series / Image --------------------------------------------------
    _copy_mapped(series, SERIES_MAP, ds)
    _copy_mapped(image, IMAGE_MAP, ds)

    # Mandatory identifiers -------------------------------------------
    if "StudyInstanceUID" not in ds:
        ds.StudyInstanceUID = generate_uid()
    if "SeriesInstanceUID" not in ds:
        ds.SeriesInstanceUID = generate_uid()
    if "SOPInstanceUID" not in ds:
        ds.SOPInstanceUID = generate_uid()
    if "SOPClassUID" not in ds:
        ds.SOPClassUID = "1.2.840.10008.5.1.4.1.1.7"  # Secondary Capture

    # ContentDate/Time fall back to "now" when absent or blank --------
    now = datetime.now()
    if not getattr(ds, "ContentDate", None):
        ds.ContentDate = now.strftime("%Y%m%d")
    if not getattr(ds, "ContentTime", None):
        ds.ContentTime = now.strftime("%H%M%S")

    # Pixel module minimal defaults when absent -----------------------
    if "BitsAllocated" not in ds:
        ds.BitsAllocated = 8
    if "BitsStored" not in ds:
        ds.BitsStored = ds.BitsAllocated
    if "HighBit" not in ds:
        ds.HighBit = ds.BitsStored - 1
    if "PixelRepresentation" not in ds:
        ds.PixelRepresentation = 0
    if "PhotometricInterpretation" not in ds:
        ds.PhotometricInterpretation = "MONOCHROME2"
    if "SamplesPerPixel" not in ds:
        # Colour photometric interpretations carry three samples per
        # pixel; monochrome and palette images carry one.
        photometric = str(ds.PhotometricInterpretation).strip().upper()
        is_colour = photometric == "RGB" or photometric.startswith("YBR")
        ds.SamplesPerPixel = 3 if is_colour else 1
    if "PlanarConfiguration" not in ds and ds.SamplesPerPixel > 1:
        ds.PlanarConfiguration = 0
    return ds
# ---------------------------------------------------------------------
# Main conversion loop ------------------------------------------------
# ---------------------------------------------------------------------
def _extract_j2k_codestream(data: bytes) -> bytes:
    """Return the bare JPEG-2000 codestream contained in *data*.

    DICOM's JPEG 2000 transfer syntaxes carry the codestream only, without
    the JP2 file-format boxes. When *data* starts with the JP2 signature
    box, the boxes are walked and the payload of the first ``jp2c``
    (contiguous codestream) box is returned. Anything else, including a
    raw codestream or a JP2 file without a ``jp2c`` box, is returned
    unchanged.
    """
    jp2_signature = b"\x00\x00\x00\x0cjP  \r\n\x87\n"
    if not data.startswith(jp2_signature):
        return data
    end = len(data)
    pos = 0
    while pos + 8 <= end:
        box_len = int.from_bytes(data[pos:pos + 4], "big")
        box_type = data[pos + 4:pos + 8]
        header_len = 8
        if box_len == 1:
            # Extended form: the real length follows as a 64-bit integer.
            if pos + 16 > end:
                break
            box_len = int.from_bytes(data[pos + 8:pos + 16], "big")
            header_len = 16
        elif box_len == 0:
            # A zero length means "this box runs to the end of the file".
            box_len = end - pos
        if box_len < header_len:
            break  # malformed box; stop rather than loop forever
        if box_type == b"jp2c":
            return data[pos + header_len:pos + box_len]
        pos += box_len
    return data


def convert_all(jp2_dir: str, xml_path: str, out_dir: str) -> None:
    """Convert every *.jp2 directly inside *jp2_dir* into a DICOM file.

    The XML at *xml_path* is parsed once. Each jp2 file is matched to its
    image element by file stem; files without a match are skipped with a
    warning. The output is written to *out_dir* (created when missing) as
    ``<stem>.dcm`` using the JPEG 2000 transfer syntax. The image data is
    not recompressed; a JP2 file-format wrapper, if present, is removed so
    that only the codestream is stored in Pixel Data.

    Exits the process with an error message when *jp2_dir* contains no
    .jp2 files.
    """
    data = _parse_xml(xml_path)
    patient = data["patient"]
    study = data["study"]
    series_map = data["series_map"]
    image_map = data["image_map"]

    jp2_pathlist: List[Path] = sorted(Path(jp2_dir).glob("*.jp2"))
    if not jp2_pathlist:
        sys.exit(f"[error] No .jp2 files found in {jp2_dir}")
    os.makedirs(out_dir, exist_ok=True)

    for jp2_path in jp2_pathlist:
        stem = jp2_path.stem  # S799660_I25827012
        image_attrs = image_map.get(stem)
        if image_attrs is None:
            print(f"[warning] Skipping {stem}: not present in XML")
            continue
        # An image whose series is unknown still converts, just without
        # series-level attributes.
        series_uid = image_attrs.get("SeriesInstanceUID", "")
        series_attrs = series_map.get(series_uid, {})
        ds = build_dataset(patient, study, series_attrs, image_attrs)

        # Encapsulate the JPEG-2000 codestream as a single fragment.
        codestream = _extract_j2k_codestream(jp2_path.read_bytes())
        ds.PixelData = encapsulate([codestream])

        ds.file_meta = FileMetaDataset()
        ds.file_meta.FileMetaInformationVersion = b"\x00\x01"
        ds.file_meta.TransferSyntaxUID = JPEG2000
        ds.file_meta.MediaStorageSOPClassUID = ds.SOPClassUID
        ds.file_meta.MediaStorageSOPInstanceUID = ds.SOPInstanceUID
        ds.file_meta.ImplementationClassUID = "1.2.826.0.1.3680043.10.511.1"
        ds.is_little_endian = True
        ds.is_implicit_VR = False  # explicit VR as required for encapsulated TS

        out_path = Path(out_dir) / f"{stem}.dcm"
        ds.save_as(out_path, write_like_original=False)
        print("✓", out_path)
# ---------------------------------------------------------------------
# CLI -----------------------------------------------------------------
# ---------------------------------------------------------------------
def _cli() -> argparse.Namespace:
ap = argparse.ArgumentParser(description="Wrap Amicas JPEG-2000 files into DICOM")
ap.add_argument("--xml", required=True, help="Amicas metadata XML, e.g. 1487_273074.XML")
ap.add_argument("--in-dir", default=".", help="Directory containing *.jp2 files (default: .)")
ap.add_argument("--out-dir", default="dicom_out", help="Output directory (default: dicom_out)")
return ap.parse_args()
def main() -> None:
    """Script entry point: read the command-line options and run the conversion."""
    options = _cli()
    convert_all(
        jp2_dir=options.in_dir,
        xml_path=options.xml,
        out_dir=options.out_dir,
    )


# Run the converter only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment