Skip to content

Instantly share code, notes, and snippets.

@ChocopieKewpie
Last active July 16, 2024 23:31
Show Gist options
  • Save ChocopieKewpie/693565b7a0773c3ce25004d7257a1d7e to your computer and use it in GitHub Desktop.
Save ChocopieKewpie/693565b7a0773c3ce25004d7257a1d7e to your computer and use it in GitHub Desktop.
ISO19139 to Dublin Core (click)
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 6 15:28:29 2023
@author: ArdoJ
"""
import click
from lxml import etree
def _first_or_none(values):
    """Return the first XPath hit, or None when nothing matched."""
    return values[0] if values else None


def _joined_or_none(values):
    """Return all XPath hits joined with ', ', or None when nothing matched."""
    return ', '.join(values) if values else None


def _labelled_party(name, organization, email):
    """Build the ["Name=...", "Organization=...", "email=..."] triple.

    Each slot is decided independently: a missing (None) value yields None
    in that slot instead of raising, so a party that only has, say, an
    organisation name is still usable.
    """
    return [
        f"Name={name}" if name is not None else None,
        f"Organization={organization}" if organization is not None else None,
        f"email={email}" if email is not None else None,
    ]


def list_to_string(value):
    """Flatten a metadata value into the text of a single XML element.

    None becomes "", a list is joined with ', ' (None / "None" items are
    rendered as "N/A"), anything else is passed through str().
    """
    if value is None:
        return ""
    if isinstance(value, list):
        return ', '.join(
            str(item) if item is not None and str(item) != "None" else "N/A"
            for item in value
        )
    return str(value)


@click.command()
@click.argument("f_input", required=True, type=click.Path(), nargs=1)
@click.argument("f_output", required=True, type=click.Path(), nargs=1)
def dcmi_19115(f_input, f_output):
    """Convert an ISO 19139 XML record into a simple Dublin Core XML file.

    F_INPUT is the path of the ISO 19139 document to read.
    F_OUTPUT is the output path *without* extension; ".xml" is appended.
    """
    # Local import kept from the original script: the output document is
    # built with the standard-library ElementTree, not lxml.
    import xml.etree.ElementTree as ET

    file = str(f_input)
    fileout = str(f_output)

    # Load the ISO 19139 XML document
    iso19139_xml = etree.parse(file)
    namespaces = {
        'gmd': 'http://www.isotc211.org/2005/gmd',
        'gco': 'http://www.isotc211.org/2005/gco',
        'gml': 'http://www.opengis.net/gml/3.2',
    }

    def xpath(expression):
        """Run an XPath query against the input with the ISO namespaces."""
        return iso19139_xml.xpath(expression, namespaces=namespaces)

    # Sub-paths shared by every responsible-party lookup below.
    name_path = 'gmd:individualName/gco:CharacterString/text()'
    org_path = 'gmd:organisationName/gco:CharacterString/text()'
    email_path = (
        'gmd:contactInfo/gmd:CI_Contact/gmd:address/gmd:CI_Address/'
        'gmd:electronicMailAddress/gco:CharacterString/text()'
    )

    def party(base, pick):
        """Collect name/organisation/email under *base*, reduced by *pick*."""
        return _labelled_party(
            pick(xpath(f'{base}/{name_path}')),
            pick(xpath(f'{base}/{org_path}')),
            pick(xpath(f'{base}/{email_path}')),
        )

    # DCMI mapping from ISO 19139
    title = xpath('//gmd:title/gco:CharacterString/text()')

    # CREATOR: the responsible party whose role code text is "originator".
    # Fix: each field is checked on its own, so a partially filled party no
    # longer raises KeyError.
    creator = party(
        '//gmd:CI_ResponsibleParty[gmd:role/gmd:CI_RoleCode/text()="originator"]',
        _first_or_none,
    )

    # PUBLISHER: needs further refinement - which role should the publisher
    # be mapped to?  For now: role "publisher", falling back to the point of
    # contact of the data identification.
    publisher = party(
        '//gmd:CI_ResponsibleParty[gmd:role/gmd:CI_RoleCode/text()="publisher"]',
        _first_or_none,
    )
    # Fix: missing point-of-contact fields used to evaluate "Name=" + None
    # (TypeError) even when a publisher was present; they are now None.
    poc = party(
        '//gmd:MD_DataIdentification/gmd:pointOfContact/gmd:CI_ResponsibleParty',
        _joined_or_none,
    )

    subject = xpath('//gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gco:CharacterString/text()')
    description = xpath('//gmd:abstract/gco:CharacterString/text()')
    datetime = xpath('//gmd:dateStamp/gco:DateTime/text()')
    date = xpath('//gmd:dateStamp/gco:Date/text()')
    type_code = xpath('/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:spatialRepresentationType/gmd:MD_SpatialRepresentationTypeCode/@codeListValue')
    Format = xpath('//gmd:distributionInfo/gmd:MD_Distribution/gmd:distributionFormat/gmd:MD_Format/gmd:name/gco:CharacterString/text()')
    identifier = xpath('//gmd:fileIdentifier/gco:CharacterString/text()')
    source = xpath('/gmd:MD_Metadata/gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:lineage/gmd:LI_Lineage/gmd:statement/gco:CharacterString/text()')
    lang_code = xpath('/gmd:MD_Metadata/gmd:language/gmd:LanguageCode/@codeListValue')
    lang = xpath('/gmd:MD_Metadata/gmd:language/gco:CharacterString/text()')

    bounding_box = '//gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'
    west_bound_longitude = xpath(f'{bounding_box}/gmd:westBoundLongitude/gco:Decimal/text()')
    east_bound_longitude = xpath(f'{bounding_box}/gmd:eastBoundLongitude/gco:Decimal/text()')
    north_bound_latitude = xpath(f'{bounding_box}/gmd:northBoundLatitude/gco:Decimal/text()')
    south_bound_latitude = xpath(f'{bounding_box}/gmd:southBoundLatitude/gco:Decimal/text()')

    time_period = '//gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimePeriod'
    temporal_start = xpath(f'{time_period}/gml:beginPosition/text()')
    temporal_end = xpath(f'{time_period}/gml:endPosition/text()')

    # Missing spatial bounds stay None (rendered as "N/A" by list_to_string);
    # missing temporal bounds get an explicit "...=N/A" label.
    coverage_values = [
        "North " + north_bound_latitude[0] if north_bound_latitude else None,
        "South " + south_bound_latitude[0] if south_bound_latitude else None,
        "East " + east_bound_longitude[0] if east_bound_longitude else None,
        "West " + west_bound_longitude[0] if west_bound_longitude else None,
        "start= " + temporal_start[0] if temporal_start else 'start=N/A',
        "end=" + temporal_end[0] if temporal_end else 'end=N/A',
    ]
    rights = xpath('//gmd:resourceConstraints/gmd:MD_LegalConstraints/gmd:useLimitation/gco:CharacterString/text()')

    # DCMI dictionary
    dcmi_metadata = {
        'title': title,
        'creator': creator if any(item is not None for item in creator) else 'N/A',
        'subject': subject,
        'description': description,
        # get publisher, else fall back to point of contact
        'publisher': publisher if any(item is not None for item in publisher) else poc,
        'date': datetime[0] if datetime else date,
        'type': type_code[0] if type_code else None,
        'format': Format if Format else 'N/A',
        'identifier': identifier,
        'source': source,
        'language': lang_code if lang_code else lang,
        # TODO RELATION
        'coverage': coverage_values,
        # TODO LINEAGE
        'rights': rights,
    }

    # XML creation begins.  The namespace declarations are written as plain
    # attributes and the children use literal "dc:" prefixed tag names.
    root = ET.Element(
        "simpledc",
        attrib={
            'xmlns:dc': 'http://purl.org/dc/elements/1.1/',
            'xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
        }
    )
    element_order = [
        'title',
        'creator',
        'subject',
        'description',
        'publisher',
        'date',
        'type',
        'format',
        'identifier',
        'source',
        'language',
        'coverage',
        'temporalCoverage',  # TODO rename this as coverage element too
        'rights',
    ]

    # Create elements based on the dcmi_metadata dictionary in the desired
    # order; keys absent from the dictionary are skipped.
    for key in element_order:
        if key == 'subject':
            # One <dc:subject> element per keyword
            for subject_value in dcmi_metadata['subject']:
                subject_element = ET.SubElement(root, 'dc:subject')
                subject_element.text = subject_value
        elif key in dcmi_metadata:
            value = dcmi_metadata[key]
            if isinstance(value, dict):
                # Handle nested dictionaries, DCMI shouldn't have these?
                sub_element = ET.SubElement(root, f"dc:{key}")
                for sub_key, sub_value in value.items():
                    sub_sub_element = ET.SubElement(sub_element, f"dc:{sub_key}")
                    sub_sub_element.text = list_to_string(sub_value)
            else:
                element = ET.SubElement(root, f"dc:{key}")
                element.text = list_to_string(value)

    # Create an XML string from the root element
    xml_string = ET.tostring(root, encoding='utf-8', method='xml')

    # Write the XML string to a new file
    with open(f'{fileout}.xml', 'wb') as xml_file:
        xml_file.write(xml_string)
    print(f'Dublin Core generated @{fileout}.xml!!!')


if __name__ == '__main__':
    dcmi_19115()
@ChocopieKewpie
Copy link
Author

Messing around with translating metadata from ISO 19139 to Dublin Core. Configured using click, so it should work as a CLI tool.

@ChocopieKewpie
Copy link
Author

Modifications...

from lxml import etree

def iso19139_to_dublincore(iso_xml_path, dc_xml_path):
    """Convert an ISO 19139 XML record to an oai_dc Dublin Core document.

    Args:
        iso_xml_path: Path of the ISO 19139 XML file to parse.
        dc_xml_path: Path the Dublin Core XML is written to.

    Only the first match of each ISO element is mapped.  ISO elements that
    are not found produce no Dublin Core element.
    """
    # Parse the ISO 19139 XML
    iso_tree = etree.parse(iso_xml_path)
    iso_root = iso_tree.getroot()

    # Define namespaces
    nsmap = {
        'gmd': 'http://www.isotc211.org/2005/gmd',
        'gco': 'http://www.isotc211.org/2005/gco'
    }
    oai_dc_ns = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
    dc_ns = 'http://purl.org/dc/elements/1.1/'

    # Create the root element for Dublin Core.  lxml rejects "prefix:name"
    # tag names with ValueError, so the tags are given in Clark notation
    # ({namespace-uri}localname); the nsmap supplies the prefixes used when
    # the tree is serialised.
    dc_root = etree.Element(
        f'{{{oai_dc_ns}}}dc',
        nsmap={'oai_dc': oai_dc_ns, 'dc': dc_ns},
    )

    def add_dc_element(tag, text):
        """Append <dc:tag>text</dc:tag> to the Dublin Core root."""
        element = etree.SubElement(dc_root, f'{{{dc_ns}}}{tag}')
        element.text = text

    def code_value(element):
        """Return a code element's text, or its codeListValue if no text."""
        text = element.text
        if text is not None and text.strip():
            return text
        return element.get('codeListValue')

    party = './/gmd:CI_ResponsibleParty'
    aggregate_code = (
        'gmd:aggregationInfo/gmd:MD_AggregateInformation/'
        'gmd:aggregateDataSetIdentifier/gmd:MD_Identifier/gmd:code/'
        'gco:CharacterString'
    )

    # (Dublin Core tag, ISO 19139 path, value extractor), in output order.
    # NOTE(review): creator and contributor read the same ISO path, so both
    # receive the same value - confirm that this is intended.
    simple_mappings = [
        ('title', './/gmd:title/gco:CharacterString', None),
        ('creator', f'{party}/gmd:individualName/gco:CharacterString', None),
        ('subject', './/gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gco:CharacterString', None),
        ('description', './/gmd:abstract/gco:CharacterString', None),
        ('publisher', f'{party}/gmd:organisationName/gco:CharacterString', None),
        ('contributor', f'{party}/gmd:individualName/gco:CharacterString', None),
        ('date', './/gmd:date/gco:Date', None),
        ('type', './/gmd:MD_ScopeCode', code_value),
        ('format', './/gmd:MD_Format/gmd:name/gco:CharacterString', None),
        ('identifier', './/gmd:fileIdentifier/gco:CharacterString', None),
        ('source', f'.//gmd:MD_Identification/{aggregate_code}', None),
        ('language', './/gmd:language/gco:CharacterString', None),
        ('relation', f'.//{aggregate_code}', None),
    ]

    # Map ISO 19139 elements to Dublin Core
    for tag, path, extract in simple_mappings:
        found = iso_root.find(path, namespaces=nsmap)
        if found is not None:
            add_dc_element(tag, found.text if extract is None else extract(found))

    coverage = iso_root.find(
        './/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox',
        namespaces=nsmap,
    )
    if coverage is not None:
        # findtext with a default keeps an incomplete bounding box from
        # raising AttributeError on a missing bound.
        west = coverage.findtext('.//gmd:westBoundLongitude/gco:Decimal', default='N/A', namespaces=nsmap)
        east = coverage.findtext('.//gmd:eastBoundLongitude/gco:Decimal', default='N/A', namespaces=nsmap)
        north = coverage.findtext('.//gmd:northBoundLatitude/gco:Decimal', default='N/A', namespaces=nsmap)
        south = coverage.findtext('.//gmd:southBoundLatitude/gco:Decimal', default='N/A', namespaces=nsmap)
        add_dc_element('coverage', f"West: {west}, East: {east}, North: {north}, South: {south}")

    rights = iso_root.find(
        './/gmd:resourceConstraints/gmd:MD_LegalConstraints/gmd:useConstraints/gmd:MD_RestrictionCode',
        namespaces=nsmap,
    )
    if rights is not None:
        add_dc_element('rights', code_value(rights))

    # Write the Dublin Core XML to a file
    dc_tree = etree.ElementTree(dc_root)
    dc_tree.write(dc_xml_path, pretty_print=True, xml_declaration=True, encoding='UTF-8')

# Example usage
# NOTE: these statements sit at module level, so the conversion runs as soon
# as this file is executed or imported; both paths are relative file names.
iso_xml_path = 'iso19139.xml'
dc_xml_path = 'dublincore.xml'
iso19139_to_dublincore(iso_xml_path, dc_xml_path)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment