Last active
July 16, 2024 23:31
-
-
Save ChocopieKewpie/693565b7a0773c3ce25004d7257a1d7e to your computer and use it in GitHub Desktop.
ISO19139 to Dublin Core (click)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Created on Wed Sep 6 15:28:29 2023 | |
@author: ArdoJ | |
""" | |
import click | |
from lxml import etree | |
def _contact_entries(names, organizations, emails, join_all=False):
    """Build the labelled ``Name=``/``Organization=``/``email=`` entries.

    Each argument is the list of strings returned by an XPath ``text()``
    query. A field with no match yields ``None`` instead of raising, so a
    party that only has, say, an organisation name is still reported.

    Args:
        names: Individual names found for the party.
        organizations: Organisation names found for the party.
        emails: E-mail addresses found for the party.
        join_all: When true, join every match with ", "; otherwise use only
            the first match.

    Returns:
        A three-item list of labelled strings, with ``None`` for each field
        that had no match.
    """
    def entry(label, values):
        if not values:
            return None
        return label + (', '.join(values) if join_all else values[0])

    return [
        entry("Name=", names),
        entry("Organization=", organizations),
        entry("email=", emails),
    ]


def _list_to_string(value):
    """Flatten a metadata value into the text of a single XML element.

    ``None`` becomes an empty string, a list becomes its items joined with
    ", " (missing items rendered as "N/A"), anything else is ``str(value)``.
    """
    if value is None:
        return ""
    if isinstance(value, list):
        return ', '.join(
            str(item) if item is not None and str(item) != "None" else "N/A"
            for item in value
        )
    return str(value)


@click.command()
@click.argument("f_input", required=True, type=click.Path(), nargs=1)
@click.argument("f_output", required=True, type=click.Path(), nargs=1)
def dcmi_19115(f_input, f_output):
    """Convert an ISO 19139 metadata record into a simple Dublin Core XML file.

    F_INPUT is the ISO 19139 XML document to read. F_OUTPUT is the output
    path without extension; ".xml" is appended to it.
    """
    import xml.etree.ElementTree as ET

    file = str(f_input)
    fileout = str(f_output)

    # Load the ISO 19139 XML document
    iso19139_xml = etree.parse(file)
    namespaces = {
        'gmd': 'http://www.isotc211.org/2005/gmd',
        'gco': 'http://www.isotc211.org/2005/gco',
        'gml': 'http://www.opengis.net/gml/3.2',
    }

    def find(path):
        """Run an XPath query against the document with the ISO namespaces."""
        return iso19139_xml.xpath(path, namespaces=namespaces)

    def party_fields(base):
        """Return (names, organisations, e-mails) below a CI_ResponsibleParty path."""
        return (
            find(base + '/gmd:individualName/gco:CharacterString/text()'),
            find(base + '/gmd:organisationName/gco:CharacterString/text()'),
            find(
                base
                + '/gmd:contactInfo/gmd:CI_Contact/gmd:address/gmd:CI_Address'
                '/gmd:electronicMailAddress/gco:CharacterString/text()'
            ),
        )

    # DCMI mapping from ISO 19139
    title = find('//gmd:title/gco:CharacterString/text()')

    # ---- Creator: the responsible party whose role text is "originator" ----
    creator = _contact_entries(
        *party_fields(
            '//gmd:CI_ResponsibleParty'
            '[gmd:role/gmd:CI_RoleCode/text()="originator"]'
        )
    )

    # ---- Publisher ----
    # THIS ONE NEEDS SOME FURTHER REFINEMENT: which role should the publisher
    # be mapped to? For now the "publisher" role is used, falling back to the
    # point of contact of the data identification.
    publisher = _contact_entries(
        *party_fields(
            '//gmd:CI_ResponsibleParty'
            '[gmd:role/gmd:CI_RoleCode/text()="publisher"]'
        )
    )
    # Every point-of-contact match is kept (joined with ", "), not just the first.
    poc = _contact_entries(
        *party_fields(
            '//gmd:MD_DataIdentification/gmd:pointOfContact/gmd:CI_ResponsibleParty'
        ),
        join_all=True,
    )

    subject = find('//gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gco:CharacterString/text()')
    description = find('//gmd:abstract/gco:CharacterString/text()')
    date_time_stamp = find('//gmd:dateStamp/gco:DateTime/text()')
    date_stamp = find('//gmd:dateStamp/gco:Date/text()')
    type_code = find(
        '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification'
        '/gmd:spatialRepresentationType/gmd:MD_SpatialRepresentationTypeCode/@codeListValue'
    )
    data_format = find(
        '//gmd:distributionInfo/gmd:MD_Distribution/gmd:distributionFormat'
        '/gmd:MD_Format/gmd:name/gco:CharacterString/text()'
    )
    identifier = find('//gmd:fileIdentifier/gco:CharacterString/text()')
    source = find(
        '/gmd:MD_Metadata/gmd:dataQualityInfo/gmd:DQ_DataQuality/gmd:lineage'
        '/gmd:LI_Lineage/gmd:statement/gco:CharacterString/text()'
    )
    lang_code = find('/gmd:MD_Metadata/gmd:language/gmd:LanguageCode/@codeListValue')
    lang = find('/gmd:MD_Metadata/gmd:language/gco:CharacterString/text()')

    bbox = '//gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox'
    west_bound_longitude = find(bbox + '/gmd:westBoundLongitude/gco:Decimal/text()')
    east_bound_longitude = find(bbox + '/gmd:eastBoundLongitude/gco:Decimal/text()')
    north_bound_latitude = find(bbox + '/gmd:northBoundLatitude/gco:Decimal/text()')
    south_bound_latitude = find(bbox + '/gmd:southBoundLatitude/gco:Decimal/text()')

    period = '//gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimePeriod'
    temporal_start = find(period + '/gml:beginPosition/text()')
    temporal_end = find(period + '/gml:endPosition/text()')

    # Only the first bounding box / time period found in the document is used.
    coverage_values = [
        "North " + north_bound_latitude[0] if north_bound_latitude else None,
        "South " + south_bound_latitude[0] if south_bound_latitude else None,
        "East " + east_bound_longitude[0] if east_bound_longitude else None,
        "West " + west_bound_longitude[0] if west_bound_longitude else None,
        "start= " + temporal_start[0] if temporal_start else 'start=N/A',
        "end=" + temporal_end[0] if temporal_end else 'end=N/A',
    ]
    rights = find(
        '//gmd:resourceConstraints/gmd:MD_LegalConstraints/gmd:useLimitation'
        '/gco:CharacterString/text()'
    )

    # DCMI dictionary
    dcmi_metadata = {
        'title': title,
        'creator': creator if any(item is not None for item in creator) else 'N/A',
        'subject': subject,
        'description': description,
        # Use the publisher, else fall back to the point of contact.
        'publisher': publisher if any(item is not None for item in publisher) else poc,
        'date': date_time_stamp[0] if date_time_stamp else date_stamp,
        'type': type_code[0] if type_code else None,
        'format': data_format if data_format else 'N/A',
        'identifier': identifier,
        'source': source,
        'language': lang_code if lang_code else lang,
        # TODO RELATION
        'coverage': coverage_values,
        # TODO LINEAGE
        'rights': rights,
    }

    # ---- XML creation ----
    # The prefixes are written literally as attribute/tag names; ElementTree
    # does not validate them, so "dc:..." tags serialise as-is.
    root = ET.Element(
        "simpledc",
        attrib={
            'xmlns:dc': 'http://purl.org/dc/elements/1.1/',
            'xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance',
        }
    )
    element_order = [
        'title',
        'creator',
        'subject',
        'description',
        'publisher',
        'date',
        'type',
        'format',
        'identifier',
        'source',
        'language',
        'coverage',
        'temporalCoverage',  # TODO rename this as coverage element too
        'rights',
    ]

    # Emit elements in the desired order; keys absent from dcmi_metadata
    # (currently 'temporalCoverage') are skipped.
    for key in element_order:
        if key not in dcmi_metadata:
            continue
        value = dcmi_metadata[key]
        if key == 'subject':
            # One <dc:subject> element per keyword.
            for subject_value in value:
                ET.SubElement(root, 'dc:subject').text = subject_value
        else:
            ET.SubElement(root, f"dc:{key}").text = _list_to_string(value)

    # Serialise and write the Dublin Core document.
    xml_string = ET.tostring(root, encoding='utf-8', method='xml')
    with open(f'{fileout}.xml', 'wb') as xml_file:
        xml_file.write(xml_string)
    print(f'Dublin Core generated @{fileout}.xml!!!')
# Run the click command when the file is executed as a script.
if __name__ == '__main__':
    dcmi_19115()
Modifications...
from lxml import etree
def iso19139_to_dublincore(iso_xml_path, dc_xml_path):
    """Convert an ISO 19139 XML record into an ``oai_dc`` Dublin Core file.

    For each Dublin Core element the first matching ISO 19139 node is used;
    elements with no match are omitted from the output.

    Args:
        iso_xml_path: Path of the ISO 19139 XML document to read.
        dc_xml_path: Path the Dublin Core XML document is written to.
    """
    # Parse the ISO 19139 XML
    iso_tree = etree.parse(iso_xml_path)
    iso_root = iso_tree.getroot()

    # Namespaces used by the ISO 19139 lookups
    nsmap = {
        'gmd': 'http://www.isotc211.org/2005/gmd',
        'gco': 'http://www.isotc211.org/2005/gco'
    }

    oai_dc_ns = 'http://www.openarchives.org/OAI/2.0/oai_dc/'
    dc_ns = 'http://purl.org/dc/elements/1.1/'

    # lxml rejects tag names containing ":"; namespaced tags must be given in
    # Clark notation ("{uri}local"), with the prefixes supplied through nsmap.
    dc_root = etree.Element(
        f'{{{oai_dc_ns}}}dc',
        nsmap={'oai_dc': oai_dc_ns, 'dc': dc_ns},
    )

    def add_dc_element(tag, text):
        """Append a ``dc:<tag>`` child holding ``text`` to the output root."""
        element = etree.SubElement(dc_root, f'{{{dc_ns}}}{tag}')
        element.text = text

    def node_text(node):
        """Return the node's text, or its ``codeListValue`` when the text is blank."""
        text = node.text
        if text is None or not text.strip():
            # Code-list elements may carry their value only as an attribute.
            return node.get('codeListValue', text)
        return text

    def map_first(tag, path):
        """Copy the first node matching ``path`` into ``dc:<tag>``, if any."""
        node = iso_root.find(path, namespaces=nsmap)
        if node is not None:
            add_dc_element(tag, node_text(node))

    aggregate_code = (
        'gmd:aggregationInfo/gmd:MD_AggregateInformation'
        '/gmd:aggregateDataSetIdentifier/gmd:MD_Identifier/gmd:code/gco:CharacterString'
    )

    # Map ISO 19139 elements to Dublin Core, in output order.
    # NOTE(review): creator and contributor use the same path, so both receive
    # the first individual name in the document regardless of role - confirm
    # whether a role filter is wanted.
    simple_mappings = [
        ('title', './/gmd:title/gco:CharacterString'),
        ('creator', './/gmd:CI_ResponsibleParty/gmd:individualName/gco:CharacterString'),
        ('subject', './/gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gco:CharacterString'),
        ('description', './/gmd:abstract/gco:CharacterString'),
        ('publisher', './/gmd:CI_ResponsibleParty/gmd:organisationName/gco:CharacterString'),
        ('contributor', './/gmd:CI_ResponsibleParty/gmd:individualName/gco:CharacterString'),
        ('date', './/gmd:date/gco:Date'),
        ('type', './/gmd:MD_ScopeCode'),
        ('format', './/gmd:MD_Format/gmd:name/gco:CharacterString'),
        ('identifier', './/gmd:fileIdentifier/gco:CharacterString'),
        ('source', './/gmd:MD_Identification/' + aggregate_code),
        ('language', './/gmd:language/gco:CharacterString'),
        ('relation', './/' + aggregate_code),
    ]
    for dc_tag, iso_path in simple_mappings:
        map_first(dc_tag, iso_path)

    coverage = iso_root.find(
        './/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox',
        namespaces=nsmap,
    )
    if coverage is not None:
        def bound(name):
            """Return the text of one bounding-box edge, or "N/A" when absent."""
            node = coverage.find(f'.//gmd:{name}/gco:Decimal', namespaces=nsmap)
            if node is None or node.text is None:
                return 'N/A'
            return node.text

        west = bound('westBoundLongitude')
        east = bound('eastBoundLongitude')
        north = bound('northBoundLatitude')
        south = bound('southBoundLatitude')
        add_dc_element('coverage', f"West: {west}, East: {east}, North: {north}, South: {south}")

    map_first(
        'rights',
        './/gmd:resourceConstraints/gmd:MD_LegalConstraints/gmd:useConstraints/gmd:MD_RestrictionCode',
    )

    # Write the Dublin Core XML to a file
    dc_tree = etree.ElementTree(dc_root)
    dc_tree.write(dc_xml_path, pretty_print=True, xml_declaration=True, encoding='UTF-8')
# Example usage: only runs when the file is executed as a script, so importing
# the module does not trigger a conversion with these hard-coded paths.
if __name__ == '__main__':
    iso_xml_path = 'iso19139.xml'
    dc_xml_path = 'dublincore.xml'
    iso19139_to_dublincore(iso_xml_path, dc_xml_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Messing around with metadata translation from ISO 19139 to Dublin Core. Configured using click, so it should work as a CLI tool.