Created
February 7, 2013 17:53
-
-
Save saverkamp/4732757 to your computer and use it in GitHub Desktop.
Sample script to harvest metadata through the CONTENTdm v6 API and format it as CSV for upload into the ui-libraries fork of Omeka/Scripto. See ui-libraries/plugin-Scripto for documentation. Uses pycdm, a Python library for working with the CONTENTdm v6 API (saverkamp/pycdm).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Harvest item- and page-level metadata for selected CONTENTdm items and
# write two CSV files (one row per page/file, one row per item).
#
# Written for Python 2 (raw_input, HTMLParser module).
import codecs
import csv
import datetime
import pycdm
from HTMLParser import HTMLParser

# Page-level field nicknames checked, in order, for an existing transcription.
# Assumes the collection has a full-text field with nickname 'full' or 'fula'.
TRANSCRIPTION_FIELDS = ('full', 'fula')

# Item-level field nicknames checked, in order, for the collection guide url.
RELATION_FIELDS = ('findin', 'collea')

# get input: alias + items to retrieve
alias = raw_input('collection alias: ').strip()
items = raw_input('item identifiers (separate by commas): ')
# Strip whitespace around each identifier and drop empty entries so that
# input such as "12, 13," does not produce pointers like ' 13' or ''.
ptrs = [p.strip() for p in items.split(',') if p.strip()]

# current date-time for output filenames
today = datetime.datetime.now().strftime('%Y-%m-%d--%H-%M')

# output filenames for the file-level and item-level metadata csv files
fileOutput = alias + today + '_File.csv'
itemOutput = alias + today + '_Item.csv'

# header rows for the two csv files
fileHeaderRow = ['filename', 'title', 'identifier', 'source', 'status', 'transcription', 'Omeka file order']
itemHeaderRow = ['title', 'identifier', 'source', 'ispartof', 'relation', 'audience', 'files']

# The with-statement guarantees both files are flushed and closed even if
# an API call or a row write raises part-way through the harvest.
with codecs.open(fileOutput, 'wb', encoding='utf_8') as fFile, \
        codecs.open(itemOutput, 'wb', encoding='utf_8') as fItem:
    wtrFile = csv.writer(fFile, delimiter=',')
    wtrFile.writerow(fileHeaderRow)
    wtrItem = csv.writer(fItem, delimiter=',')
    wtrItem.writerow(itemHeaderRow)

    # get data for each item
    for ptr in ptrs:
        # call api for item metadata
        # NOTE(review): the meaning of the third argument ('on') is defined
        # by pycdm.item and is not visible here -- confirm against pycdm.
        item = pycdm.item(alias, ptr, 'on')

        # set item-level metadata
        # create unique item id for use outside CDM
        itemID = alias + '_' + ptr
        source = item.refurl
        itemtitle = item.info['title']
        # digital collection url
        ispartof = item.collection.url
        # default sorting number, maps to dc:Audience in Omeka
        sort = '000000'

        # collection guide url; reset for every item so that an item with
        # neither field gets an empty cell instead of raising NameError or
        # inheriting the previous item's value
        relation = ''
        for nick in RELATION_FIELDS:
            if nick in item.info:
                relation = item.info[nick]
                break

        # list for file locations
        files = []

        # set file-level metadata; order is the 1-based file order
        for order, page in enumerate(item.pages, 1):
            # create unique page id for use outside CDM
            fileID = itemID + '_' + page.id
            pagelabel = page.label
            pageRefURL = page.refurl

            # set transcription, if available (first non-empty field wins)
            # NOTE: non-ASCII characters are dropped here, as in the
            # original script, because the row is written as ASCII.
            transcription = ''
            for nick in TRANSCRIPTION_FIELDS:
                if nick in page.info and page.info[nick]:
                    transcription = str(page.info[nick].encode('ascii', 'ignore'))
                    transcription = HTMLParser().unescape(transcription)
                    break

            # set transcription status
            if transcription == '':
                status = 'Not Started'
            else:
                status = 'Needs Review'

            url = page.fileurl
            files.append(url)

            # write file metadata to file-level csv file
            filerow = [url, pagelabel, fileID, pageRefURL, status,
                       transcription.encode('ascii', 'ignore'), order]
            wtrFile.writerow(filerow)

        # write item metadata to item-level csv file
        itemrow = [itemtitle, itemID, source, ispartof, relation, sort, ','.join(files)]
        wtrItem.writerow(itemrow)
        # progress output: identifier of the item just written
        print(ptr)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment