Skip to content

Instantly share code, notes, and snippets.

@jamesoutterside
Created October 19, 2012 15:09
Show Gist options
  • Save jamesoutterside/3918736 to your computer and use it in GitHub Desktop.
Save jamesoutterside/3918736 to your computer and use it in GitHub Desktop.
Sample structure of saving resources from a Learning registry node
# Sample structure of saving resources from a Learning registry node.
# The original code was developed for the RIDLR project using django,
# the code below has had all references of django removed so it will run as
# a standalone python script.
# The removed code saved/updated the resource dictionary in a database; in this case DC_MAP was used
# to map Dublin Core fields to db fields.
# requires http://pypi.python.org/pypi/BeautifulSoup
import json as simplejson
from BeautifulSoup import BeautifulStoneSoup
import datetime
import time
import urllib2
# Keywords sent (lower-cased) as the any_tags filter of the slice request.
HARVEST_TAGS = [
    'Bladder',
    'Kidney',
    'Ureter',
    'Urethra',
]

# Only documents whose first payload schema is listed here are processed.
PAYLOAD_SCHEMAS_TO_HARVEST = [
    'nsdl_dc',
    'DC_1.1',
    'GEM_DC',
    'NSDL_DC_1.02.020',
    'oai_dc',
]

# Dublin Core element name -> field name used in the stored resource.
DC_MAP = {
    "dc:identifier": "url",
    "dc:date": "resource_date",
    "dc:title": "resource_title",
    "dc:description": "description",
    "dc:subject": "subject",
    "dc:creator": "creator",
    "dc:publisher": "provider",
}

# Elements that may repeat; their values are joined into one comma-separated string.
DC_MAP_MULTIPLES = [
    'dc:subject',
    'dc:type',
    'dc:format',
]

# Known Learning Registry nodes; the first entry is the one harvested.
NODE_URLS = [
    "http://alpha.mimas.ac.uk/",
    "http://sandbox.learningregistry.org/",
]
NODE_URL = NODE_URLS[0]
def is_service_avaliable():
    """Check that the configured Learning Registry node reports itself active.

    Requests ``<NODE_URL>status`` and decodes the JSON body.

    Returns:
        True when the status document contains a truthy ``active`` entry.

    Raises:
        Exception: if ``active`` is missing or falsy.
        Errors raised by ``urllib2.urlopen`` or by JSON decoding propagate
        unchanged.
    """
    url = "%sstatus" % NODE_URL
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        data = simplejson.loads(response.read())
    finally:
        # Always release the connection, even if reading or decoding fails.
        response.close()

    if 'active' in data and data['active']:
        return True
    raise Exception('LR %s node is not active ATM' % NODE_URL)
def get_keyword_nodes(flat=False):
    """Return the harvest keywords as one lower-cased, comma-separated string.

    In the RIDLR project this was a call to the DLMs; here the keywords come
    from HARVEST_TAGS. ``flat`` is accepted but not used by this version.
    """
    lowered_tags = []
    for tag in HARVEST_TAGS:
        lowered_tags.append(tag.lower())
    return ','.join(lowered_tags)
def get_node_resources():
    """Fetch the node's documents tagged with any of the harvest keywords.

    Verifies the node is active, then requests
    ``<NODE_URL>slice?any_tags=<keywords>`` and decodes the JSON response.

    Returns:
        The decoded slice response (store_resources() reads its
        ``'documents'`` list). If the body cannot be read or decoded, an
        empty result ``{'documents': []}`` is returned instead.

    Raises:
        Exception: from is_service_avaliable() when the node is not active.
        Errors raised while opening the URL propagate unchanged.
    """
    is_service_avaliable()

    url = "%sslice?any_tags=%s" % (NODE_URL, get_keyword_nodes(flat=True))
    url = url.encode('utf-8')
    response = urllib2.build_opener().open(urllib2.Request(url))
    try:
        return simplejson.load(response)
    except Exception:
        # Best effort: an unreadable body is treated as "no documents".
        # The fallback is a dict (not a list) so that callers can index
        # it with ['documents'] exactly like a real response.
        return {'documents': []}
    finally:
        response.close()
def _extract_dc_value(soup, key):
    """Return the text of the element(s) named ``key``, or None if absent."""
    if key in DC_MAP_MULTIPLES:
        # Repeatable elements are flattened into one comma-separated string.
        return ','.join([element.text for element in soup.findAll(key)])
    element = soup.find(key)
    if element is None:
        return None
    return element.text


def _parse_dc_date(value):
    """Convert a dc:date string to a datetime, defaulting to the current time."""
    if value and value != "0000-00-00":
        # The compact form is what the original code parsed; the dashed form
        # matches the "0000-00-00" placeholder the original code checked for.
        for date_format in ('%Y%m%d', '%Y-%m-%d'):
            try:
                return datetime.datetime.strptime(value, date_format)
            except (ValueError, TypeError):
                continue
    return datetime.datetime.now()


def store_resources(resources):
    """Map harvested Dublin Core documents to resource dicts and print them.

    Only documents whose first ``payload_schema`` entry is listed in
    PAYLOAD_SCHEMAS_TO_HARVEST are processed. For each one, the fields named
    in DC_MAP are read from the document's XML payload and stored, together
    with the Learning Registry bookkeeping values, in a new dict which is
    printed. A summary line is printed at the end.

    Args:
        resources: decoded slice response; its ``'documents'`` entry is a
            list of dicts with ``doc_ID`` and ``resource_data_description``.

    Returns:
        None.

    Raises:
        KeyError: if a selected document lacks one of the entries read below.
    """
    dc_resources = []
    for document in resources['documents']:
        schemas = document['resource_data_description'].get('payload_schema')
        # Skip documents with no schema instead of failing the whole harvest.
        if schemas and schemas[0] in PAYLOAD_SCHEMAS_TO_HARVEST:
            dc_resources.append(document)

    for document in dc_resources:
        description = document['resource_data_description']
        doc_id = document['doc_ID']
        update_timestamp = description['update_timestamp']
        doc_type = description['doc_type']
        identity = description['identity']
        # Fall back to the submitter when the document carries no signer.
        signer = identity['signer'] if 'signer' in identity else identity['submitter']

        soup = BeautifulStoneSoup(description['resource_data'])

        # Get/create and save resource: if it exists then update rather than create.
        # Duplicate if document matches all 3 rules below (outlined by LR/JLeRN):
        # 1. Same doc_type
        # 2. Submitter (signer)
        # 3. Resource identifier (url)
        url = _extract_dc_value(soup, 'dc:identifier')
        node_resource = None  # In RIDLR this is a call to the database to get a resource
        if node_resource is None:
            save_type = "Creating"
        elif node_resource.update_timestamp == update_timestamp:
            # Already stored and unchanged on the node: nothing to do.
            continue
        else:
            save_type = "Updating"

        # create new resource, for RIDLR this is a django model
        record = {}
        for key, field in DC_MAP.items():
            value = _extract_dc_value(soup, key)
            if key == 'dc:title':
                print('*** %s - "%s" ...' % (save_type, value))
            elif key == 'dc:date':
                value = _parse_dc_date(value)
            record[field] = value

        record['learning_registry_information__doc_id'] = doc_id
        record['learning_registry_information__node_url'] = NODE_URL
        record['learning_registry_information__update_timestamp'] = update_timestamp
        record['learning_registry_information__signer'] = signer
        record['learning_registry_information__doc_type'] = doc_type
        print(record)

    print("Stored %s resources with schema in %s and keywords in %s \n" % (
        len(dc_resources), PAYLOAD_SCHEMAS_TO_HARVEST, HARVEST_TAGS))
if __name__ == '__main__':
    # Harvest the matching documents from the node, then map and print them.
    store_resources(get_node_resources())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment