Created
October 19, 2012 15:09
-
-
Save jamesoutterside/3918736 to your computer and use it in GitHub Desktop.
Sample structure of saving resources from a Learning registry node
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sample structure of saving resources from a Learning Registry node.
# The original code was developed for the RIDLR project using Django;
# the code below has had all references to Django removed so that it will run as
# a standalone Python script.
# The removed code saved/updated the resource dictionary in a database; in this case DC_MAP was used
# to map database fields to Dublin Core fields.
# Requires http://pypi.python.org/pypi/BeautifulSoup
import json as simplejson | |
from BeautifulSoup import BeautifulStoneSoup | |
import datetime | |
import time | |
import urllib2 | |
# Keywords used to select documents from the node.
HARVEST_TAGS = [
    'Bladder',
    'Kidney',
    'Ureter',
    'Urethra',
]

# Payload schemas that are treated as Dublin Core and therefore harvested.
PAYLOAD_SCHEMAS_TO_HARVEST = [
    'nsdl_dc',
    'DC_1.1',
    'GEM_DC',
    'NSDL_DC_1.02.020',
    'oai_dc',
]

# Dublin Core element name -> database field name.
DC_MAP = {
    "dc:identifier": "url",
    "dc:date": "resource_date",
    "dc:title": "resource_title",
    "dc:description": "description",
    "dc:subject": "subject",
    "dc:creator": "creator",
    "dc:publisher": "provider",
}

# Dublin Core elements that may occur several times in one document; their
# values are joined into a single comma-separated string.
DC_MAP_MULTIPLES = [
    'dc:subject',
    'dc:type',
    'dc:format',
]

# Known Learning Registry nodes; the first one is the node that gets harvested.
NODE_URLS = [
    "http://alpha.mimas.ac.uk/",
    "http://sandbox.learningregistry.org/",
]
NODE_URL = NODE_URLS[0]
def is_service_avaliable():
    """Check that the Learning Registry node at NODE_URL reports itself active.

    Fetches ``<NODE_URL>status`` and decodes the response body as JSON.

    Returns:
        True when the status document contains a truthy ``active`` entry.

    Raises:
        Exception: when ``active`` is missing or falsy.  Errors raised while
            opening the URL or decoding the body are not caught here and
            propagate to the caller.
    """
    url = "%sstatus" % NODE_URL
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        data = simplejson.loads(response.read())
    finally:
        # Release the HTTP connection even if reading or decoding fails.
        response.close()

    if 'active' in data and data['active']:
        return True
    raise Exception('LR %s node is not active ATM' % NODE_URL)
def get_keyword_nodes(flat=False):
    """Return the harvest keywords as one lower-cased, comma-separated string.

    ``flat`` is accepted but not used by this standalone version.
    """
    # In the RIDLR project this was a call to the DLMs; here the keywords
    # simply come from HARVEST_TAGS.
    lowered_tags = (tag.lower() for tag in HARVEST_TAGS)
    return ','.join(lowered_tags)
def get_node_resources():
    """Fetch the documents on the node that carry any of the harvest keywords.

    First verifies that the node is active, then requests
    ``<NODE_URL>slice?any_tags=<keywords>`` and decodes the response as JSON.

    Returns:
        The decoded JSON response.  If the response cannot be read or decoded,
        ``{'documents': []}`` is returned instead, so callers can always index
        the result with ``['documents']``.

    Raises:
        Exception: from is_service_avaliable() when the node is not active.
            Errors raised while opening the URL propagate as well.
    """
    is_service_avaliable()

    url = "%sslice?any_tags=%s" % (NODE_URL, get_keyword_nodes(flat=True))
    url = url.encode('utf-8')
    response = urllib2.build_opener().open(urllib2.Request(url))
    try:
        resources = simplejson.load(response)
    except Exception:
        # Best effort: an unreadable or malformed response is treated as an
        # empty result.  The fallback is a mapping (not a list) because
        # store_resources() looks up resources['documents'].
        resources = {'documents': []}
    finally:
        # Release the HTTP connection whether or not decoding succeeded.
        response.close()
    return resources
def _find_existing_resource(doc_type, signer, url):
    """Return the already stored resource that duplicates this document, or None.

    A document is a duplicate when it matches all three rules outlined by
    LR/JLeRN: same doc_type, same submitter (signer) and same resource
    identifier (url).  In RIDLR this was a database lookup; this standalone
    script keeps no store, so nothing is ever found.
    """
    return None


def _parse_dc_date(value):
    """Convert a dc:date string into a datetime, falling back to the current time.

    Accepts ``YYYYMMDD`` and ``YYYY-MM-DD``.  A missing value, the
    ``"0000-00-00"`` placeholder, or anything unparseable yields
    ``datetime.datetime.now()``, so the result is always a datetime.
    """
    if value and value != "0000-00-00":
        for date_format in ('%Y%m%d', '%Y-%m-%d'):
            try:
                return datetime.datetime.strptime(value, date_format)
            except (TypeError, ValueError):
                continue
    return datetime.datetime.now()


def _map_dc_fields(soup):
    """Read the Dublin Core elements named in DC_MAP out of the parsed payload.

    Returns a dict keyed by the database field names from DC_MAP.  Elements
    listed in DC_MAP_MULTIPLES are joined with commas; an element that cannot
    be read is stored as None (dc:date is stored as the current time instead).
    """
    record = {}
    for key, field in DC_MAP.items():
        try:
            if key in DC_MAP_MULTIPLES:
                value = ','.join([element.text for element in soup.findAll(key)])
            else:
                value = soup.find(key).text
        except (AttributeError, TypeError):
            # The element is absent (find() returned None) or has no usable text.
            value = None

        if key == 'dc:date':
            value = _parse_dc_date(value)

        record[field] = value
        # In RIDLR: setattr(node_resource, field, value) to map onto the django model.
    return record


def store_resources(resources):
    """Build and print one resource dictionary per harvestable document.

    Args:
        resources: the decoded slice response; ``resources['documents']`` must
            be a list of documents, each with ``doc_ID`` and
            ``resource_data_description`` entries.

    Only documents whose first payload schema is listed in
    PAYLOAD_SCHEMAS_TO_HARVEST are processed.  For each one the Dublin Core
    payload is parsed, mapped through DC_MAP, extended with the
    ``learning_registry_information__*`` entries and printed.  A summary line
    is printed at the end.

    Raises:
        KeyError: when a document lacks one of the entries read here.
    """
    dc_resources = []
    for document in resources['documents']:
        schemas = document['resource_data_description'].get('payload_schema')
        # Only the first declared schema is considered; a document that
        # declares no schema at all is skipped.
        if schemas and schemas[0] in PAYLOAD_SCHEMAS_TO_HARVEST:
            dc_resources.append(document)

    for document in dc_resources:
        doc_id = document['doc_ID']
        description = document['resource_data_description']
        update_timestamp = description['update_timestamp']
        identity = description['identity']
        try:
            signer = identity['signer']
        except KeyError:
            # Documents without a signer are attributed to their submitter.
            signer = identity['submitter']
        doc_type = description['doc_type']

        soup = BeautifulStoneSoup(description['resource_data'])

        # Get or create the resource: an existing one is updated, not duplicated.
        identifier = soup.find('dc:identifier')
        url = identifier.text if identifier is not None else None
        node_resource = _find_existing_resource(doc_type, signer, url)
        if node_resource is None:
            save_type = "Creating"
        elif node_resource.update_timestamp == update_timestamp:
            # Already stored and unchanged since the last harvest.
            continue
        else:
            save_type = "Updating"

        # For RIDLR this is a django model; here it is a plain dictionary.
        record = _map_dc_fields(soup)
        if 'dc:title' in DC_MAP:
            print('*** %s - "%s" ...' % (save_type, record[DC_MAP['dc:title']]))

        record['learning_registry_information__doc_id'] = doc_id
        record['learning_registry_information__node_url'] = NODE_URL
        record['learning_registry_information__update_timestamp'] = update_timestamp
        record['learning_registry_information__signer'] = signer
        record['learning_registry_information__doc_type'] = doc_type
        print(record)

    print("Stored %s resources with schema in %s and keywords in %s \n" % (len(dc_resources), PAYLOAD_SCHEMAS_TO_HARVEST, HARVEST_TAGS))
# Script entry point: harvest the configured node, then store what was found.
if __name__ == '__main__':
    harvested = get_node_resources()
    store_resources(harvested)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment