jamesoutterside · October 19, 2012 15:09
diff --git a/lr_store.py b/lr_store.py
 # Sample structure of saving resources from a Learning registry node.

 # The original code was developed for the RIDLR project using django,
 # the code below has had all references of django removed so it will run as
 # a standalone python script.

 # The removed code saved/updated the resource dictionary in a database; in that case DC_MAP was used
 # to map Dublin Core fields to db fields

 # requires http://pypi.python.org/pypi/BeautifulSoup

 import json as simplejson
 from BeautifulSoup import BeautifulStoneSoup
 import datetime
 import time
 import urllib2

 # Keywords used to select documents from the node (sent lower-cased as any_tags).
 HARVEST_TAGS = ['Bladder', 'Kidney', 'Ureter', 'Urethra']

 # A document is harvested only when its first payload schema is one of these.
 PAYLOAD_SCHEMAS_TO_HARVEST = [
     'nsdl_dc',
     'DC_1.1',
     'GEM_DC',
     'NSDL_DC_1.02.020',
     'oai_dc',
 ]

 # Dublin Core element name -> database field name.
 DC_MAP = {
     "dc:identifier": "url",
     "dc:date": "resource_date",
     "dc:title": "resource_title",
     "dc:description": "description",
     "dc:subject": "subject",
     "dc:creator": "creator",
     "dc:publisher": "provider",
 }

 # Elements that may occur several times; their values are joined with commas.
 DC_MAP_MULTIPLES = ['dc:subject', 'dc:type', 'dc:format']

 # Known Learning Registry nodes; the first entry is the node harvested from.
 NODE_URLS = [
     "http://alpha.mimas.ac.uk/",
     "http://sandbox.learningregistry.org/",
 ]
 NODE_URL = NODE_URLS[0]

 def is_service_avaliable():
     """Check that the Learning Registry node at NODE_URL is active.

     Fetches ``<NODE_URL>status`` and decodes the response body as JSON.

     Returns:
         True if the decoded status has a truthy ``active`` entry.

     Raises:
         Exception: if ``active`` is absent or falsy.

     Errors raised by urllib2 or by JSON decoding are not caught here.
     """
     url = "%sstatus" % NODE_URL
     response = urllib2.urlopen(urllib2.Request(url))
     try:
         data = simplejson.loads(response.read())
     finally:
         # Release the connection even if reading or decoding fails.
         response.close()

     if 'active' in data and data['active']:
         return True

     raise Exception('LR %s node is not active ATM' % NODE_URL)

 def get_keyword_nodes(flat=False):
     """Return the harvest tags as one lower-cased, comma-separated string.

     ``flat`` is accepted but not used by this standalone version.
     """
     # In the RIDLR project this was a call to the DLMs; here the tags
     # come straight from HARVEST_TAGS.
     lowered_tags = (tag.lower() for tag in HARVEST_TAGS)
     return ','.join(lowered_tags)

 def get_node_resources():
     """Fetch the documents tagged with any of the harvest keywords.

     Verifies that the node is active, then requests
     ``<NODE_URL>slice?any_tags=<tags>`` and decodes the JSON response.

     Returns:
         The decoded slice response. If the response cannot be read or
         decoded, ``{'documents': []}`` is returned instead.

     Raises:
         Exception: from ``is_service_avaliable`` when the node is inactive.
     """
     is_service_avaliable()

     url = "%sslice?any_tags=%s" % (NODE_URL, get_keyword_nodes(flat=True))
     url = url.encode('utf-8')

     opener = urllib2.build_opener()
     data = opener.open(urllib2.Request(url))

     try:
         resources = simplejson.load(data)
     except Exception:
         # Best effort: an unreadable response is treated as "no documents".
         # The fallback has to be a dict (not a list wrapping one) because
         # store_resources indexes resources['documents'].
         resources = {'documents': []}
     finally:
         data.close()

     return resources

 def _parse_resource_date(value):
     """Turn a dc:date string into a datetime, defaulting to the current time.

     Accepts ``YYYYMMDD`` and ``YYYY-MM-DD``. Anything else, including None
     and the "0000-00-00" placeholder, yields ``datetime.datetime.now()``.
     """
     if value and value != "0000-00-00":
         for date_format in ('%Y%m%d', '%Y-%m-%d'):
             try:
                 return datetime.datetime.strptime(value, date_format)
             except (ValueError, TypeError):
                 continue
     return datetime.datetime.now()


 def store_resources(resources):
     """Build and print a resource dictionary for each harvestable document.

     Args:
         resources: decoded slice response, a dict whose ``documents`` entry
             is a list of Learning Registry documents.

     A document is processed only when the first entry of its
     ``payload_schema`` is in PAYLOAD_SCHEMAS_TO_HARVEST. Its ``resource_data``
     is parsed with BeautifulStoneSoup, the elements named in DC_MAP are
     copied into a dictionary keyed by the mapped field names, the Learning
     Registry bookkeeping values are added, and the dictionary is printed.
     In RIDLR the dictionary was a Django model saved to the database; that
     code has been removed. ``resource_date`` is always a
     ``datetime.datetime``.

     Raises:
         KeyError: if a document lacks a required entry (``doc_ID``,
             ``resource_data``, ``update_timestamp``, ``doc_type``,
             ``identity``).
     """
     dc_resources = []
     for document in resources['documents']:
         schemas = document['resource_data_description'].get('payload_schema')
         # Only the first declared schema is considered; documents that
         # declare none are skipped instead of raising IndexError.
         if schemas and schemas[0] in PAYLOAD_SCHEMAS_TO_HARVEST:
             dc_resources.append(document)

     stored_count = 0
     for document in dc_resources:
         description = document['resource_data_description']
         doc_id = document['doc_ID']
         update_timestamp = description['update_timestamp']
         doc_type = description['doc_type']

         # Prefer the signer; fall back to the submitter when unsigned.
         identity = description['identity']
         try:
             signer = identity['signer']
         except KeyError:
             signer = identity['submitter']

         soup = BeautifulStoneSoup(description['resource_data'])

         # Get/create and save resource: if it exists, update rather than create.
         # A document is a duplicate when all 3 rules below match
         # (outlined by LR/JLeRN):
         # 1. Same doc_type
         # 2. Submitter (signer)
         # 3. Resource identifier (url, the dc:identifier element)
         existing = None  # In RIDLR this is a call to the database to get a resource
         if existing is not None and existing.update_timestamp == update_timestamp:
             # Unchanged since the last harvest: nothing to write.
             continue
         save_type = "Creating" if existing is None else "Updating"

         # For RIDLR this is a django model; here a plain dictionary.
         record = {}
         for key, field in DC_MAP.items():
             try:
                 if key in DC_MAP_MULTIPLES:
                     value = ','.join([e.text for e in soup.findAll(key)])
                 else:
                     value = soup.find(key).text
             except AttributeError:
                 # soup.find() returned None: the element is absent.
                 value = None

             if key == 'dc:title':
                 print('*** %s - "%s" ...' % (save_type, value))

             if key == 'dc:date':
                 value = _parse_resource_date(value)

             record[field] = value

         record['learning_registry_information__doc_id'] = doc_id
         record['learning_registry_information__node_url'] = NODE_URL
         record['learning_registry_information__update_timestamp'] = update_timestamp
         record['learning_registry_information__signer'] = signer
         record['learning_registry_information__doc_type'] = doc_type
         print(record)
         stored_count += 1

     print("Stored %s resources with schema in %s and keywords in %s \n" % (stored_count, PAYLOAD_SCHEMAS_TO_HARVEST, HARVEST_TAGS))

 if __name__ == '__main__':
     # Harvest from the configured node, then map and print what came back.
     store_resources(get_node_resources())
	# Sample structure of saving resources from a Learning registry node.

	# The original code was developed for the RIDLR project using django,
	# the code below has had all references of django removed so it will run as
	# a standalone python script.

	# The removed code saved/updated the resource dictionary in a database; in that case DC_MAP was used
	# to map Dublin Core fields to db fields

	# requires http://pypi.python.org/pypi/BeautifulSoup

	import json as simplejson
	from BeautifulSoup import BeautifulStoneSoup
	import datetime
	import time
	import urllib2

	# Keywords used to select documents from the node (sent lower-cased as any_tags).
	HARVEST_TAGS = ['Bladder', 'Kidney', 'Ureter', 'Urethra']

	# A document is harvested only when its first payload schema is one of these.
	PAYLOAD_SCHEMAS_TO_HARVEST = [
	    'nsdl_dc',
	    'DC_1.1',
	    'GEM_DC',
	    'NSDL_DC_1.02.020',
	    'oai_dc',
	]

	# Dublin Core element name -> database field name.
	DC_MAP = {
	    "dc:identifier": "url",
	    "dc:date": "resource_date",
	    "dc:title": "resource_title",
	    "dc:description": "description",
	    "dc:subject": "subject",
	    "dc:creator": "creator",
	    "dc:publisher": "provider",
	}

	# Elements that may occur several times; their values are joined with commas.
	DC_MAP_MULTIPLES = ['dc:subject', 'dc:type', 'dc:format']

	# Known Learning Registry nodes; the first entry is the node harvested from.
	NODE_URLS = [
	    "http://alpha.mimas.ac.uk/",
	    "http://sandbox.learningregistry.org/",
	]
	NODE_URL = NODE_URLS[0]

	def is_service_avaliable():
	    """Check that the Learning Registry node at NODE_URL is active.

	    Fetches ``<NODE_URL>status`` and decodes the response body as JSON.

	    Returns:
	        True if the decoded status has a truthy ``active`` entry.

	    Raises:
	        Exception: if ``active`` is absent or falsy.

	    Errors raised by urllib2 or by JSON decoding are not caught here.
	    """
	    url = "%sstatus" % NODE_URL
	    response = urllib2.urlopen(urllib2.Request(url))
	    try:
	        data = simplejson.loads(response.read())
	    finally:
	        # Release the connection even if reading or decoding fails.
	        response.close()

	    if 'active' in data and data['active']:
	        return True

	    raise Exception('LR %s node is not active ATM' % NODE_URL)

	def get_keyword_nodes(flat=False):
	    """Return the harvest tags as one lower-cased, comma-separated string.

	    ``flat`` is accepted but not used by this standalone version.
	    """
	    # In the RIDLR project this was a call to the DLMs; here the tags
	    # come straight from HARVEST_TAGS.
	    return ','.join([s.lower() for s in HARVEST_TAGS])

	def get_node_resources():
	    """Fetch the documents tagged with any of the harvest keywords.

	    Verifies that the node is active, then requests
	    ``<NODE_URL>slice?any_tags=<tags>`` and decodes the JSON response.

	    Returns:
	        The decoded slice response. If the response cannot be read or
	        decoded, ``{'documents': []}`` is returned instead.

	    Raises:
	        Exception: from ``is_service_avaliable`` when the node is inactive.
	    """
	    is_service_avaliable()

	    url = "%sslice?any_tags=%s" % (NODE_URL, get_keyword_nodes(flat=True))
	    url = url.encode('utf-8')

	    opener = urllib2.build_opener()
	    data = opener.open(urllib2.Request(url))

	    try:
	        resources = simplejson.load(data)
	    except Exception:
	        # Best effort: an unreadable response is treated as "no documents".
	        # The fallback has to be a dict (not a list wrapping one) because
	        # store_resources indexes resources['documents'].
	        resources = {'documents': []}
	    finally:
	        data.close()

	    return resources

	def _parse_resource_date(value):
	    """Turn a dc:date string into a datetime, defaulting to the current time.

	    Accepts ``YYYYMMDD`` and ``YYYY-MM-DD``. Anything else, including None
	    and the "0000-00-00" placeholder, yields ``datetime.datetime.now()``.
	    """
	    if value and value != "0000-00-00":
	        for date_format in ('%Y%m%d', '%Y-%m-%d'):
	            try:
	                return datetime.datetime.strptime(value, date_format)
	            except (ValueError, TypeError):
	                continue
	    return datetime.datetime.now()


	def store_resources(resources):
	    """Build and print a resource dictionary for each harvestable document.

	    Args:
	        resources: decoded slice response, a dict whose ``documents`` entry
	            is a list of Learning Registry documents.

	    A document is processed only when the first entry of its
	    ``payload_schema`` is in PAYLOAD_SCHEMAS_TO_HARVEST. Its ``resource_data``
	    is parsed with BeautifulStoneSoup, the elements named in DC_MAP are
	    copied into a dictionary keyed by the mapped field names, the Learning
	    Registry bookkeeping values are added, and the dictionary is printed.
	    In RIDLR the dictionary was a Django model saved to the database; that
	    code has been removed. ``resource_date`` is always a
	    ``datetime.datetime``.

	    Raises:
	        KeyError: if a document lacks a required entry (``doc_ID``,
	            ``resource_data``, ``update_timestamp``, ``doc_type``,
	            ``identity``).
	    """
	    dc_resources = []
	    for document in resources['documents']:
	        schemas = document['resource_data_description'].get('payload_schema')
	        # Only the first declared schema is considered; documents that
	        # declare none are skipped instead of raising IndexError.
	        if schemas and schemas[0] in PAYLOAD_SCHEMAS_TO_HARVEST:
	            dc_resources.append(document)

	    stored_count = 0
	    for document in dc_resources:
	        description = document['resource_data_description']
	        doc_id = document['doc_ID']
	        update_timestamp = description['update_timestamp']
	        doc_type = description['doc_type']

	        # Prefer the signer; fall back to the submitter when unsigned.
	        identity = description['identity']
	        try:
	            signer = identity['signer']
	        except KeyError:
	            signer = identity['submitter']

	        soup = BeautifulStoneSoup(description['resource_data'])

	        # Get/create and save resource: if it exists, update rather than create.
	        # A document is a duplicate when all 3 rules below match
	        # (outlined by LR/JLeRN):
	        # 1. Same doc_type
	        # 2. Submitter (signer)
	        # 3. Resource identifier (url, the dc:identifier element)
	        existing = None  # In RIDLR this is a call to the database to get a resource
	        if existing is not None and existing.update_timestamp == update_timestamp:
	            # Unchanged since the last harvest: nothing to write.
	            continue
	        save_type = "Creating" if existing is None else "Updating"

	        # For RIDLR this is a django model; here a plain dictionary.
	        record = {}
	        for key, field in DC_MAP.items():
	            try:
	                if key in DC_MAP_MULTIPLES:
	                    value = ','.join([e.text for e in soup.findAll(key)])
	                else:
	                    value = soup.find(key).text
	            except AttributeError:
	                # soup.find() returned None: the element is absent.
	                value = None

	            if key == 'dc:title':
	                print('*** %s - "%s" ...' % (save_type, value))

	            if key == 'dc:date':
	                value = _parse_resource_date(value)

	            record[field] = value

	        record['learning_registry_information__doc_id'] = doc_id
	        record['learning_registry_information__node_url'] = NODE_URL
	        record['learning_registry_information__update_timestamp'] = update_timestamp
	        record['learning_registry_information__signer'] = signer
	        record['learning_registry_information__doc_type'] = doc_type
	        print(record)
	        stored_count += 1

	    print("Stored %s resources with schema in %s and keywords in %s \n" % (stored_count, PAYLOAD_SCHEMAS_TO_HARVEST, HARVEST_TAGS))

	if __name__ == '__main__':
	    # Harvest from the configured node, then map and print what came back.
	    resources = get_node_resources()
	    store_resources(resources)