Created
October 25, 2013 03:47
-
-
Save tsunagun/7149190 to your computer and use it in GitHub Desktop.
LOD Statsで公開されているメタデータを収集するスクリプト
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script that collects dataset metadata from LOD Stats
require 'net/http'
require 'open-uri'
require 'timeout'
require 'uri'

require 'nokogiri'
require 'rdf'
require 'rdf/rdfxml'
require 'rdf/n3'
require 'rdf/turtle'
module RDF
  # Vocabulary class for the http://www.w3.org/ns/dcat# namespace, so that
  # terms can be written as RDF::DCAT.keyword, RDF::DCAT.Dataset, etc.
  class DCAT < Vocabulary('http://www.w3.org/ns/dcat#'); end
end
# A dataset listed on LOD Stats, identified by the URI of its DataHub page.
#
# Attributes:
#   uri              - DataHub page URI; also used as the RDF subject
#   tags             - Array of tag strings scraped from the DataHub page
#   distribution_uri - URI of the dataset's RDF file, or nil when unknown
#   stats_uri        - plain accessor; this class never assigns it
#   void_uri         - URI of a Turtle (VoID) document to merge, or nil
class Dataset
  attr_accessor :uri, :tags, :distribution_uri, :stats_uri, :void_uri

  # XPath selecting the tag links on a DataHub dataset page.
  TAG_XPATH = "//xmlns:ul[@class='tags clearfix']/xmlns:li/xmlns:a"

  # Stores +uri+ and immediately downloads the page to collect its tags,
  # so constructing a Dataset performs network I/O.
  def initialize(uri)
    @uri = uri
    @tags = get_tags
  end

  # Fetch the dataset's tags from DataHub.
  #
  # Returns an Array of tag strings, or [] when the XPath query fails
  # (for example when the page declares no default namespace).
  # Download errors are not rescued and propagate to the caller.
  def get_tags
    page = Nokogiri::XML.parse(read_remote(uri))
    begin
      page.xpath(TAG_XPATH, page.namespaces).map(&:text)
    rescue StandardError
      # Best effort, as in the original inline `rescue []`.
      []
    end
  end

  # Serialize the dataset's metadata as Turtle.
  #
  # The graph contains the statements of the VoID document (when void_uri is
  # set), one dcat:keyword per tag, the rdf:type dcat:Dataset statement and,
  # when distribution_uri is set, a dcat:distribution statement.
  #
  # Returns the Turtle serialization as a String.
  def to_rdf
    subject = RDF::URI.new(uri)
    graph = RDF::Graph.new

    # void_uri may legitimately be nil; skip the download instead of crashing.
    if void_uri
      # Trace on stderr so that stdout carries only the Turtle output.
      $stderr.puts void_uri.inspect
      graph << RDF::Turtle::Reader.new(read_remote(void_uri))
    end

    tags.each do |tag|
      graph << RDF::Statement.new(subject, RDF::DCAT.keyword, tag)
    end
    graph << RDF::Statement.new(subject, RDF.type, RDF::DCAT.Dataset)

    # distribution_uri may legitimately be nil; omit the statement then.
    if distribution_uri
      graph << RDF::Statement.new(subject, RDF::DCAT.distribution, RDF::URI.new(distribution_uri))
    end

    graph.dump(:turtle)
  end

  private

  # Download +location+ and return its body as a String.
  #
  # Uses open-uri's URI#read rather than Kernel#open, which no longer opens
  # URLs as of Ruby 3.0; #read also leaves no open handle behind.
  def read_remote(location)
    URI.parse(String(location)).read
  end
end
# Collect the statistics-page URIs of the datasets listed on LOD Stats.
#
# page_numbers - enumerable of listing page indexes to crawl. Defaults to
#                1..13, the range the script originally hard-coded.
#
# Returns an Array of URI objects, one per link found in the "rdfdoc" table
# rows, resolved against the listing page URI. Download and XPath errors are
# not rescued.
def stats_pages(page_numbers = (1..13))
  page_numbers.flat_map do |index|
    list_uri = "http://stats.lod2.eu/rdfdocs?page=#{index}"
    # open-uri's URI#read replaces Kernel#open, which no longer opens URLs
    # as of Ruby 3.0.
    listing = Nokogiri::XML.parse(URI.parse(list_uri).read)
    listing.xpath("//xmlns:tr[@class='rdfdoc']//xmlns:a/@href", listing.namespaces).map do |href|
      URI.join(list_uri, href.text)
    end
  end
end
# Build a Dataset from one LOD Stats statistics page.
#
# stats_uri - URI (String or URI object) of the statistics page.
#
# Returns a Dataset whose uri is the DataHub link found on the page, with
# distribution_uri (the dataset's RDF file) and void_uri (absolute URI of the
# VoID link) filled in as Strings when they can be found, nil otherwise.
#
# Raises RuntimeError when the page has no DataHub link; download errors
# propagate unchanged.
def extract(stats_uri)
  # open-uri's URI#read replaces Kernel#open, which no longer opens URLs
  # (or URI objects) as of Ruby 3.0.
  page = Nokogiri::XML.parse(URI.parse(stats_uri.to_s).read)
  ns = page.namespaces

  datahub_link = page.at_xpath("//xmlns:h2/xmlns:a/@href", ns)
  # Fail with a clear message instead of a TypeError deep inside Dataset.
  raise "no DataHub link found on #{stats_uri}" if datahub_link.nil?

  # Hand over plain strings rather than Nokogiri attribute nodes.
  dataset = Dataset.new(datahub_link.value)

  # Both lookups below are best effort: any failure leaves the field nil.
  dataset.distribution_uri = begin
    link = page.at_xpath("//xmlns:div[@class='content']/xmlns:ul/xmlns:li/xmlns:a/@href", ns)
    link && link.value
  rescue StandardError
    nil
  end

  dataset.void_uri = begin
    link = page.at_xpath("//xmlns:a[preceding::xmlns:span[@id='void']]/@href", ns)
    link && URI.join(stats_uri.to_s, link.value).to_s
  rescue StandardError
    nil
  end

  dataset
end
# HTTP POST RDF data to the local Sesame repository "datasets".
#
# rdf_string - serialized RDF to upload.
# format     - serialization of rdf_string: :rdfxml, :turtle, :n3 or :ntriples.
# context    - graph name. When nil the data is posted to the repository's
#              /statements endpoint; otherwise to /rdf-graphs/service with the
#              form-encoded name in the ?graph= query parameter.
#
# Returns the Net::HTTPResponse on completion, or false when the format is
# unsupported, the request exceeds 30 seconds, or a network/protocol error
# occurs. Other exceptions (programming errors) are no longer swallowed.
def post_rdf(rdf_string, format, context = nil)
  content_type = case format
                 when :rdfxml   then 'application/rdf+xml'
                 when :turtle   then 'application/x-turtle'
                 when :n3       then 'text/rdf+n3'
                 when :ntriples then 'text/plain'
                 end
  # Refuse early rather than sending a request with a nil Content-Type.
  unless content_type
    warn "post_rdf: unsupported format #{format.inspect}"
    return false
  end

  repository_id = "datasets"
  repository_uri = URI.parse("http://localhost:8080/openrdf-sesame/repositories/#{repository_id}")
  target_path = if context.nil?
                  "#{repository_uri.path}/statements"
                else
                  "#{repository_uri.path}/rdf-graphs/service?graph=#{URI.encode_www_form_component(context)}"
                end
  header = { 'Content-Type' => content_type }

  # Timeout.timeout replaces the deprecated bare `timeout` helper.
  Timeout.timeout(30) do
    Net::HTTP.new(repository_uri.host, repository_uri.port).post(target_path, rdf_string, header)
  end
rescue Timeout::Error, SystemCallError, SocketError, IOError,
       Net::ProtocolError, Net::HTTPBadResponse
  false
end
# Visit every LOD Stats statistics page and print each dataset's metadata
# as Turtle on standard output.
stats_pages.each { |stats_uri| puts extract(stats_uri).to_rdf }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment