Skip to content

Instantly share code, notes, and snippets.

@tsunagun
Created October 25, 2013 03:47
Show Gist options
  • Save tsunagun/7149190 to your computer and use it in GitHub Desktop.
Save tsunagun/7149190 to your computer and use it in GitHub Desktop.
LOD Statsで公開されているメタデータを収集するスクリプト
# Script that collects dataset metadata from LOD Stats.
require 'net/http'
require 'open-uri'
require 'uri'

require 'nokogiri'
require 'rdf'
require 'rdf/rdfxml'
require 'rdf/n3'
require 'rdf/turtle'
# Defines RDF::DCAT, a vocabulary class rooted at the DCAT namespace URI,
# so that terms can be written as RDF::DCAT.keyword, RDF::DCAT.Dataset
# and RDF::DCAT.distribution.
module RDF
  class DCAT < RDF.Vocabulary('http://www.w3.org/ns/dcat#'); end
end
# A dataset registered on DataHub, together with the metadata that the
# LOD Stats crawler attaches to it.
#
# Attributes:
#   uri              - URI of the dataset's DataHub page
#   tags             - Array of tag strings read from the DataHub page
#   distribution_uri - URI of the dataset's RDF file (may be nil)
#   stats_uri        - not assigned anywhere in this class; left for callers
#   void_uri         - URI of a VoID description, read as Turtle (may be nil)
class Dataset
  attr_accessor :uri, :tags, :distribution_uri, :stats_uri, :void_uri

  # Stores the DataHub URI and immediately downloads that page to read
  # the dataset's tags. Download errors propagate to the caller.
  def initialize(uri)
    @uri = uri
    @tags = get_tags
  end

  # Fetch the dataset's tags from its DataHub page.
  #
  # Returns an Array of tag strings. An XPath failure (for example when
  # the page declares no default namespace, so the "xmlns:" prefix is
  # undefined) yields an empty Array instead of an exception.
  def get_tags
    page = Nokogiri::XML.parse(fetch(uri))
    begin
      page.xpath("//xmlns:ul[@class='tags clearfix']/xmlns:li/xmlns:a", page.namespaces).map(&:text)
    rescue Nokogiri::XML::XPath::SyntaxError
      []
    end
  end

  # Serialize the dataset's metadata as Turtle.
  #
  # The graph contains the VoID description (when void_uri is set), one
  # dcat:keyword per tag, an rdf:type dcat:Dataset statement, and a
  # dcat:distribution statement (when distribution_uri is set).
  #
  # Returns the Turtle document as a String.
  def to_rdf
    subject = RDF::URI.new(uri.to_s)
    graph = RDF::Graph.new

    # void_uri is nil when no VoID link could be extracted; skip it rather
    # than crash while trying to download nil.
    if void_uri
      # Progress goes to stderr so stdout carries only the Turtle output.
      warn "Loading VoID description: #{void_uri}"
      graph << RDF::Turtle::Reader.new(fetch(void_uri))
    end

    tags.each do |tag|
      graph << RDF::Statement.new(subject, RDF::DCAT.keyword, tag)
    end
    graph << RDF::Statement.new(subject, RDF.type, RDF::DCAT.Dataset)

    # Likewise, only emit the distribution triple when a link is known.
    if distribution_uri
      graph << RDF::Statement.new(subject, RDF::DCAT.distribution, RDF::URI.new(distribution_uri.to_s))
    end

    graph.dump(:turtle)
  end

  private

  # Download +location+ and return the response body as a String.
  #
  # URI#open (provided by open-uri) is used instead of Kernel#open: the
  # latter no longer accepts URLs on Ruby 3.0+ and would execute strings
  # beginning with "|" as shell commands. The block form closes the
  # underlying handle once the body has been read.
  def fetch(location)
    URI.parse(location.to_s).open(&:read)
  end
end
# Collect the URIs of the dataset statistics pages listed on LOD Stats.
#
# pages - Enumerable of listing page numbers to crawl. Defaults to 1..13,
#         the range this script was originally hard-coded to.
#
# Returns an Array of absolute URI objects, one for every link found in a
# table row with class "rdfdoc"; empty when +pages+ is empty.
def stats_pages(pages = 1..13)
  pages.flat_map do |index|
    list_uri = "http://stats.lod2.eu/rdfdocs?page=#{index}"
    # URI#open (open-uri) replaces Kernel#open, which no longer accepts
    # URLs on Ruby 3.0+; the block form closes the handle after reading.
    listing = Nokogiri::XML.parse(URI.parse(list_uri).open(&:read))
    listing.xpath("//xmlns:tr[@class='rdfdoc']//xmlns:a/@href", listing.namespaces).map do |href|
      # Links in the listing may be relative; resolve against the page URI.
      URI.join(list_uri, href.text)
    end
  end
end
# Build a Dataset from one LOD Stats statistics page, picking up the URI
# of the dataset's RDF file and of its VoID description.
#
# stats_uri - URI (String or URI object) of the statistics page.
#
# Returns a Dataset whose uri is the link found in the page's <h2>
# heading. distribution_uri and void_uri are Strings, or nil when the
# corresponding link is missing or malformed.
# Raises ArgumentError when the page has no heading link at all.
def extract(stats_uri)
  # URI#open (open-uri) replaces Kernel#open, which no longer accepts
  # URLs on Ruby 3.0+; the block form closes the handle after reading.
  page = Nokogiri::XML.parse(URI.parse(stats_uri.to_s).open(&:read))
  namespaces = page.namespaces

  # Returns the text of the first node matching +xpath+, or nil.
  href_of = lambda do |xpath|
    node = page.at_xpath(xpath, namespaces)
    node && node.text
  end

  datahub_uri = href_of.call('//xmlns:h2/xmlns:a/@href')
  raise ArgumentError, "no DataHub link found on #{stats_uri}" unless datahub_uri

  dataset = Dataset.new(datahub_uri)
  dataset.distribution_uri = href_of.call("//xmlns:div[@class='content']/xmlns:ul/xmlns:li/xmlns:a/@href")

  void_href = href_of.call("//xmlns:a[preceding::xmlns:span[@id='void']]/@href")
  dataset.void_uri =
    begin
      # The link may be relative; resolve it against the statistics page.
      void_href && URI.join(stats_uri.to_s, void_href).to_s
    rescue URI::Error
      nil # a malformed link is treated the same as a missing one
    end

  dataset
end
# POST RDF data to an RDF Sesame repository over HTTP.
#
# rdf_string    - serialized RDF to upload.
# format        - serialization of rdf_string: :rdfxml, :turtle, :n3 or
#                 :ntriples.
# context       - optional graph name. When given, the data is posted to
#                 the repository's "rdf-graphs/service" endpoint with
#                 "?graph=<context>"; otherwise to its "statements"
#                 endpoint.
# endpoint      - base URL of the Sesame server.
# repository_id - identifier of the target repository.
# timeout       - seconds allowed for opening the connection and for each
#                 read from it.
#
# Returns the Net::HTTPResponse, or false when the format is unsupported,
# the request times out, or any other error occurs. Failures are reported
# on stderr instead of being dropped silently.
def post_rdf(rdf_string, format, context = nil,
             endpoint: 'http://localhost:8080/openrdf-sesame',
             repository_id: 'datasets',
             timeout: 30)
  content_type = {
    rdfxml: 'application/rdf+xml',
    turtle: 'application/x-turtle',
    n3: 'text/rdf+n3',
    ntriples: 'text/plain'
  }[format]
  unless content_type
    warn "post_rdf: unsupported format #{format.inspect}"
    return false
  end

  repository_url = "#{endpoint.chomp('/')}/repositories/#{repository_id}"
  target =
    if context.nil?
      URI.parse("#{repository_url}/statements")
    else
      URI.parse("#{repository_url}/rdf-graphs/service?#{URI.encode_www_form(graph: context)}")
    end

  # Net::HTTP's own timeouts replace the removed Kernel#timeout; the block
  # form of start closes the connection when the request is done.
  Net::HTTP.start(target.host, target.port,
                  use_ssl: target.scheme == 'https',
                  open_timeout: timeout, read_timeout: timeout) do |http|
    http.post(target.request_uri, rdf_string, 'Content-Type' => content_type)
  end
rescue StandardError => e
  # Timeout errors are StandardErrors too, so one clause covers both.
  warn "post_rdf: #{e.class}: #{e.message}"
  false
end
# Entry point: for every statistics page listed on LOD Stats, build the
# dataset description and print its Turtle serialization to stdout.
stats_pages.each { |page_uri| puts extract(page_uri).to_rdf }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment