Created
May 30, 2013 01:03
-
-
Save tsunagun/5675114 to your computer and use it in GitHub Desktop.
LOD Statsで公開されているVoIDファイルをRDF Sesameに登録するスクリプト.
post_rdfのrepository_idとdomainは環境に合わせて変更すること.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script that collects dataset metadata (VoID files) from LOD Stats.
require 'cgi'
require 'net/http'
require 'open-uri'
require 'timeout'
require 'uri'

require 'nokogiri'
require 'rdf'
require 'rdf/rdfxml'
require 'rdf/n3'
require 'rdf/turtle'
# Helpers for scraping the LOD Stats web site.
module LODStats
  # Fetches one LOD Stats result-list page and extracts the links it contains.
  #
  # The XPath expressions use the `xmlns:` prefix, so they only match when the
  # fetched document declares a default namespace (presumably XHTML -- confirm
  # against the live site).
  #
  # @param stats_list_page [String, URI] URI of the list page to fetch
  # @return [Array(Array<URI>, URI)] the statistics page URIs found in rows of
  #   class "rdfdoc", and the URI of the next list page; the second element is
  #   nil when no pager link follows the current-page marker
  def self.stats_pages(stats_list_page = "http://stats.lod2.eu/rdfdocs?valid=1")
    parser = Nokogiri::XML.parse(fetch(stats_list_page))
    namespaces = parser.namespaces

    stats_pages = parser.xpath("//xmlns:tr[@class='rdfdoc']//xmlns:a/@href", namespaces).map do |node|
      URI.join(stats_list_page, node.text)
    end

    # The first pager anchor that comes after the current-page marker is the next page.
    next_link = parser.at_xpath("//xmlns:div[@class='pager']//xmlns:a[preceding::xmlns:span[@class='pager_curpage']]/@href", namespaces)
    pointer = resolve_href(stats_list_page, next_link)

    return stats_pages, pointer
  end

  # Fetches a dataset statistics page and extracts the URI of its VoID file.
  #
  # @param stats_uri [String, URI] URI of the statistics page
  # @return [String, nil] absolute VoID URI, or nil when the page has no
  #   anchor following the element with id "void"
  def self.extract_void_uri(stats_uri)
    parser = Nokogiri::XML.parse(fetch(stats_uri))
    void_link = parser.at_xpath("//xmlns:a[preceding::xmlns:span[@id='void']]/@href", parser.namespaces)
    void_uri = resolve_href(stats_uri, void_link)
    void_uri.nil? ? nil : void_uri.to_s
  end

  # Downloads the body of +uri+. URI.open is used because Kernel#open no longer
  # accepts URLs on current Ruby; the block form closes the handle after reading.
  def self.fetch(uri)
    URI.open(uri) { |io| io.read }
  end
  private_class_method :fetch

  # Resolves an href attribute node against +base_uri+. Returns nil when the
  # node is missing or the href cannot be turned into a URI.
  def self.resolve_href(base_uri, href_node)
    return nil if href_node.nil?

    URI.join(base_uri, href_node.text)
  rescue URI::Error
    nil
  end
  private_class_method :resolve_href
end
# Sends RDF data to an RDF Sesame repository with an HTTP POST.
#
# Without +context+ the data is posted to the repository's statements
# endpoint; with +context+ it is posted to the rdf-graphs service endpoint
# with the context passed as the `graph` query parameter.
#
# @param rdf_string [String] serialized RDF to upload
# @param format [Symbol] :rdfxml, :turtle, :n3 or :ntriples; selects the
#   Content-Type header (any other value leaves the header value nil)
# @param context [String, nil] graph URI to store the data under
# @param repository_id [String] id of the target Sesame repository
# @param domain [String] scheme, host and port of the Sesame server
# @return [Net::HTTPResponse, false] the server response, or false when the
#   request takes longer than 30 seconds or fails with any StandardError
def post_rdf(rdf_string, format, context = nil, repository_id: "datasets", domain: "http://localhost:8080")
  repository_base = "#{domain}/openrdf-sesame/repositories/#{repository_id}"
  statement_api_uri = URI.parse("#{repository_base}/statements")
  graph_api_uri = URI.parse("#{repository_base}/rdf-graphs/service")

  content_type = case format
                 when :rdfxml then 'application/rdf+xml'
                 when :turtle then 'application/x-turtle'
                 when :n3 then 'text/rdf+n3'
                 when :ntriples then 'text/plain'
                 end
  header = { 'Content-Type' => content_type }

  begin
    if context.nil?
      endpoint = statement_api_uri
      request_path = endpoint.path
    else
      endpoint = graph_api_uri
      request_path = "#{endpoint.path}?graph=#{CGI.escape(context)}"
    end

    # Timeout.timeout replaces the bare Kernel#timeout, which is not defined
    # on current Ruby; the resulting NoMethodError used to be swallowed by the
    # rescue below, so every call returned false.
    Timeout.timeout(30) do
      client = Net::HTTP.new(endpoint.host, endpoint.port)
      client.use_ssl = (endpoint.scheme == 'https')
      client.post(request_path, rdf_string, header)
    end
  rescue StandardError
    # Timeout::Error is a StandardError, so timeouts are reported as false too.
    false
  end
end
# Collect the statistics page URI of every dataset listed on LOD Stats,
# following the pager link of each list page until none is returned.
all_stats_pages = []
uri = "http://stats.lod2.eu/rdfdocs?valid=1"
loop do
  stats_pages, pointer = LODStats.stats_pages(uri)
  all_stats_pages.concat(stats_pages)
  break if pointer.nil?

  uri = pointer
end

# For each statistics page, download the dataset's VoID file and register it
# in the RDF repository, using the VoID URI as the graph context. One status
# line is printed per page: the HTTP status code, "no_void", "timeout" or
# "false".
all_stats_pages.each do |stats_page|
  begin
    # Timeout.timeout replaces the bare Kernel#timeout, which is not defined
    # on current Ruby.
    Timeout.timeout(30) do
      void_uri = LODStats.extract_void_uri(stats_page)
      if void_uri.nil?
        puts "no_void : #{stats_page}"
      else
        # URI.open replaces Kernel#open, which no longer accepts URLs on
        # current Ruby; the block form closes the handle after reading.
        void_data = URI.open(void_uri) { |io| io.read }
        response = post_rdf(void_data, :turtle, void_uri)
        if response
          puts "#{response.code} : #{stats_page}"
        else
          # post_rdf returns false when the upload failed.
          puts "false : #{stats_page}"
        end
      end
    end
  rescue Timeout::Error
    puts "timeout : #{stats_page}"
  rescue StandardError
    puts "false : #{stats_page}"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment