ktym · December 17, 2015 03:59 · Feb 4, 2014 · Sep 11, 2013 · Sep 10, 2013 · Sep 10, 2013
diff --git a/refseq2ttl.rb b/refseq2ttl.rb
@@ -13,8 +13,16 @@ def dblink
       fetch('DBLINK')
     end
 
+    def project
+      dblink[/Project: (\d+)/, 1]
+    end
+
     def bioproject
-      dblink[/\d+/]
+      dblink[/BioProject: (\S+)/, 1]
+    end
+
+    def biosample
+      dblink[/BioSample: (\S+)/, 1]
     end
   end
 end
@@ -158,7 +166,7 @@ def set_prefixes
     @prefix = default_prefix + [
       #triple("@prefix", "genome:", "<http://purl.jp/bio/10/genome/>"),
       #triple("@prefix", "insdc:", "<http://insdc.org/owl/>"),
-      triple("@prefix", "insdc:", "<http://ddbj.nig.ac.jp/ontologies/sequence#>"),
+      triple("@prefix", "insdc:", "<http://ddbj.nig.ac.jp/ontologies/sequence/>"),
     ]
   end
 
@@ -176,9 +184,9 @@ def xref(subject, db, id)
     end
 
     if hash = @rs_id.fetch(db)
-      uri = "<#{hash['prefix']}#{id}>"
+      uri = "<#{hash['prefix']}/#{id}>"
       puts triple(subject, "rdfs:seeAlso", uri)
-      puts triple(uri, "rdfs:label", quote("#{db}:#{id}"))
+      puts triple(uri, "rdfs:label", quote(id))
       #puts triple(uri, "rdf:type", "idorg:#{hash['class']}")
       puts triple(uri, "rdf:type", "<#{hash['prefix']}>")
     else
@@ -204,9 +212,10 @@ def new_location(pos, subpart_type = false)
     puts triple(loc_id, "rdf:type", "faldo:Region")
     puts triple(loc_id, "faldo:begin", pos_start)
     puts triple(loc_id, "faldo:end", pos_end)
-    # [TODO] need to check join(100000..1000289,1..1234) type object located over the origin
-    new_position(pos_start, @locations.range.min, @locations.first.strand)
-    new_position(pos_end, @locations.range.max, @locations.last.strand)
+    # [TODO] Note that positions of an object located over the origin can be faldo:begin > faldo:end
+    # e.g., join(800..900,1000..1024,1..234) will be faldo:start 800 and faldo:end 234
+    new_position(pos_start, @locations.first.from, @locations.first.strand)
+    new_position(pos_end, @locations.last.to, @locations.last.strand)
 
     list = []
     if subpart_type
@@ -280,7 +289,14 @@ def parse_sequence
     # [TODO] rdfs:seeAlso (like UniProt) or dc:relation, owl:sameAs
     sequence_link_gi(@entry.gi.sub('GI:',''))
     sequence_link_accver(@entry.acc_version)
-    sequence_link_bioproject(@entry.bioproject)
+    if bioproject = @entry.bioproject
+      sequence_link_bioproject(bioproject)
+    elsif project = @entry.project
+      sequence_link_bioproject("PRJNA#{project}")
+    end
+    if biosample = @entry.biosample
+      sequence_link_biosample(biosample)
+    end
     # [TODO] how to deal with direct submissions (references without PMID)?
     sequence_ref(@entry.references)
   end
@@ -323,7 +339,7 @@ def sequence_length(int)
 
   def sequence_seq(str)
     # [TODO] Where to privide the actual DNA sequence?
-    fasta_uri = "<http://togows.dbcls.jp/entry/nucleotide/#{str}.fasta>"
+    fasta_uri = "<http://togows.org/entry/nucleotide/#{str}.fasta>"
     #fasta_uri = "<http://www.ncbi.nlm.nih.gov/nuccore/#{str}?report=fasta>"
     puts triple(@sequence_id, "insdc:sequence_fasta", fasta_uri)
   end
@@ -350,7 +366,21 @@ def sequence_link_accver(str)
   end
 
   def sequence_link_bioproject(str)
-    xref(@sequence_id, 'BioProject', "PRJNA#{str}")
+    xref_id = new_uuid
+    id_pfx = "http://identifiers.org/bioproject"
+    puts triple(@sequence_id, 'insdc:dblink', xref_id)
+    puts triple(xref_id, 'rdfs:label', quote(str))
+    puts triple(xref_id, 'rdfs:seeAlso', "<#{id_pfx}/#{str}>")
+    puts triple("<#{id_pfx}/#{str}>", 'rdf:type', "<#{id_pfx}>")
+  end
+
+  def sequence_link_biosample(str)
+    xref_id = new_uuid
+    id_pfx = "http://identifiers.org/biosample"
+    puts triple(@sequence_id, 'insdc:dblink', xref_id)
+    puts triple(xref_id, 'rdfs:label', quote(str))
+    puts triple(xref_id, 'rdfs:seeAlso', "<#{id_pfx}/#{str}>")
+    puts triple("<#{id_pfx}/#{str}>", 'rdf:type', "<#{id_pfx}>")
   end
 
   def sequence_ref(refs)

diff --git a/refseq2ttl.rb b/refseq2ttl.rb
@@ -138,8 +138,9 @@ def initialize(io = nil, seqtype = nil)
     @rs_id = RS_ID.new
     @ft_so = FT_SO.new
 
-    @locus = {}
+    @gene = {}
     @xref_warn = {}
+    @feature_count = Hash.new(0)
 
     puts prefix
     puts
@@ -192,7 +193,7 @@ def xref(subject, db, id)
   ### FALDO http://biohackathon.org/faldo
   ###
 
-  def new_location(pos, elem_type = false)
+  def new_location(pos, subpart_type = false)
     loc_id = new_uuid
 
     puts triple(loc_id, "insdc:location", quote(pos))
@@ -208,19 +209,19 @@ def new_location(pos, elem_type = false)
     new_position(pos_end, @locations.range.max, @locations.last.strand)
 
     list = []
-    if elem_type
+    if subpart_type
       @locations.each do |loc|
-        elem_id = new_uuid
-        elem_start = new_uuid
-        elem_end = new_uuid
-        puts triple(elem_id, "obo:so_part_of", loc_id)
-        puts triple(elem_id, "rdf:type", elem_type[:id]) + "  # #{elem_type[:term]}"
-        puts triple(elem_id, "rdf:type", "faldo:Region")
-        puts triple(elem_id, "faldo:begin", elem_start)
-        puts triple(elem_id, "faldo:end", elem_end)
-        new_position(elem_start, loc.from, loc.strand)
-        new_position(elem_end, loc.to, loc.strand)
-        list << elem_id
+        subpart_id = new_uuid
+        subpart_start = new_uuid
+        subpart_end = new_uuid
+        puts triple(subpart_id, "obo:so_part_of", loc_id)
+        puts triple(subpart_id, "rdf:type", subpart_type[:id]) + "  # #{subpart_type[:term]}"
+        puts triple(subpart_id, "rdf:type", "faldo:Region")
+        puts triple(subpart_id, "faldo:begin", subpart_start)
+        puts triple(subpart_id, "faldo:end", subpart_end)
+        new_position(subpart_start, loc.from, loc.strand)
+        new_position(subpart_end, loc.to, loc.strand)
+        list << subpart_id
       end
     end
 
@@ -250,8 +251,7 @@ def parse_refseq(io)
       @source = @features.shift
       parse_sequence
       parse_source
-      parse_parent_features # gene
-      parse_child_features  # CDS, mRNA etc.
+      parse_genes
       parse_features
     end
   end
@@ -407,13 +407,12 @@ def source_qualifiers(hash)
   end
 
   ###
-  ### Features (parent: gene)
+  ### Genes
   ###
 
-  def parse_parent_features
+  def parse_genes
     genes = @features.select {|x| x.feature == "gene"}
 
-    count = 1
     genes.each do |gene|
       gene_id = new_uuid
       hash = gene.to_hash
@@ -426,40 +425,40 @@ def parse_parent_features
 
       if hash["locus_tag"]
         locus_tag = hash["locus_tag"].first
-        @locus[locus_tag] = gene_id
+        @gene[locus_tag] = gene_id
         puts triple(gene_id, "rdfs:label", quote(locus_tag))
       elsif hash["gene"]
         gene = hash["gene"].first
-        @locus[gene] = gene_id
-        puts triple(gene_id, "rdfs:label", quote(hash["gene"].first))
+        @gene[gene] = gene_id
+        puts triple(gene_id, "rdfs:label", quote(gene))
       else
         # [TODO] Where else to find gene name?
-        puts triple(gene_id, "rdfs:label", quote("gene#{count}"))
       end
-      count += 1
-
       parse_qualifiers(gene_id, hash)
     end
   end
 
   ###
-  ### Features (child: CDS, mRNA, misc_RNA, precursor_RNA, ncRNA, tRNA, rRNA)
+  ### Features (part of gene: CDS, mRNA, misc_RNA, precursor_RNA, ncRNA, tRNA, rRNA)
   ###
 
-  def parse_child_features
-    features = @features.select {|x| x.feature[/^(CDS|mRNA|misc_RNA|precursor_RNA|ncRNA|tRNA|rRNA)$/] }
+  def parse_features
+    features = @features.select {|x| x.feature != "gene" }
 
     features.each do |feat|
       feature_id = new_uuid
       hash = feat.to_hash
 
-      case feat.feature
+      feature = feat.feature
+      @feature_count[feature] += 1
+
+      case feature
       when "CDS"
         puts triple(feature_id, "rdf:type", "obo:SO_0000316") + "  # SO:CDS"
       when "mRNA"
         puts triple(feature_id, "rdf:type", "obo:SO_0000234") + "  # SO:mRNA"
       when "misc_RNA"
-        puts triple(feature_id, "rdf:type", "obo:SO_0000673") + "  # SO:transcription"
+        puts triple(feature_id, "rdf:type", "obo:SO_0000673") + "  # SO:transcript"
       when "precursor_RNA"
         puts triple(feature_id, "rdf:type", "obo:SO_0000185") + "  # SO:primary_transcript"
       when "ncRNA"
@@ -468,70 +467,46 @@ def parse_child_features
         puts triple(feature_id, "rdf:type", "obo:SO_0000253") + "  # SO:tRNA"
       when "rRNA"
         puts triple(feature_id, "rdf:type", "obo:SO_0000252") + "  # SO:rRNA"
+      else
+        if so_id = @ft_so.so_id(feature)
+          if so_id != "undefined"
+            so = so_id.sub(':', '_')
+            puts triple(feature_id, "rdf:type", "obo:#{so}") + "  # SO:#{@ft_so.so_term(feature)}"
+          else
+            puts triple(feature_id, "rdf:type", "obo:SO_0000110") + "  # SO:sequence_feature"
+          end
+        end
       end
 
+      gene_id = locus_tag = gene = nil
       if hash["locus_tag"]
         if locus_tag = hash["locus_tag"].first
-          gene_id = @locus[locus_tag]
-        elsif gene = hash["gene"].first
-          gene_id = @locus[gene]
+          gene_id = @gene[locus_tag]
+        end
+      elsif hash["gene"]
+        if gene = hash["gene"].first
+          gene_id = @gene[gene]
         end
       end
 
+      puts triple(feature_id, "rdfs:label", quote(locus_tag || gene || feature))
+      puts triple(feature_id, "obo:so_part_of", gene_id || @sequence_id)
+
       if gene_id
-        puts triple(feature_id, "obo:so_part_of", gene_id)
+        subpart_type = { :id => "obo:SO_0000147", :term => "SO:exon" }
       else
-        # [TODO] sure to do this?
-        puts triple(feature_id, "obo:so_part_of", @sequence_id)
+        subpart_type = { :id => "obo:SO_0000001", :term => "SO:region" }
       end
 
-      if locus_tag
-        puts triple(feature_id, "rdfs:label", quote(locus_tag))
-      elsif gene
-        puts triple(feature_id, "rdfs:label", quote(gene))
-      end
-      elem_type = { :id => "obo:SO_0000147", :term => "SO:exon" }
-      loc_id, exons = new_location(feat.position, elem_type)
+      loc_id, subparts = new_location(feat.position, subpart_type)
       puts triple(feature_id, "faldo:location", loc_id)
-      puts triple(feature_id, "obo:so_has_part", "(#{exons.join(' ')})")  # rdf:List
-
-      parse_qualifiers(feature_id, hash)
-    end
-  end
-
-  ###
-  ### Features
-  ###
-
-  def parse_features
-    features = @features.select {|x| ! x.feature[/^(gene|CDS|mRNA|misc_RNA|precursor_RNA|ncRNA|tRNA|rRNA)$/] }
-    feature_count = Hash.new(0)
-
-    features.each do |feat|
-      feature = feat.feature
-      feature_count[feature] += 1
-      feature_id = new_uuid
-      hash = feat.to_hash
-
-      puts triple(feature_id, "obo:so_part_of", @sequence_id)
-      puts triple(feature_id, "rdfs:label", quote(feature))
-
-      if so_id = @ft_so.so_id(feature)
-        if so_id != "undefined"
-          so = so_id.sub(':', '_')
-          puts triple(feature_id, "rdf:type", "obo:#{so}") + "  # SO:#{@ft_so.so_term(feature)}"
-        else
-          puts triple(feature_id, "rdf:type", "obo:SO_0000110") + "  # SO:sequence_feature"
-        end
+      unless subparts.empty?
+        puts triple(feature_id, "obo:so_has_part", "(#{subparts.join(' ')})")  # rdf:List
       end
 
-      loc_id, _ = new_location(feat.position)
-      puts triple(feature_id, "faldo:location", loc_id)
-
       parse_qualifiers(feature_id, hash)
     end
-
-    $stderr.puts "Other features: #{feature_count.to_json}"
+    $stderr.puts "Features: #{@feature_count.to_json}"
   end
 
   def parse_qualifiers(feature_id, hash)

diff --git a/refseq2ttl.rb b/refseq2ttl.rb
@@ -88,6 +88,12 @@ def labels
 class FT_SO
   def initialize
     @data = JSON.parse(File.read(File.dirname(__FILE__) + "/refseq2ttl/ft_so.json"))
+    @data["ncRNA"] = {
+      "so_id" => "SO:0000655",
+      "so_term" => "ncRNA",
+      "ft_desc" => "noncoding RNA",
+      "so_desc" => "An RNA transcript that does not encode for a protein rather the RNA molecule is the gene product."
+    }
   end
 
   # ftso = FT_SO.new
@@ -244,8 +250,8 @@ def parse_refseq(io)
       @source = @features.shift
       parse_sequence
       parse_source
-      parse_genes
-      parse_cds
+      parse_parent_features # gene
+      parse_child_features  # CDS, mRNA etc.
       parse_features
     end
   end
@@ -401,10 +407,10 @@ def source_qualifiers(hash)
   end
 
   ###
-  ### genes
+  ### Features (parent: gene)
   ###
 
-  def parse_genes
+  def parse_parent_features
     genes = @features.select {|x| x.feature == "gene"}
 
     count = 1
@@ -423,6 +429,8 @@ def parse_genes
         @locus[locus_tag] = gene_id
         puts triple(gene_id, "rdfs:label", quote(locus_tag))
       elsif hash["gene"]
+        gene = hash["gene"].first
+        @locus[gene] = gene_id
         puts triple(gene_id, "rdfs:label", quote(hash["gene"].first))
       else
         # [TODO] Where else to find gene name?
@@ -435,46 +443,59 @@ def parse_genes
   end
 
   ###
-  ### CDS
+  ### Features (child: CDS, mRNA, misc_RNA, precursor_RNA, ncRNA, tRNA, rRNA)
   ###
 
-  def parse_cds
-    cdss = @features.select {|x| x.feature == "CDS"}
+  def parse_child_features
+    features = @features.select {|x| x.feature[/^(CDS|mRNA|misc_RNA|precursor_RNA|ncRNA|tRNA|rRNA)$/] }
 
-    count = 1
-    cdss.each do |cds|
-      cds_id = new_uuid
-      hash = cds.to_hash
+    features.each do |feat|
+      feature_id = new_uuid
+      hash = feat.to_hash
 
-      puts triple(cds_id, "rdf:type", "obo:SO_0000316") + "  # SO:CDS"
+      case feat.feature
+      when "CDS"
+        puts triple(feature_id, "rdf:type", "obo:SO_0000316") + "  # SO:CDS"
+      when "mRNA"
+        puts triple(feature_id, "rdf:type", "obo:SO_0000234") + "  # SO:mRNA"
+      when "misc_RNA"
+        puts triple(feature_id, "rdf:type", "obo:SO_0000673") + "  # SO:transcription"
+      when "precursor_RNA"
+        puts triple(feature_id, "rdf:type", "obo:SO_0000185") + "  # SO:primary_transcript"
+      when "ncRNA"
+        puts triple(feature_id, "rdf:type", "obo:SO_0000655") + "  # SO:ncRNA"
+      when "tRNA"
+        puts triple(feature_id, "rdf:type", "obo:SO_0000253") + "  # SO:tRNA"
+      when "rRNA"
+        puts triple(feature_id, "rdf:type", "obo:SO_0000252") + "  # SO:rRNA"
+      end
 
       if hash["locus_tag"]
         if locus_tag = hash["locus_tag"].first
           gene_id = @locus[locus_tag]
+        elsif gene = hash["gene"].first
+          gene_id = @locus[gene]
         end
       end
 
       if gene_id
-        puts triple(cds_id, "obo:so_part_of", gene_id)
+        puts triple(feature_id, "obo:so_part_of", gene_id)
       else
         # [TODO] sure to do this?
-        puts triple(cds_id, "obo:so_part_of", @sequence_id)
+        puts triple(feature_id, "obo:so_part_of", @sequence_id)
       end
 
       if locus_tag
-        puts triple(cds_id, "rdfs:label", quote(locus_tag))
-      elsif hash["gene"]
-        puts triple(cds_id, "rdfs:label", quote(hash["gene"].first))
-      else
-        puts triple(cds_id, "rdfs:label", quote("CDS#{count}"))
+        puts triple(feature_id, "rdfs:label", quote(locus_tag))
+      elsif gene
+        puts triple(feature_id, "rdfs:label", quote(gene))
       end
-      count += 1
       elem_type = { :id => "obo:SO_0000147", :term => "SO:exon" }
-      loc_id, exons = new_location(cds.position, elem_type)
-      puts triple(cds_id, "faldo:location", loc_id)
-      puts triple(cds_id, "obo:so_has_part", "(#{exons.join(' ')})")  # rdf:List
+      loc_id, exons = new_location(feat.position, elem_type)
+      puts triple(feature_id, "faldo:location", loc_id)
+      puts triple(feature_id, "obo:so_has_part", "(#{exons.join(' ')})")  # rdf:List
 
-      parse_qualifiers(cds_id, hash)
+      parse_qualifiers(feature_id, hash)
     end
   end
 
@@ -483,10 +504,12 @@ def parse_cds
   ###
 
   def parse_features
-    features = @features.select {|x| ! x.feature[/^(gene|CDS)$/]}
+    features = @features.select {|x| ! x.feature[/^(gene|CDS|mRNA|misc_RNA|precursor_RNA|ncRNA|tRNA|rRNA)$/] }
+    feature_count = Hash.new(0)
 
     features.each do |feat|
       feature = feat.feature
+      feature_count[feature] += 1
       feature_id = new_uuid
       hash = feat.to_hash
 
@@ -507,6 +530,8 @@ def parse_features
 
       parse_qualifiers(feature_id, hash)
     end
+
+    $stderr.puts "Other features: #{feature_count.to_json}"
   end
 
   def parse_qualifiers(feature_id, hash)

diff --git a/refseq2ttl.rb b/refseq2ttl.rb
@@ -125,7 +125,7 @@ class RefSeq2RDF
 
   include RDFSupport
 
-  def initialize(io = ARGF, seqtype = nil)
+  def initialize(io = nil, seqtype = nil)
     set_prefixes
 
     @seqtype = seqtype
@@ -138,16 +138,20 @@ def initialize(io = ARGF, seqtype = nil)
     puts prefix
     puts
 
-    parse_refseq(io)
+    parse_refseq(io) if io
   end
 
   attr_accessor :prefix
 
+  def self.prefixes
+    puts prefix
+  end
+
   def set_prefixes
     @prefix = default_prefix + [
       #triple("@prefix", "genome:", "<http://purl.jp/bio/10/genome/>"),
-      triple("@prefix", "idorg:", "<http://rdf.identifiers.org/database/>"),
-      triple("@prefix", "insdc:", "<http://insdc.org/owl/>"),
+      #triple("@prefix", "insdc:", "<http://insdc.org/owl/>"),
+      triple("@prefix", "insdc:", "<http://ddbj.nig.ac.jp/ontologies/sequence#>"),
     ]
   end
 
@@ -168,7 +172,8 @@ def xref(subject, db, id)
       uri = "<#{hash['prefix']}#{id}>"
       puts triple(subject, "rdfs:seeAlso", uri)
       puts triple(uri, "rdfs:label", quote("#{db}:#{id}"))
-      puts triple(uri, "rdf:type", "idorg:#{hash['class']}")
+      #puts triple(uri, "rdf:type", "idorg:#{hash['class']}")
+      puts triple(uri, "rdf:type", "<#{hash['prefix']}>")
     else
       unless @xref_warn[db]
         $stderr.puts "Error: New database '#{db}' found. Add it to the rs_id.json file and/or Identifiers.org."
@@ -339,7 +344,7 @@ def sequence_link_accver(str)
   end
 
   def sequence_link_bioproject(str)
-    xref(@sequence_id, 'BioProject', str)
+    xref(@sequence_id, 'BioProject', "PRJNA#{str}")
   end
 
   def sequence_ref(refs)
@@ -382,13 +387,13 @@ def source_qualifiers(hash)
     hash.each do |qual, vals|
       vals.each do |val|
         if val == true
-          puts triple(@source_id, "insdc:source_#{qual}", true)
+          puts triple(@source_id, "insdc:#{qual}", true)
         else        
           data = val.to_s.gsub(/\s+/, ' ').strip
           if data[/^\d+$/]
-            puts triple(@source_id, "insdc:source_#{qual}", data)
+            puts triple(@source_id, "insdc:#{qual}", data)
           else
-            puts triple(@source_id, "insdc:source_#{qual}", quote(data))
+            puts triple(@source_id, "insdc:#{qual}", quote(data))
           end
         end
       end
@@ -508,7 +513,7 @@ def parse_qualifiers(feature_id, hash)
     hash.each do |qual, vals|
       vals.each do |val|
         if val == true
-          puts triple(feature_id, "insdc:feature_#{qual}", true)
+          puts triple(feature_id, "insdc:#{qual}", true)
         else
           data = val.to_s.gsub(/\s+/, ' ').strip
           case qual
@@ -553,9 +558,9 @@ def parse_qualifiers(feature_id, hash)
             end
           else
             if data[/^\d+$/]
-              puts triple(feature_id, "insdc:feature_#{qual}", data)
+              puts triple(feature_id, "insdc:#{qual}", data)
             else
-              puts triple(feature_id, "insdc:feature_#{qual}", quote(data))
+              puts triple(feature_id, "insdc:#{qual}", quote(data))
             end
           end
         end
@@ -571,6 +576,7 @@ def parse_qualifiers(feature_id, hash)
 
   args = GetoptLong.new(
     [ '--seqtype', '-t', GetoptLong::REQUIRED_ARGUMENT ],
+    [ '--prefixes', '-p', GetoptLong::NO_ARGUMENT ],
   )
 
   opts = {
@@ -581,10 +587,16 @@ def parse_qualifiers(feature_id, hash)
     case name
     when /--seqtype/
       opts[:seqtype] = value
+    when /--prefixes/
+      opts[:prefixes] = true
     end
   end
 
-  RefSeq2RDF.new(ARGF, opts[:seqtype])
+  if opts[:prefixes]
+    RefSeq2RDF.new
+  else
+    RefSeq2RDF.new(ARGF, opts[:seqtype])
+  end
 end
 
 __END__
diff --git a/refseq2ttl.rb b/refseq2ttl.rb
@@ -0,0 +1,590 @@
+#!/usr/bin/env ruby
+
+require 'rubygems'
+require 'uri'
+require 'bio'
+require 'json'
+require 'securerandom'
+
+# [TODO] integrate this into BioRuby
+module Bio
+  class GenBank
+    def dblink
+      fetch('DBLINK')
+    end
+
+    def bioproject
+      dblink[/\d+/]
+    end
+  end
+end
+
+###
+### Utilities for RDF generation
+###
+
+module RDFSupport
+  def new_uuid(prefix = "http://purl.jp/bio/10/genome/uuid/")
+    #return "<#{prefix}#{SecureRandom.uuid}>"
+    #return "genome:uuid-#{SecureRandom.uuid}"
+    return "<urn:uuid:#{SecureRandom.uuid}>"
+  end
+
+  def quote(str)
+    return str.gsub('\\', '\\\\').gsub("\t", '\\t').gsub("\n", '\\n').gsub("\r", '\\r').gsub('"', '\\"').inspect
+  end
+
+  def triple(s, p, o)
+    return [s, p, o].join("\t") + " ."
+  end
+
+  def default_prefix
+    return [
+      triple("@prefix", "rdf:", "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>"),
+      triple("@prefix", "rdfs:", "<http://www.w3.org/2000/01/rdf-schema#>"),
+      #triple("@prefix", "dcterms:", "<http://purl.org/dc/terms/>"),
+      triple("@prefix", "xsd:", "<http://www.w3.org/2001/XMLSchema#>"),
+      #triple("@prefix", "sio:", "<http://semanticscience.org/resource#>"),
+      #triple("@prefix", "so:", "<http://purl.org/obo/owl/SO#>"),
+      triple("@prefix", "obo:", "<http://purl.obolibrary.org/obo/>"),
+      triple("@prefix", "faldo:", "<http://biohackathon.org/resource/faldo#>"),
+    ]
+  end
+
+  def usdate2date(str)
+    return Date.parse(str).strftime("%Y-%m-%d")  
+  end
+end
+
+###
+### Mapping RefSeq db_xref to Identifiers.org
+###
+
+# https://gist.github.com/3985701
+# https://gist.github.com/4146256
+class RS_ID
+  include RDFSupport
+
+  def initialize
+    @rs_id = JSON.parse(File.read(File.dirname(__FILE__) + "/refseq2ttl/rs_id.json"))
+  end
+
+  def fetch(db)
+    @rs_id[db]
+  end
+
+  def labels
+    @rs_id.sort.each do |db, hash|
+      puts triple("insdc:#{hash['class']}", "rdfs:label", quote(hash['label']))
+    end
+  end
+end
+
+###
+### Mapping RefSeq feature table to Sequence Ontology
+###
+
+# https://gist.github.com/3650401
+class FT_SO
+  def initialize
+    @data = JSON.parse(File.read(File.dirname(__FILE__) + "/refseq2ttl/ft_so.json"))
+  end
+
+  # ftso = FT_SO.new
+  # puts ftso.so_id("-10_signal")  # => "SO:0000175"
+  def so_id(feature)
+    if hash = @data[feature]
+      return hash["so_id"]
+    end
+  end
+
+  def so_term(feature)
+    if hash = @data[feature]
+      return hash["so_term"]
+    end
+  end
+
+  def so_desc(feature)
+    if hash = @data[feature]
+      return hash["so_desc"]
+    end
+  end
+
+  def ft_desc(feature)
+    if hash = @data[feature]
+      return hash["ft_desc"]
+    end
+  end
+end
+
+###
+### Convert RefSeq (prokaryote) entries to RDF
+###
+
+class RefSeq2RDF
+
+  include RDFSupport
+
+  def initialize(io = ARGF, seqtype = nil)
+    set_prefixes
+
+    @seqtype = seqtype
+    @rs_id = RS_ID.new
+    @ft_so = FT_SO.new
+
+    @locus = {}
+    @xref_warn = {}
+
+    puts prefix
+    puts
+
+    parse_refseq(io)
+  end
+
+  attr_accessor :prefix
+
+  def set_prefixes
+    @prefix = default_prefix + [
+      #triple("@prefix", "genome:", "<http://purl.jp/bio/10/genome/>"),
+      triple("@prefix", "idorg:", "<http://rdf.identifiers.org/database/>"),
+      triple("@prefix", "insdc:", "<http://insdc.org/owl/>"),
+    ]
+  end
+
+  def xref(subject, db, id)
+    case db
+    when "HOMD"
+      id.sub!(/^tax_/, '')
+    when "ECOCYC"
+      #id = "ECOCYC:#{id}"
+    when "GI", "ERIC", "HMP", "PSEUDO", "Pathema"
+      unless @xref_warn[db]
+        $stderr.puts "Warning: Need to register '#{db}' in Identifiers.org"
+        @xref_warn[db] = true
+      end
+    end
+
+    if hash = @rs_id.fetch(db)
+      uri = "<#{hash['prefix']}#{id}>"
+      puts triple(subject, "rdfs:seeAlso", uri)
+      puts triple(uri, "rdfs:label", quote("#{db}:#{id}"))
+      puts triple(uri, "rdf:type", "idorg:#{hash['class']}")
+    else
+      unless @xref_warn[db]
+        $stderr.puts "Error: New database '#{db}' found. Add it to the rs_id.json file and/or Identifiers.org."
+        @xref_warn[db] = true
+      end
+    end
+  end
+
+  ###
+  ### FALDO http://biohackathon.org/faldo
+  ###
+
+  def new_location(pos, elem_type = false)
+    loc_id = new_uuid
+
+    puts triple(loc_id, "insdc:location", quote(pos))
+
+    @locations = Bio::Locations.new(pos)
+    pos_start = new_uuid
+    pos_end = new_uuid
+    puts triple(loc_id, "rdf:type", "faldo:Region")
+    puts triple(loc_id, "faldo:begin", pos_start)
+    puts triple(loc_id, "faldo:end", pos_end)
+    # [TODO] need to check join(100000..1000289,1..1234) type object located over the origin
+    new_position(pos_start, @locations.range.min, @locations.first.strand)
+    new_position(pos_end, @locations.range.max, @locations.last.strand)
+
+    list = []
+    if elem_type
+      @locations.each do |loc|
+        elem_id = new_uuid
+        elem_start = new_uuid
+        elem_end = new_uuid
+        puts triple(elem_id, "obo:so_part_of", loc_id)
+        puts triple(elem_id, "rdf:type", elem_type[:id]) + "  # #{elem_type[:term]}"
+        puts triple(elem_id, "rdf:type", "faldo:Region")
+        puts triple(elem_id, "faldo:begin", elem_start)
+        puts triple(elem_id, "faldo:end", elem_end)
+        new_position(elem_start, loc.from, loc.strand)
+        new_position(elem_end, loc.to, loc.strand)
+        list << elem_id
+      end
+    end
+
+    return loc_id, list
+  end
+
+  def new_position(pos_id, pos, strand)
+    puts triple(pos_id, "faldo:position", pos)
+    puts triple(pos_id, "faldo:reference", @sequence_id)
+    puts triple(pos_id, "rdf:type", "faldo:ExactPosition")
+    if strand > 0
+      puts triple(pos_id, "rdf:type", "faldo:ForwardStrandPosition")
+    else
+      puts triple(pos_id, "rdf:type", "faldo:ReverseStrandPosition")
+    end
+  end
+
+  ###
+  ### Main
+  ###
+
+  def parse_refseq(io)
+    # Read RefSeq entry
+    Bio::FlatFile.auto(io).each do |entry|
+      @entry = entry
+      @features = entry.features
+      @source = @features.shift
+      parse_sequence
+      parse_source
+      parse_genes
+      parse_cds
+      parse_features
+    end
+  end
+
+  ###
+  ### Sequence
+  ###
+
+  # [TODO]
+  # * bind sequences by BioProject ID?
+  # * flag complete/draft?
+  def parse_sequence
+    @sequence_id = new_uuid
+
+    # [TODO] How to identify the input is chromosome/plasmid/contig/...?
+    sequence_type(@seqtype)
+    # [TODO] Obtain rdfs:label from source /chromosome (eukaryotes) /plasmid (prokaryotes) -> see insdc:source_chromosome, insdc:source_plasmid
+    sequence_label(@entry.definition)
+    sequence_version(@entry.acc_version)
+    sequence_length(@entry.nalen)
+    # [TODO] provide REST API to retreive genomic DNA sequence by <@sequence_id.fasta>
+    sequence_seq(@entry.acc_version)
+    sequence_form(@entry.circular)
+    # [TODO] sequenced date, modified in the source db or in our RDF data?
+    sequence_date(@entry.date)
+    # [TODO] rdfs:seeAlso (like UniProt) or dc:relation, owl:sameAs
+    sequence_link_gi(@entry.gi.sub('GI:',''))
+    sequence_link_accver(@entry.acc_version)
+    sequence_link_bioproject(@entry.bioproject)
+    # [TODO] how to deal with direct submissions (references without PMID)?
+    sequence_ref(@entry.references)
+  end
+
+  def sequence_type(so = "SO:chromosome")
+    case so
+    when /0000340/, "SO:chromosome"
+      puts triple(@sequence_id, "rdf:type", "obo:SO_0000340") + "  # SO:chromosome"
+    when /0000155/, "SO:plasmid"
+      puts triple(@sequence_id, "rdf:type", "obo:SO_0000155") + "  # SO:plasmid"
+    when /0000736/, "SO:organelle_sequence"
+      puts triple(@sequence_id, "rdf:type", "obo:SO_0000736") + "  # SO:organelle_sequence"
+    when /0000819/, "SO:mitochondrial_chromosome"
+      puts triple(@sequence_id, "rdf:type", "obo:SO_0000819") + "  # SO:mitochondrial_chromosome"
+    when /0000740/, "SO:plastid_sequence"
+      puts triple(@sequence_id, "rdf:type", "obo:SO_0000740") + "  # SO:plastid_sequence"
+    when /0000719/, "SO:ultracontig"
+      puts triple(@sequence_id, "rdf:type", "obo:SO_0000719") + "  # SO:ultracontig"
+    when /0000148/, "SO:supercontig", "SO:scaffold"
+      puts triple(@sequence_id, "rdf:type", "obo:SO_0000148") + "  # SO:supercontig/scaffold"
+    when /0000149/, "SO:contig"
+      puts triple(@sequence_id, "rdf:type", "obo:SO_0000149") + "  # SO:contig"
+    else
+      puts triple(@sequence_id, "rdf:type", "obo:SO_0000353") + "  # SO:sequence_assembly"
+    end
+  end
+
+  def sequence_label(str)
+    # Use "name:" key in the JSON representation
+    puts triple(@sequence_id, "rdfs:label", quote(str))
+  end
+
+  def sequence_version(str)
+    puts triple(@sequence_id, "insdc:sequence_version", quote(str))
+  end
+
+  def sequence_length(int)
+    puts triple(@sequence_id, "insdc:sequence_length", int)
+  end
+
+  def sequence_seq(str)
+    # [TODO] Where to privide the actual DNA sequence?
+    fasta_uri = "<http://togows.dbcls.jp/entry/nucleotide/#{str}.fasta>"
+    #fasta_uri = "<http://www.ncbi.nlm.nih.gov/nuccore/#{str}?report=fasta>"
+    puts triple(@sequence_id, "insdc:sequence_fasta", fasta_uri)
+  end
+
+  def sequence_form(form)
+    case form
+    when "linear"
+      puts triple(@sequence_id, "rdf:type", "obo:SO_0000987") + "  # SO:linear"
+    when "circular"
+      puts triple(@sequence_id, "rdf:type", "obo:SO_0000988") + "  # SO:circular"
+    end
+  end
+
+  def sequence_date(date)
+    puts triple(@sequence_id, "insdc:sequence_date", quote(usdate2date(date))+"^^xsd:date")
+  end
+
+  def sequence_link_gi(str)
+    xref(@sequence_id, 'GI', str)
+  end
+
+  def sequence_link_accver(str)
+    xref(@sequence_id, 'RefSeq', str)
+  end
+
+  def sequence_link_bioproject(str)
+    xref(@sequence_id, 'BioProject', str)
+  end
+
+  def sequence_ref(refs)
+    refs.each do |ref|
+      pmid = ref.pubmed
+      if pmid.length > 0
+        xref(@sequence_id, 'PubMed', pmid)
+      end
+    end
+  end
+
+  ###
+  ### Source
+  ###
+
+  def parse_source
+    # Use @sequence_id for @source_id
+    @source_id = @sequence_id
+
+    hash = @source.to_hash
+    source_location(@source.position)
+    source_link(hash["db_xref"])
+    hash.delete("db_xref")
+    source_qualifiers(hash)
+  end
+
+  def source_location(pos)
+    loc_id, = new_location(pos)
+    puts triple(@source_id, "faldo:location", loc_id)
+  end
+
+  def source_link(links)
+    links.each do |link|
+      db, entry_id = link.split(':', 2)
+      xref(@source_id, db, entry_id)
+    end
+  end
+
+  def source_qualifiers(hash)
+    hash.each do |qual, vals|
+      vals.each do |val|
+        if val == true
+          puts triple(@source_id, "insdc:source_#{qual}", true)
+        else        
+          data = val.to_s.gsub(/\s+/, ' ').strip
+          if data[/^\d+$/]
+            puts triple(@source_id, "insdc:source_#{qual}", data)
+          else
+            puts triple(@source_id, "insdc:source_#{qual}", quote(data))
+          end
+        end
+      end
+    end
+  end
+
+  ###
+  ### genes
+  ###
+
+  def parse_genes
+    genes = @features.select {|x| x.feature == "gene"}
+
+    count = 1
+    genes.each do |gene|
+      gene_id = new_uuid
+      hash = gene.to_hash
+
+      puts triple(gene_id, "rdf:type", "obo:SO_0000704") + "  # SO:gene"
+      puts triple(gene_id, "obo:so_part_of", @sequence_id)
+
+      loc_id, _ = new_location(gene.position)
+      puts triple(gene_id, "faldo:location", loc_id)
+
+      if hash["locus_tag"]
+        locus_tag = hash["locus_tag"].first
+        @locus[locus_tag] = gene_id
+        puts triple(gene_id, "rdfs:label", quote(locus_tag))
+      elsif hash["gene"]
+        puts triple(gene_id, "rdfs:label", quote(hash["gene"].first))
+      else
+        # [TODO] Where else to find gene name?
+        puts triple(gene_id, "rdfs:label", quote("gene#{count}"))
+      end
+      count += 1
+
+      parse_qualifiers(gene_id, hash)
+    end
+  end
+
+  ###
+  ### CDS
+  ###
+
+  def parse_cds
+    cdss = @features.select {|x| x.feature == "CDS"}
+
+    count = 1
+    cdss.each do |cds|
+      cds_id = new_uuid
+      hash = cds.to_hash
+
+      puts triple(cds_id, "rdf:type", "obo:SO_0000316") + "  # SO:CDS"
+
+      if hash["locus_tag"]
+        if locus_tag = hash["locus_tag"].first
+          gene_id = @locus[locus_tag]
+        end
+      end
+
+      if gene_id
+        puts triple(cds_id, "obo:so_part_of", gene_id)
+      else
+        # [TODO] sure to do this?
+        puts triple(cds_id, "obo:so_part_of", @sequence_id)
+      end
+
+      if locus_tag
+        puts triple(cds_id, "rdfs:label", quote(locus_tag))
+      elsif hash["gene"]
+        puts triple(cds_id, "rdfs:label", quote(hash["gene"].first))
+      else
+        puts triple(cds_id, "rdfs:label", quote("CDS#{count}"))
+      end
+      count += 1
+      elem_type = { :id => "obo:SO_0000147", :term => "SO:exon" }
+      loc_id, exons = new_location(cds.position, elem_type)
+      puts triple(cds_id, "faldo:location", loc_id)
+      puts triple(cds_id, "obo:so_has_part", "(#{exons.join(' ')})")  # rdf:List
+
+      parse_qualifiers(cds_id, hash)
+    end
+  end
+
+  ###
+  ### Features
+  ###
+
+  def parse_features
+    features = @features.select {|x| ! x.feature[/^(gene|CDS)$/]}
+
+    features.each do |feat|
+      feature = feat.feature
+      feature_id = new_uuid
+      hash = feat.to_hash
+
+      puts triple(feature_id, "obo:so_part_of", @sequence_id)
+      puts triple(feature_id, "rdfs:label", quote(feature))
+
+      if so_id = @ft_so.so_id(feature)
+        if so_id != "undefined"
+          so = so_id.sub(':', '_')
+          puts triple(feature_id, "rdf:type", "obo:#{so}") + "  # SO:#{@ft_so.so_term(feature)}"
+        else
+          puts triple(feature_id, "rdf:type", "obo:SO_0000110") + "  # SO:sequence_feature"
+        end
+      end
+
+      loc_id, _ = new_location(feat.position)
+      puts triple(feature_id, "faldo:location", loc_id)
+
+      parse_qualifiers(feature_id, hash)
+    end
+  end
+
+  def parse_qualifiers(feature_id, hash)
+    hash.each do |qual, vals|
+      vals.each do |val|
+        if val == true
+          puts triple(feature_id, "insdc:feature_#{qual}", true)
+        else
+          data = val.to_s.gsub(/\s+/, ' ').strip
+          case qual
+          when "protein_id"
+            xref(feature_id, 'Protein', val)
+          when "db_xref"
+            db, id = val.split(':', 2)
+            # ad hoc
+            if db == 'InterPro' and @entry.acc_version[/(NC_010994.1|NC_014958.1|NC_015385.1|NC_015386.1|NC_015387.1|NC_015388.1|NC_015389.1)/]
+              # PRJNA59115/NC_010994.1
+              # PRJNA62225/NC_014958.1
+              # PRJNA65781/NC_015385.1
+              # PRJNA65781/plasmids/NC_015386.1
+              # PRJNA65783/NC_015387.1
+              # PRJNA65785/NC_015388.1
+              # PRJNA65787/NC_015389.1
+              #       /db_xref="InterPro:Chromosomal replication control,
+              #       initiator (DnaA)/regulator (Hda"
+              xref(feature_id, db, id) if id[/IPR\d+/]
+            elsif db == "ASAP" and @entry.acc_version[/(NC_017263.1|NC_017264.1|NC_017265.1|NC_017266.1)/]
+              # PRJNA158537/plasmids/NC_017263.1
+              # PRJNA158537/plasmids/NC_017264.1
+              # PRJNA158537/NC_017265.1
+              # PRJNA158537/plasmids/NC_017266.1
+              #       /db_xref="ASAP:BBE-0004740"
+              #       /db_xref="ASAP:BBE-0004740 ERIC"
+              xref(feature_id, db, id) unless id[/\s/]
+            elsif db == 'TIGRFAM' and @entry.acc_version[/NC_013418.2/]
+              # PRJNA41287/NC_013418.2
+              #       /db_xref="TIGRFAM:TIGR00336, TIGR01740"
+              #       /db_xref="TIGRFAM:TIGR00197; TF"
+              if id[/,/]
+                ids = id.split(',').map{|x| x.strip}
+                ids.each do |x|
+                  xref(feature_id, db, x)
+                end
+              else
+                xref(feature_id, db, id.sub(/;.*/, ''))
+              end
+            else
+              xref(feature_id, db, id)
+            end
+          else
+            if data[/^\d+$/]
+              puts triple(feature_id, "insdc:feature_#{qual}", data)
+            else
+              puts triple(feature_id, "insdc:feature_#{qual}", quote(data))
+            end
+          end
+        end
+      end
+    end
+  end
+
+end
+
+
+if __FILE__ == $0
+  require 'getoptlong'
+
+  args = GetoptLong.new(
+    [ '--seqtype', '-t', GetoptLong::REQUIRED_ARGUMENT ],
+  )
+
+  opts = {
+    :seqtype => "SO:chromosome",
+  }
+
+  args.each_option do |name, value|
+    case name
+    when /--seqtype/
+      opts[:seqtype] = value
+    end
+  end
+
+  RefSeq2RDF.new(ARGF, opts[:seqtype])
+end
+
+__END__