Skip to content

Instantly share code, notes, and snippets.

@ktym
Last active December 17, 2015 03:59

Revisions

  1. ktym revised this gist Feb 4, 2014. 1 changed file with 40 additions and 10 deletions.
    50 changes: 40 additions & 10 deletions refseq2ttl.rb
    Original file line number Diff line number Diff line change
    @@ -13,8 +13,16 @@ def dblink
    fetch('DBLINK')
    end

    def project
    dblink[/Project: (\d+)/, 1]
    end

    def bioproject
    dblink[/\d+/]
    dblink[/BioProject: (\S+)/, 1]
    end

    def biosample
    dblink[/BioSample: (\S+)/, 1]
    end
    end
    end
    @@ -158,7 +166,7 @@ def set_prefixes
    @prefix = default_prefix + [
    #triple("@prefix", "genome:", "<http://purl.jp/bio/10/genome/>"),
    #triple("@prefix", "insdc:", "<http://insdc.org/owl/>"),
    triple("@prefix", "insdc:", "<http://ddbj.nig.ac.jp/ontologies/sequence#>"),
    triple("@prefix", "insdc:", "<http://ddbj.nig.ac.jp/ontologies/sequence/>"),
    ]
    end

    @@ -176,9 +184,9 @@ def xref(subject, db, id)
    end

    if hash = @rs_id.fetch(db)
    uri = "<#{hash['prefix']}#{id}>"
    uri = "<#{hash['prefix']}/#{id}>"
    puts triple(subject, "rdfs:seeAlso", uri)
    puts triple(uri, "rdfs:label", quote("#{db}:#{id}"))
    puts triple(uri, "rdfs:label", quote(id))
    #puts triple(uri, "rdf:type", "idorg:#{hash['class']}")
    puts triple(uri, "rdf:type", "<#{hash['prefix']}>")
    else
    @@ -204,9 +212,10 @@ def new_location(pos, subpart_type = false)
    puts triple(loc_id, "rdf:type", "faldo:Region")
    puts triple(loc_id, "faldo:begin", pos_start)
    puts triple(loc_id, "faldo:end", pos_end)
    # [TODO] need to check join(100000..1000289,1..1234) type object located over the origin
    new_position(pos_start, @locations.range.min, @locations.first.strand)
    new_position(pos_end, @locations.range.max, @locations.last.strand)
    # [TODO] Note that positions of an object located over the origin can be faldo:begin > faldo:end
    # e.g., join(800..900,1000..1024,1..234) will be faldo:start 800 and faldo:end 234
    new_position(pos_start, @locations.first.from, @locations.first.strand)
    new_position(pos_end, @locations.last.to, @locations.last.strand)

    list = []
    if subpart_type
    @@ -280,7 +289,14 @@ def parse_sequence
    # [TODO] rdfs:seeAlso (like UniProt) or dc:relation, owl:sameAs
    sequence_link_gi(@entry.gi.sub('GI:',''))
    sequence_link_accver(@entry.acc_version)
    sequence_link_bioproject(@entry.bioproject)
    if bioproject = @entry.bioproject
    sequence_link_bioproject(bioproject)
    elsif project = @entry.project
    sequence_link_bioproject("PRJNA#{project}")
    end
    if biosample = @entry.biosample
    sequence_link_biosample(biosample)
    end
    # [TODO] how to deal with direct submissions (references without PMID)?
    sequence_ref(@entry.references)
    end
    @@ -323,7 +339,7 @@ def sequence_length(int)

    def sequence_seq(str)
    # [TODO] Where to privide the actual DNA sequence?
    fasta_uri = "<http://togows.dbcls.jp/entry/nucleotide/#{str}.fasta>"
    fasta_uri = "<http://togows.org/entry/nucleotide/#{str}.fasta>"
    #fasta_uri = "<http://www.ncbi.nlm.nih.gov/nuccore/#{str}?report=fasta>"
    puts triple(@sequence_id, "insdc:sequence_fasta", fasta_uri)
    end
    @@ -350,7 +366,21 @@ def sequence_link_accver(str)
    end

    def sequence_link_bioproject(str)
    xref(@sequence_id, 'BioProject', "PRJNA#{str}")
    xref_id = new_uuid
    id_pfx = "http://identifiers.org/bioproject"
    puts triple(@sequence_id, 'insdc:dblink', xref_id)
    puts triple(xref_id, 'rdfs:label', quote(str))
    puts triple(xref_id, 'rdfs:seeAlso', "<#{id_pfx}/#{str}>")
    puts triple("<#{id_pfx}/#{str}>", 'rdf:type', "<#{id_pfx}>")
    end

    def sequence_link_biosample(str)
    xref_id = new_uuid
    id_pfx = "http://identifiers.org/biosample"
    puts triple(@sequence_id, 'insdc:dblink', xref_id)
    puts triple(xref_id, 'rdfs:label', quote(str))
    puts triple(xref_id, 'rdfs:seeAlso', "<#{id_pfx}/#{str}>")
    puts triple("<#{id_pfx}/#{str}>", 'rdf:type', "<#{id_pfx}>")
    end

    def sequence_ref(refs)
  2. ktym revised this gist Sep 11, 2013. 1 changed file with 53 additions and 78 deletions.
    131 changes: 53 additions & 78 deletions refseq2ttl.rb
    Original file line number Diff line number Diff line change
    @@ -138,8 +138,9 @@ def initialize(io = nil, seqtype = nil)
    @rs_id = RS_ID.new
    @ft_so = FT_SO.new

    @locus = {}
    @gene = {}
    @xref_warn = {}
    @feature_count = Hash.new(0)

    puts prefix
    puts
    @@ -192,7 +193,7 @@ def xref(subject, db, id)
    ### FALDO http://biohackathon.org/faldo
    ###

    def new_location(pos, elem_type = false)
    def new_location(pos, subpart_type = false)
    loc_id = new_uuid

    puts triple(loc_id, "insdc:location", quote(pos))
    @@ -208,19 +209,19 @@ def new_location(pos, elem_type = false)
    new_position(pos_end, @locations.range.max, @locations.last.strand)

    list = []
    if elem_type
    if subpart_type
    @locations.each do |loc|
    elem_id = new_uuid
    elem_start = new_uuid
    elem_end = new_uuid
    puts triple(elem_id, "obo:so_part_of", loc_id)
    puts triple(elem_id, "rdf:type", elem_type[:id]) + " # #{elem_type[:term]}"
    puts triple(elem_id, "rdf:type", "faldo:Region")
    puts triple(elem_id, "faldo:begin", elem_start)
    puts triple(elem_id, "faldo:end", elem_end)
    new_position(elem_start, loc.from, loc.strand)
    new_position(elem_end, loc.to, loc.strand)
    list << elem_id
    subpart_id = new_uuid
    subpart_start = new_uuid
    subpart_end = new_uuid
    puts triple(subpart_id, "obo:so_part_of", loc_id)
    puts triple(subpart_id, "rdf:type", subpart_type[:id]) + " # #{subpart_type[:term]}"
    puts triple(subpart_id, "rdf:type", "faldo:Region")
    puts triple(subpart_id, "faldo:begin", subpart_start)
    puts triple(subpart_id, "faldo:end", subpart_end)
    new_position(subpart_start, loc.from, loc.strand)
    new_position(subpart_end, loc.to, loc.strand)
    list << subpart_id
    end
    end

    @@ -250,8 +251,7 @@ def parse_refseq(io)
    @source = @features.shift
    parse_sequence
    parse_source
    parse_parent_features # gene
    parse_child_features # CDS, mRNA etc.
    parse_genes
    parse_features
    end
    end
    @@ -407,13 +407,12 @@ def source_qualifiers(hash)
    end

    ###
    ### Features (parent: gene)
    ### Genes
    ###

    def parse_parent_features
    def parse_genes
    genes = @features.select {|x| x.feature == "gene"}

    count = 1
    genes.each do |gene|
    gene_id = new_uuid
    hash = gene.to_hash
    @@ -426,40 +425,40 @@ def parse_parent_features

    if hash["locus_tag"]
    locus_tag = hash["locus_tag"].first
    @locus[locus_tag] = gene_id
    @gene[locus_tag] = gene_id
    puts triple(gene_id, "rdfs:label", quote(locus_tag))
    elsif hash["gene"]
    gene = hash["gene"].first
    @locus[gene] = gene_id
    puts triple(gene_id, "rdfs:label", quote(hash["gene"].first))
    @gene[gene] = gene_id
    puts triple(gene_id, "rdfs:label", quote(gene))
    else
    # [TODO] Where else to find gene name?
    puts triple(gene_id, "rdfs:label", quote("gene#{count}"))
    end
    count += 1

    parse_qualifiers(gene_id, hash)
    end
    end

    ###
    ### Features (child: CDS, mRNA, misc_RNA, precursor_RNA, ncRNA, tRNA, rRNA)
    ### Features (part of gene: CDS, mRNA, misc_RNA, precursor_RNA, ncRNA, tRNA, rRNA)
    ###

    def parse_child_features
    features = @features.select {|x| x.feature[/^(CDS|mRNA|misc_RNA|precursor_RNA|ncRNA|tRNA|rRNA)$/] }
    def parse_features
    features = @features.select {|x| x.feature != "gene" }

    features.each do |feat|
    feature_id = new_uuid
    hash = feat.to_hash

    case feat.feature
    feature = feat.feature
    @feature_count[feature] += 1

    case feature
    when "CDS"
    puts triple(feature_id, "rdf:type", "obo:SO_0000316") + " # SO:CDS"
    when "mRNA"
    puts triple(feature_id, "rdf:type", "obo:SO_0000234") + " # SO:mRNA"
    when "misc_RNA"
    puts triple(feature_id, "rdf:type", "obo:SO_0000673") + " # SO:transcription"
    puts triple(feature_id, "rdf:type", "obo:SO_0000673") + " # SO:transcript"
    when "precursor_RNA"
    puts triple(feature_id, "rdf:type", "obo:SO_0000185") + " # SO:primary_transcript"
    when "ncRNA"
    @@ -468,70 +467,46 @@ def parse_child_features
    puts triple(feature_id, "rdf:type", "obo:SO_0000253") + " # SO:tRNA"
    when "rRNA"
    puts triple(feature_id, "rdf:type", "obo:SO_0000252") + " # SO:rRNA"
    else
    if so_id = @ft_so.so_id(feature)
    if so_id != "undefined"
    so = so_id.sub(':', '_')
    puts triple(feature_id, "rdf:type", "obo:#{so}") + " # SO:#{@ft_so.so_term(feature)}"
    else
    puts triple(feature_id, "rdf:type", "obo:SO_0000110") + " # SO:sequence_feature"
    end
    end
    end

    gene_id = locus_tag = gene = nil
    if hash["locus_tag"]
    if locus_tag = hash["locus_tag"].first
    gene_id = @locus[locus_tag]
    elsif gene = hash["gene"].first
    gene_id = @locus[gene]
    gene_id = @gene[locus_tag]
    end
    elsif hash["gene"]
    if gene = hash["gene"].first
    gene_id = @gene[gene]
    end
    end

    puts triple(feature_id, "rdfs:label", quote(locus_tag || gene || feature))
    puts triple(feature_id, "obo:so_part_of", gene_id || @sequence_id)

    if gene_id
    puts triple(feature_id, "obo:so_part_of", gene_id)
    subpart_type = { :id => "obo:SO_0000147", :term => "SO:exon" }
    else
    # [TODO] sure to do this?
    puts triple(feature_id, "obo:so_part_of", @sequence_id)
    subpart_type = { :id => "obo:SO_0000001", :term => "SO:region" }
    end

    if locus_tag
    puts triple(feature_id, "rdfs:label", quote(locus_tag))
    elsif gene
    puts triple(feature_id, "rdfs:label", quote(gene))
    end
    elem_type = { :id => "obo:SO_0000147", :term => "SO:exon" }
    loc_id, exons = new_location(feat.position, elem_type)
    loc_id, subparts = new_location(feat.position, subpart_type)
    puts triple(feature_id, "faldo:location", loc_id)
    puts triple(feature_id, "obo:so_has_part", "(#{exons.join(' ')})") # rdf:List

    parse_qualifiers(feature_id, hash)
    end
    end

    ###
    ### Features
    ###

    def parse_features
    features = @features.select {|x| ! x.feature[/^(gene|CDS|mRNA|misc_RNA|precursor_RNA|ncRNA|tRNA|rRNA)$/] }
    feature_count = Hash.new(0)

    features.each do |feat|
    feature = feat.feature
    feature_count[feature] += 1
    feature_id = new_uuid
    hash = feat.to_hash

    puts triple(feature_id, "obo:so_part_of", @sequence_id)
    puts triple(feature_id, "rdfs:label", quote(feature))

    if so_id = @ft_so.so_id(feature)
    if so_id != "undefined"
    so = so_id.sub(':', '_')
    puts triple(feature_id, "rdf:type", "obo:#{so}") + " # SO:#{@ft_so.so_term(feature)}"
    else
    puts triple(feature_id, "rdf:type", "obo:SO_0000110") + " # SO:sequence_feature"
    end
    unless subparts.empty?
    puts triple(feature_id, "obo:so_has_part", "(#{subparts.join(' ')})") # rdf:List
    end

    loc_id, _ = new_location(feat.position)
    puts triple(feature_id, "faldo:location", loc_id)

    parse_qualifiers(feature_id, hash)
    end

    $stderr.puts "Other features: #{feature_count.to_json}"
    $stderr.puts "Features: #{@feature_count.to_json}"
    end

    def parse_qualifiers(feature_id, hash)
  3. ktym revised this gist Sep 10, 2013. 1 changed file with 50 additions and 25 deletions.
    75 changes: 50 additions & 25 deletions refseq2ttl.rb
    Original file line number Diff line number Diff line change
    @@ -88,6 +88,12 @@ def labels
    class FT_SO
    def initialize
    @data = JSON.parse(File.read(File.dirname(__FILE__) + "/refseq2ttl/ft_so.json"))
    @data["ncRNA"] = {
    "so_id" => "SO:0000655",
    "so_term" => "ncRNA",
    "ft_desc" => "noncoding RNA",
    "so_desc" => "An RNA transcript that does not encode for a protein rather the RNA molecule is the gene product."
    }
    end

    # ftso = FT_SO.new
    @@ -244,8 +250,8 @@ def parse_refseq(io)
    @source = @features.shift
    parse_sequence
    parse_source
    parse_genes
    parse_cds
    parse_parent_features # gene
    parse_child_features # CDS, mRNA etc.
    parse_features
    end
    end
    @@ -401,10 +407,10 @@ def source_qualifiers(hash)
    end

    ###
    ### genes
    ### Features (parent: gene)
    ###

    def parse_genes
    def parse_parent_features
    genes = @features.select {|x| x.feature == "gene"}

    count = 1
    @@ -423,6 +429,8 @@ def parse_genes
    @locus[locus_tag] = gene_id
    puts triple(gene_id, "rdfs:label", quote(locus_tag))
    elsif hash["gene"]
    gene = hash["gene"].first
    @locus[gene] = gene_id
    puts triple(gene_id, "rdfs:label", quote(hash["gene"].first))
    else
    # [TODO] Where else to find gene name?
    @@ -435,46 +443,59 @@ def parse_genes
    end

    ###
    ### CDS
    ### Features (child: CDS, mRNA, misc_RNA, precursor_RNA, ncRNA, tRNA, rRNA)
    ###

    def parse_cds
    cdss = @features.select {|x| x.feature == "CDS"}
    def parse_child_features
    features = @features.select {|x| x.feature[/^(CDS|mRNA|misc_RNA|precursor_RNA|ncRNA|tRNA|rRNA)$/] }

    count = 1
    cdss.each do |cds|
    cds_id = new_uuid
    hash = cds.to_hash
    features.each do |feat|
    feature_id = new_uuid
    hash = feat.to_hash

    puts triple(cds_id, "rdf:type", "obo:SO_0000316") + " # SO:CDS"
    case feat.feature
    when "CDS"
    puts triple(feature_id, "rdf:type", "obo:SO_0000316") + " # SO:CDS"
    when "mRNA"
    puts triple(feature_id, "rdf:type", "obo:SO_0000234") + " # SO:mRNA"
    when "misc_RNA"
    puts triple(feature_id, "rdf:type", "obo:SO_0000673") + " # SO:transcription"
    when "precursor_RNA"
    puts triple(feature_id, "rdf:type", "obo:SO_0000185") + " # SO:primary_transcript"
    when "ncRNA"
    puts triple(feature_id, "rdf:type", "obo:SO_0000655") + " # SO:ncRNA"
    when "tRNA"
    puts triple(feature_id, "rdf:type", "obo:SO_0000253") + " # SO:tRNA"
    when "rRNA"
    puts triple(feature_id, "rdf:type", "obo:SO_0000252") + " # SO:rRNA"
    end

    if hash["locus_tag"]
    if locus_tag = hash["locus_tag"].first
    gene_id = @locus[locus_tag]
    elsif gene = hash["gene"].first
    gene_id = @locus[gene]
    end
    end

    if gene_id
    puts triple(cds_id, "obo:so_part_of", gene_id)
    puts triple(feature_id, "obo:so_part_of", gene_id)
    else
    # [TODO] sure to do this?
    puts triple(cds_id, "obo:so_part_of", @sequence_id)
    puts triple(feature_id, "obo:so_part_of", @sequence_id)
    end

    if locus_tag
    puts triple(cds_id, "rdfs:label", quote(locus_tag))
    elsif hash["gene"]
    puts triple(cds_id, "rdfs:label", quote(hash["gene"].first))
    else
    puts triple(cds_id, "rdfs:label", quote("CDS#{count}"))
    puts triple(feature_id, "rdfs:label", quote(locus_tag))
    elsif gene
    puts triple(feature_id, "rdfs:label", quote(gene))
    end
    count += 1
    elem_type = { :id => "obo:SO_0000147", :term => "SO:exon" }
    loc_id, exons = new_location(cds.position, elem_type)
    puts triple(cds_id, "faldo:location", loc_id)
    puts triple(cds_id, "obo:so_has_part", "(#{exons.join(' ')})") # rdf:List
    loc_id, exons = new_location(feat.position, elem_type)
    puts triple(feature_id, "faldo:location", loc_id)
    puts triple(feature_id, "obo:so_has_part", "(#{exons.join(' ')})") # rdf:List

    parse_qualifiers(cds_id, hash)
    parse_qualifiers(feature_id, hash)
    end
    end

    @@ -483,10 +504,12 @@ def parse_cds
    ###

    def parse_features
    features = @features.select {|x| ! x.feature[/^(gene|CDS)$/]}
    features = @features.select {|x| ! x.feature[/^(gene|CDS|mRNA|misc_RNA|precursor_RNA|ncRNA|tRNA|rRNA)$/] }
    feature_count = Hash.new(0)

    features.each do |feat|
    feature = feat.feature
    feature_count[feature] += 1
    feature_id = new_uuid
    hash = feat.to_hash

    @@ -507,6 +530,8 @@ def parse_features

    parse_qualifiers(feature_id, hash)
    end

    $stderr.puts "Other features: #{feature_count.to_json}"
    end

    def parse_qualifiers(feature_id, hash)
  4. ktym revised this gist Sep 10, 2013. 1 changed file with 25 additions and 13 deletions.
    38 changes: 25 additions & 13 deletions refseq2ttl.rb
    Original file line number Diff line number Diff line change
    @@ -125,7 +125,7 @@ class RefSeq2RDF

    include RDFSupport

    def initialize(io = ARGF, seqtype = nil)
    def initialize(io = nil, seqtype = nil)
    set_prefixes

    @seqtype = seqtype
    @@ -138,16 +138,20 @@ def initialize(io = ARGF, seqtype = nil)
    puts prefix
    puts

    parse_refseq(io)
    parse_refseq(io) if io
    end

    attr_accessor :prefix

    def self.prefixes
    puts prefix
    end

    def set_prefixes
    @prefix = default_prefix + [
    #triple("@prefix", "genome:", "<http://purl.jp/bio/10/genome/>"),
    triple("@prefix", "idorg:", "<http://rdf.identifiers.org/database/>"),
    triple("@prefix", "insdc:", "<http://insdc.org/owl/>"),
    #triple("@prefix", "insdc:", "<http://insdc.org/owl/>"),
    triple("@prefix", "insdc:", "<http://ddbj.nig.ac.jp/ontologies/sequence#>"),
    ]
    end

    @@ -168,7 +172,8 @@ def xref(subject, db, id)
    uri = "<#{hash['prefix']}#{id}>"
    puts triple(subject, "rdfs:seeAlso", uri)
    puts triple(uri, "rdfs:label", quote("#{db}:#{id}"))
    puts triple(uri, "rdf:type", "idorg:#{hash['class']}")
    #puts triple(uri, "rdf:type", "idorg:#{hash['class']}")
    puts triple(uri, "rdf:type", "<#{hash['prefix']}>")
    else
    unless @xref_warn[db]
    $stderr.puts "Error: New database '#{db}' found. Add it to the rs_id.json file and/or Identifiers.org."
    @@ -339,7 +344,7 @@ def sequence_link_accver(str)
    end

    def sequence_link_bioproject(str)
    xref(@sequence_id, 'BioProject', str)
    xref(@sequence_id, 'BioProject', "PRJNA#{str}")
    end

    def sequence_ref(refs)
    @@ -382,13 +387,13 @@ def source_qualifiers(hash)
    hash.each do |qual, vals|
    vals.each do |val|
    if val == true
    puts triple(@source_id, "insdc:source_#{qual}", true)
    puts triple(@source_id, "insdc:#{qual}", true)
    else
    data = val.to_s.gsub(/\s+/, ' ').strip
    if data[/^\d+$/]
    puts triple(@source_id, "insdc:source_#{qual}", data)
    puts triple(@source_id, "insdc:#{qual}", data)
    else
    puts triple(@source_id, "insdc:source_#{qual}", quote(data))
    puts triple(@source_id, "insdc:#{qual}", quote(data))
    end
    end
    end
    @@ -508,7 +513,7 @@ def parse_qualifiers(feature_id, hash)
    hash.each do |qual, vals|
    vals.each do |val|
    if val == true
    puts triple(feature_id, "insdc:feature_#{qual}", true)
    puts triple(feature_id, "insdc:#{qual}", true)
    else
    data = val.to_s.gsub(/\s+/, ' ').strip
    case qual
    @@ -553,9 +558,9 @@ def parse_qualifiers(feature_id, hash)
    end
    else
    if data[/^\d+$/]
    puts triple(feature_id, "insdc:feature_#{qual}", data)
    puts triple(feature_id, "insdc:#{qual}", data)
    else
    puts triple(feature_id, "insdc:feature_#{qual}", quote(data))
    puts triple(feature_id, "insdc:#{qual}", quote(data))
    end
    end
    end
    @@ -571,6 +576,7 @@ def parse_qualifiers(feature_id, hash)

    args = GetoptLong.new(
    [ '--seqtype', '-t', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--prefixes', '-p', GetoptLong::NO_ARGUMENT ],
    )

    opts = {
    @@ -581,10 +587,16 @@ def parse_qualifiers(feature_id, hash)
    case name
    when /--seqtype/
    opts[:seqtype] = value
    when /--prefixes/
    opts[:prefixes] = true
    end
    end

    RefSeq2RDF.new(ARGF, opts[:seqtype])
    if opts[:prefixes]
    RefSeq2RDF.new
    else
    RefSeq2RDF.new(ARGF, opts[:seqtype])
    end
    end

    __END__
  5. ktym created this gist May 9, 2013.
    590 changes: 590 additions & 0 deletions refseq2ttl.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,590 @@
    #!/usr/bin/env ruby

    require 'rubygems'
    require 'uri'
    require 'bio'
    require 'json'
    require 'securerandom'

    # [TODO] integrate this into BioRuby
    module Bio
    class GenBank
    def dblink
    fetch('DBLINK')
    end

    def bioproject
    dblink[/\d+/]
    end
    end
    end

    ###
    ### Utilities for RDF generation
    ###

    module RDFSupport
    def new_uuid(prefix = "http://purl.jp/bio/10/genome/uuid/")
    #return "<#{prefix}#{SecureRandom.uuid}>"
    #return "genome:uuid-#{SecureRandom.uuid}"
    return "<urn:uuid:#{SecureRandom.uuid}>"
    end

    def quote(str)
    return str.gsub('\\', '\\\\').gsub("\t", '\\t').gsub("\n", '\\n').gsub("\r", '\\r').gsub('"', '\\"').inspect
    end

    def triple(s, p, o)
    return [s, p, o].join("\t") + " ."
    end

    def default_prefix
    return [
    triple("@prefix", "rdf:", "<http://www.w3.org/1999/02/22-rdf-syntax-ns#>"),
    triple("@prefix", "rdfs:", "<http://www.w3.org/2000/01/rdf-schema#>"),
    #triple("@prefix", "dcterms:", "<http://purl.org/dc/terms/>"),
    triple("@prefix", "xsd:", "<http://www.w3.org/2001/XMLSchema#>"),
    #triple("@prefix", "sio:", "<http://semanticscience.org/resource#>"),
    #triple("@prefix", "so:", "<http://purl.org/obo/owl/SO#>"),
    triple("@prefix", "obo:", "<http://purl.obolibrary.org/obo/>"),
    triple("@prefix", "faldo:", "<http://biohackathon.org/resource/faldo#>"),
    ]
    end

    def usdate2date(str)
    return Date.parse(str).strftime("%Y-%m-%d")
    end
    end

    ###
    ### Mapping RefSeq db_xref to Identifiers.org
    ###

    # https://gist.github.com/3985701
    # https://gist.github.com/4146256
    class RS_ID
    include RDFSupport

    def initialize
    @rs_id = JSON.parse(File.read(File.dirname(__FILE__) + "/refseq2ttl/rs_id.json"))
    end

    def fetch(db)
    @rs_id[db]
    end

    def labels
    @rs_id.sort.each do |db, hash|
    puts triple("insdc:#{hash['class']}", "rdfs:label", quote(hash['label']))
    end
    end
    end

    ###
    ### Mapping RefSeq feature table to Sequence Ontology
    ###

    # https://gist.github.com/3650401
    class FT_SO
    def initialize
    @data = JSON.parse(File.read(File.dirname(__FILE__) + "/refseq2ttl/ft_so.json"))
    end

    # ftso = FT_SO.new
    # puts ftso.so_id("-10_signal") # => "SO:0000175"
    def so_id(feature)
    if hash = @data[feature]
    return hash["so_id"]
    end
    end

    def so_term(feature)
    if hash = @data[feature]
    return hash["so_term"]
    end
    end

    def so_desc(feature)
    if hash = @data[feature]
    return hash["so_desc"]
    end
    end

    def ft_desc(feature)
    if hash = @data[feature]
    return hash["ft_desc"]
    end
    end
    end

    ###
    ### Convert RefSeq (prokaryote) entries to RDF
    ###

    class RefSeq2RDF

    include RDFSupport

    def initialize(io = ARGF, seqtype = nil)
    set_prefixes

    @seqtype = seqtype
    @rs_id = RS_ID.new
    @ft_so = FT_SO.new

    @locus = {}
    @xref_warn = {}

    puts prefix
    puts

    parse_refseq(io)
    end

    attr_accessor :prefix

    def set_prefixes
    @prefix = default_prefix + [
    #triple("@prefix", "genome:", "<http://purl.jp/bio/10/genome/>"),
    triple("@prefix", "idorg:", "<http://rdf.identifiers.org/database/>"),
    triple("@prefix", "insdc:", "<http://insdc.org/owl/>"),
    ]
    end

    def xref(subject, db, id)
    case db
    when "HOMD"
    id.sub!(/^tax_/, '')
    when "ECOCYC"
    #id = "ECOCYC:#{id}"
    when "GI", "ERIC", "HMP", "PSEUDO", "Pathema"
    unless @xref_warn[db]
    $stderr.puts "Warning: Need to register '#{db}' in Identifiers.org"
    @xref_warn[db] = true
    end
    end

    if hash = @rs_id.fetch(db)
    uri = "<#{hash['prefix']}#{id}>"
    puts triple(subject, "rdfs:seeAlso", uri)
    puts triple(uri, "rdfs:label", quote("#{db}:#{id}"))
    puts triple(uri, "rdf:type", "idorg:#{hash['class']}")
    else
    unless @xref_warn[db]
    $stderr.puts "Error: New database '#{db}' found. Add it to the rs_id.json file and/or Identifiers.org."
    @xref_warn[db] = true
    end
    end
    end

    ###
    ### FALDO http://biohackathon.org/faldo
    ###

    def new_location(pos, elem_type = false)
    loc_id = new_uuid

    puts triple(loc_id, "insdc:location", quote(pos))

    @locations = Bio::Locations.new(pos)
    pos_start = new_uuid
    pos_end = new_uuid
    puts triple(loc_id, "rdf:type", "faldo:Region")
    puts triple(loc_id, "faldo:begin", pos_start)
    puts triple(loc_id, "faldo:end", pos_end)
    # [TODO] need to check join(100000..1000289,1..1234) type object located over the origin
    new_position(pos_start, @locations.range.min, @locations.first.strand)
    new_position(pos_end, @locations.range.max, @locations.last.strand)

    list = []
    if elem_type
    @locations.each do |loc|
    elem_id = new_uuid
    elem_start = new_uuid
    elem_end = new_uuid
    puts triple(elem_id, "obo:so_part_of", loc_id)
    puts triple(elem_id, "rdf:type", elem_type[:id]) + " # #{elem_type[:term]}"
    puts triple(elem_id, "rdf:type", "faldo:Region")
    puts triple(elem_id, "faldo:begin", elem_start)
    puts triple(elem_id, "faldo:end", elem_end)
    new_position(elem_start, loc.from, loc.strand)
    new_position(elem_end, loc.to, loc.strand)
    list << elem_id
    end
    end

    return loc_id, list
    end

    def new_position(pos_id, pos, strand)
    puts triple(pos_id, "faldo:position", pos)
    puts triple(pos_id, "faldo:reference", @sequence_id)
    puts triple(pos_id, "rdf:type", "faldo:ExactPosition")
    if strand > 0
    puts triple(pos_id, "rdf:type", "faldo:ForwardStrandPosition")
    else
    puts triple(pos_id, "rdf:type", "faldo:ReverseStrandPosition")
    end
    end

    ###
    ### Main
    ###

    def parse_refseq(io)
    # Read RefSeq entry
    Bio::FlatFile.auto(io).each do |entry|
    @entry = entry
    @features = entry.features
    @source = @features.shift
    parse_sequence
    parse_source
    parse_genes
    parse_cds
    parse_features
    end
    end

    ###
    ### Sequence
    ###

    # [TODO]
    # * bind sequences by BioProject ID?
    # * flag complete/draft?
    def parse_sequence
    @sequence_id = new_uuid

    # [TODO] How to identify the input is chromosome/plasmid/contig/...?
    sequence_type(@seqtype)
    # [TODO] Obtain rdfs:label from source /chromosome (eukaryotes) /plasmid (prokaryotes) -> see insdc:source_chromosome, insdc:source_plasmid
    sequence_label(@entry.definition)
    sequence_version(@entry.acc_version)
    sequence_length(@entry.nalen)
    # [TODO] provide REST API to retreive genomic DNA sequence by <@sequence_id.fasta>
    sequence_seq(@entry.acc_version)
    sequence_form(@entry.circular)
    # [TODO] sequenced date, modified in the source db or in our RDF data?
    sequence_date(@entry.date)
    # [TODO] rdfs:seeAlso (like UniProt) or dc:relation, owl:sameAs
    sequence_link_gi(@entry.gi.sub('GI:',''))
    sequence_link_accver(@entry.acc_version)
    sequence_link_bioproject(@entry.bioproject)
    # [TODO] how to deal with direct submissions (references without PMID)?
    sequence_ref(@entry.references)
    end

    def sequence_type(so = "SO:chromosome")
    case so
    when /0000340/, "SO:chromosome"
    puts triple(@sequence_id, "rdf:type", "obo:SO_0000340") + " # SO:chromosome"
    when /0000155/, "SO:plasmid"
    puts triple(@sequence_id, "rdf:type", "obo:SO_0000155") + " # SO:plasmid"
    when /0000736/, "SO:organelle_sequence"
    puts triple(@sequence_id, "rdf:type", "obo:SO_0000736") + " # SO:organelle_sequence"
    when /0000819/, "SO:mitochondrial_chromosome"
    puts triple(@sequence_id, "rdf:type", "obo:SO_0000819") + " # SO:mitochondrial_chromosome"
    when /0000740/, "SO:plastid_sequence"
    puts triple(@sequence_id, "rdf:type", "obo:SO_0000740") + " # SO:plastid_sequence"
    when /0000719/, "SO:ultracontig"
    puts triple(@sequence_id, "rdf:type", "obo:SO_0000719") + " # SO:ultracontig"
    when /0000148/, "SO:supercontig", "SO:scaffold"
    puts triple(@sequence_id, "rdf:type", "obo:SO_0000148") + " # SO:supercontig/scaffold"
    when /0000149/, "SO:contig"
    puts triple(@sequence_id, "rdf:type", "obo:SO_0000149") + " # SO:contig"
    else
    puts triple(@sequence_id, "rdf:type", "obo:SO_0000353") + " # SO:sequence_assembly"
    end
    end

    def sequence_label(str)
    # Use "name:" key in the JSON representation
    puts triple(@sequence_id, "rdfs:label", quote(str))
    end

    def sequence_version(str)
    puts triple(@sequence_id, "insdc:sequence_version", quote(str))
    end

    def sequence_length(int)
    puts triple(@sequence_id, "insdc:sequence_length", int)
    end

    def sequence_seq(str)
    # [TODO] Where to privide the actual DNA sequence?
    fasta_uri = "<http://togows.dbcls.jp/entry/nucleotide/#{str}.fasta>"
    #fasta_uri = "<http://www.ncbi.nlm.nih.gov/nuccore/#{str}?report=fasta>"
    puts triple(@sequence_id, "insdc:sequence_fasta", fasta_uri)
    end

    def sequence_form(form)
    case form
    when "linear"
    puts triple(@sequence_id, "rdf:type", "obo:SO_0000987") + " # SO:linear"
    when "circular"
    puts triple(@sequence_id, "rdf:type", "obo:SO_0000988") + " # SO:circular"
    end
    end

    def sequence_date(date)
    puts triple(@sequence_id, "insdc:sequence_date", quote(usdate2date(date))+"^^xsd:date")
    end

    def sequence_link_gi(str)
    xref(@sequence_id, 'GI', str)
    end

    def sequence_link_accver(str)
    xref(@sequence_id, 'RefSeq', str)
    end

    def sequence_link_bioproject(str)
    xref(@sequence_id, 'BioProject', str)
    end

    def sequence_ref(refs)
    refs.each do |ref|
    pmid = ref.pubmed
    if pmid.length > 0
    xref(@sequence_id, 'PubMed', pmid)
    end
    end
    end

    ###
    ### Source
    ###

    def parse_source
    # Use @sequence_id for @source_id
    @source_id = @sequence_id

    hash = @source.to_hash
    source_location(@source.position)
    source_link(hash["db_xref"])
    hash.delete("db_xref")
    source_qualifiers(hash)
    end

    def source_location(pos)
    loc_id, = new_location(pos)
    puts triple(@source_id, "faldo:location", loc_id)
    end

    def source_link(links)
    links.each do |link|
    db, entry_id = link.split(':', 2)
    xref(@source_id, db, entry_id)
    end
    end

    def source_qualifiers(hash)
    hash.each do |qual, vals|
    vals.each do |val|
    if val == true
    puts triple(@source_id, "insdc:source_#{qual}", true)
    else
    data = val.to_s.gsub(/\s+/, ' ').strip
    if data[/^\d+$/]
    puts triple(@source_id, "insdc:source_#{qual}", data)
    else
    puts triple(@source_id, "insdc:source_#{qual}", quote(data))
    end
    end
    end
    end
    end

    ###
    ### genes
    ###

    def parse_genes
    genes = @features.select {|x| x.feature == "gene"}

    count = 1
    genes.each do |gene|
    gene_id = new_uuid
    hash = gene.to_hash

    puts triple(gene_id, "rdf:type", "obo:SO_0000704") + " # SO:gene"
    puts triple(gene_id, "obo:so_part_of", @sequence_id)

    loc_id, _ = new_location(gene.position)
    puts triple(gene_id, "faldo:location", loc_id)

    if hash["locus_tag"]
    locus_tag = hash["locus_tag"].first
    @locus[locus_tag] = gene_id
    puts triple(gene_id, "rdfs:label", quote(locus_tag))
    elsif hash["gene"]
    puts triple(gene_id, "rdfs:label", quote(hash["gene"].first))
    else
    # [TODO] Where else to find gene name?
    puts triple(gene_id, "rdfs:label", quote("gene#{count}"))
    end
    count += 1

    parse_qualifiers(gene_id, hash)
    end
    end

    ###
    ### CDS
    ###

    def parse_cds
    cdss = @features.select {|x| x.feature == "CDS"}

    count = 1
    cdss.each do |cds|
    cds_id = new_uuid
    hash = cds.to_hash

    puts triple(cds_id, "rdf:type", "obo:SO_0000316") + " # SO:CDS"

    if hash["locus_tag"]
    if locus_tag = hash["locus_tag"].first
    gene_id = @locus[locus_tag]
    end
    end

    if gene_id
    puts triple(cds_id, "obo:so_part_of", gene_id)
    else
    # [TODO] sure to do this?
    puts triple(cds_id, "obo:so_part_of", @sequence_id)
    end

    if locus_tag
    puts triple(cds_id, "rdfs:label", quote(locus_tag))
    elsif hash["gene"]
    puts triple(cds_id, "rdfs:label", quote(hash["gene"].first))
    else
    puts triple(cds_id, "rdfs:label", quote("CDS#{count}"))
    end
    count += 1
    elem_type = { :id => "obo:SO_0000147", :term => "SO:exon" }
    loc_id, exons = new_location(cds.position, elem_type)
    puts triple(cds_id, "faldo:location", loc_id)
    puts triple(cds_id, "obo:so_has_part", "(#{exons.join(' ')})") # rdf:List

    parse_qualifiers(cds_id, hash)
    end
    end

    ###
    ### Features
    ###

    def parse_features
    features = @features.select {|x| ! x.feature[/^(gene|CDS)$/]}

    features.each do |feat|
    feature = feat.feature
    feature_id = new_uuid
    hash = feat.to_hash

    puts triple(feature_id, "obo:so_part_of", @sequence_id)
    puts triple(feature_id, "rdfs:label", quote(feature))

    if so_id = @ft_so.so_id(feature)
    if so_id != "undefined"
    so = so_id.sub(':', '_')
    puts triple(feature_id, "rdf:type", "obo:#{so}") + " # SO:#{@ft_so.so_term(feature)}"
    else
    puts triple(feature_id, "rdf:type", "obo:SO_0000110") + " # SO:sequence_feature"
    end
    end

    loc_id, _ = new_location(feat.position)
    puts triple(feature_id, "faldo:location", loc_id)

    parse_qualifiers(feature_id, hash)
    end
    end

    def parse_qualifiers(feature_id, hash)
    hash.each do |qual, vals|
    vals.each do |val|
    if val == true
    puts triple(feature_id, "insdc:feature_#{qual}", true)
    else
    data = val.to_s.gsub(/\s+/, ' ').strip
    case qual
    when "protein_id"
    xref(feature_id, 'Protein', val)
    when "db_xref"
    db, id = val.split(':', 2)
    # ad hoc
    if db == 'InterPro' and @entry.acc_version[/(NC_010994.1|NC_014958.1|NC_015385.1|NC_015386.1|NC_015387.1|NC_015388.1|NC_015389.1)/]
    # PRJNA59115/NC_010994.1
    # PRJNA62225/NC_014958.1
    # PRJNA65781/NC_015385.1
    # PRJNA65781/plasmids/NC_015386.1
    # PRJNA65783/NC_015387.1
    # PRJNA65785/NC_015388.1
    # PRJNA65787/NC_015389.1
    # /db_xref="InterPro:Chromosomal replication control,
    # initiator (DnaA)/regulator (Hda"
    xref(feature_id, db, id) if id[/IPR\d+/]
    elsif db == "ASAP" and @entry.acc_version[/(NC_017263.1|NC_017264.1|NC_017265.1|NC_017266.1)/]
    # PRJNA158537/plasmids/NC_017263.1
    # PRJNA158537/plasmids/NC_017264.1
    # PRJNA158537/NC_017265.1
    # PRJNA158537/plasmids/NC_017266.1
    # /db_xref="ASAP:BBE-0004740"
    # /db_xref="ASAP:BBE-0004740 ERIC"
    xref(feature_id, db, id) unless id[/\s/]
    elsif db == 'TIGRFAM' and @entry.acc_version[/NC_013418.2/]
    # PRJNA41287/NC_013418.2
    # /db_xref="TIGRFAM:TIGR00336, TIGR01740"
    # /db_xref="TIGRFAM:TIGR00197; TF"
    if id[/,/]
    ids = id.split(',').map{|x| x.strip}
    ids.each do |x|
    xref(feature_id, db, x)
    end
    else
    xref(feature_id, db, id.sub(/;.*/, ''))
    end
    else
    xref(feature_id, db, id)
    end
    else
    if data[/^\d+$/]
    puts triple(feature_id, "insdc:feature_#{qual}", data)
    else
    puts triple(feature_id, "insdc:feature_#{qual}", quote(data))
    end
    end
    end
    end
    end
    end

    end


    if __FILE__ == $0
    require 'getoptlong'

    args = GetoptLong.new(
    [ '--seqtype', '-t', GetoptLong::REQUIRED_ARGUMENT ],
    )

    opts = {
    :seqtype => "SO:chromosome",
    }

    args.each_option do |name, value|
    case name
    when /--seqtype/
    opts[:seqtype] = value
    end
    end

    RefSeq2RDF.new(ARGF, opts[:seqtype])
    end

    __END__