Created
May 27, 2011 00:39
-
-
Save roshank/994419 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'logger' | |
require 'net/http' | |
require 'open-uri' | |
#Roshan Khan | |
#CS 590B | |
#HW4 - HMM | |
class Fasta | |
attr_accessor :dna, :html | |
def initialize(code) | |
p "Retrieving Fasta for #{code}" | |
@html = File.open(code).read | |
@dna = html.split(/\n/)[1..-1].join | |
p "Fasta retrieved, length #{@dna.length}" | |
replacement = 0 | |
while ([email protected](/[BDEFHIJKLMNOPQRSUVWXYZ]/).nil?) | |
@dna[/[BDEFHIJKLMNOPQRSUVWXYZ]/] = 'T' | |
replacement += 1 | |
end | |
p "Replaced #{replacement} proteins with T" | |
end | |
end | |
class ViterbiPath | |
STATES = ['STANDARD', 'CG_RICH'] | |
OBS = ['A', 'C', 'G', 'T'] | |
START = {'STANDARD' , 0.9999, 'CG_RICH' , 0.0001} | |
TRANS = {'STANDARD' , {'STANDARD' , 0.9999, 'CG_RICH' , 0.0001} , 'CG_RICH' , {'STANDARD' , 0.01, 'CG_RICH' , 0.99} } | |
#TRANS = {'STANDARD' , {'STANDARD' , 0.6, 'CG_RICH' , 0.4} , 'CG_RICH' , {'STANDARD' , 0.2, 'CG_RICH' , 0.8} } | |
EMIT = {'STANDARD' , { 'A' , 0.25, 'C' , 0.25, 'G' , 0.25, 'T',0.25 }, 'CG_RICH' , {'A' , 0.2, 'C' , 0.3, 'G' , 0.3, 'T',0.2} } | |
# Initalize data structure for holding calculations. States x Dna array | |
def initialize (dna) | |
@dna = dna | |
end | |
def traceback | |
@path = [] | |
last_parent = "" | |
# Find highest value in the last node, to see where to start trace | |
if (@vit[@vit.length-1]['STANDARD'] > @vit[@vit.length-1]['CG_RICH']) | |
p "Probability of this path is " + @vit[@vit.length-1]['STANDARD'].to_s | |
@path << @vit[@vit.length-1]['STANDARDparent'] | |
lastparent = @vit[@vit.length-1]['STANDARDparent'] | |
else | |
p "Probability of this path is " + @vit[@vit.length-1]['CG_RICH'].to_s | |
@path << @vit[@vit.length-1]['CG_RICHparent'] | |
lastparent = @vit[@vit.length-1]['CG_RICHparent'] | |
end | |
# Now count back from last node - 1 | |
count = @vit.length - 2 | |
while (count >= 0) | |
# New path is viterbi sequence position, and look up parent value | |
@path << @vit[count][lastparent+'parent'] | |
lastparent = @vit[count][lastparent+'parent'] | |
count -= 1 | |
end | |
@path = @path.reverse | |
end | |
def train | |
p "Initiating training" | |
tc = { "STANDARD", {"STANDARD", 0.0, "CG_RICH", 0.0}, "CG_RICH", {"STANDARD", 0.0, "CG_RICH", 0.0} } | |
ec = {'STANDARD' , { 'A' , 0.0, 'C' , 0.0, 'G' , 0.0, 'T',0.0 }, 'CG_RICH' , {'A' , 0.0, 'C' , 0.0, 'G' , 0.0, 'T',0.0} } | |
cur = @path[1] | |
for i in ([email protected]) | |
tc[cur][@path[i]] += 1 | |
ec[cur][@dna[i].chr] += 1 | |
cur = @path[i] | |
end | |
standard_total = tc["STANDARD"]["STANDARD"] + tc["STANDARD"]["CG_RICH"] | |
cg_total = tc["CG_RICH"]["CG_RICH"] + tc["CG_RICH"]["STANDARD"] | |
TRANS["STANDARD"]["STANDARD"] = tc["STANDARD"]["STANDARD"] / standard_total | |
TRANS["STANDARD"]["CG_RICH"] = tc["STANDARD"]["CG_RICH"] / standard_total | |
TRANS["CG_RICH"]["STANDARD"] = tc["CG_RICH"]["STANDARD"] / cg_total | |
TRANS["CG_RICH"]["CG_RICH"] = tc["CG_RICH"]["CG_RICH"] / cg_total | |
s_total = ec["STANDARD"]["A"] + ec["STANDARD"]["C"] + ec["STANDARD"]["G"] + ec["STANDARD"]["T"] | |
g_total = ec["CG_RICH"]["A"] + ec["CG_RICH"]["C"] + ec["CG_RICH"]["G"] + ec["CG_RICH"]["T"] | |
EMIT["STANDARD"]["A"] = ec["STANDARD"]["A"] / s_total | |
EMIT["STANDARD"]["C"] = ec["STANDARD"]["C"] / s_total | |
EMIT["STANDARD"]["G"] = ec["STANDARD"]["G"] / s_total | |
EMIT["STANDARD"]["T"] = ec["STANDARD"]["T"] / s_total | |
EMIT["CG_RICH"]["A"] = ec["CG_RICH"]["A"] / g_total | |
EMIT["CG_RICH"]["C"] = ec["CG_RICH"]["C"] / g_total | |
EMIT["CG_RICH"]["G"] = ec["CG_RICH"]["G"] / g_total | |
EMIT["CG_RICH"]["T"] = ec["CG_RICH"]["T"] / g_total | |
p "New parameters are" | |
p "TRANSITION" | |
p TRANS | |
p "EMISSION" | |
p EMIT | |
end | |
# 1->1 = 1->1 / 1->1+1->2 | |
# 1->2 = 1->2 / 1->1+1->2 | |
# 2->1 = 2->1/ 2->1 + 2->2 | |
# 2->2 = 2->2 / 2->2 + 2->2 | |
# same thing for distribution | |
def print_cg_areas | |
in_cg = false | |
start_node = 0 | |
len = 0 | |
hits = 0 | |
@path.each_with_index do |state, index| | |
if (state == "CG_RICH") | |
if (in_cg == true) | |
len+=1 | |
else | |
in_cg = true | |
start_node = index | |
len=1 | |
end | |
else | |
if (in_cg == true) | |
hits += 1 | |
# FINISHED A CG_RICH AREA - PRINT THIS | |
p "CG_Rich area from #{start_node} to #{start_node + len} which is #{len} bp long." | |
odds = @vit[index]['CG_RICH'] | |
end | |
in_cg = false | |
len = 0 | |
end | |
end | |
p "Total number of hits in this sequence is #{hits}" | |
end | |
def longest_rich_area | |
in_cg = false | |
start_node = 0 | |
longest_start = 0 | |
longest_len = 0 | |
len = 0 | |
@path.each_with_index do |state, index| | |
if (state == "CG_RICH") | |
if (in_cg == true) | |
len+=1 | |
else | |
in_cg = true | |
start_node = index | |
len=1 | |
end | |
else | |
in_cg = false | |
len = 0 | |
end | |
if (longest_len < len) | |
longest_len = len | |
longest_start = start_node | |
end | |
end | |
p longest_len | |
p longest_start | |
#p @path[longest_start...longest_start+longest_len] | |
end | |
#TODO: Change this function name | |
# Calculate state values across all nodes in the chain | |
def predict() | |
@vit = Array.new() | |
@vit << {} | |
for state in STATES | |
@vit[0][state] = Math.log(START[state] * EMIT[state][@dna[0].chr]) | |
end | |
start = Time.now | |
for t in ([email protected]) | |
@vit << {} | |
if (t % 100000 == 0) | |
p "Completed #{t} in #{Time.now - start} seconds" | |
end | |
for y in STATES | |
values = [] | |
for y0 in STATES | |
values << [@vit[t-1][y0] + Math.log(TRANS[y0][y] * EMIT[y][@dna[t].chr]), y0] | |
end | |
@vit[t][y] = values.max[0] | |
@vit[t][y + 'parent'] = values.max[1] | |
end | |
end | |
end | |
end | |
if __FILE__ == $0 | |
f = Fasta.new(ARGV[0]) | |
vp = ViterbiPath.new(f.dna) | |
(0...10).each do |i| | |
p "---------------------------- ITERATION #{i+1} ----------------------------" | |
vp.predict | |
vp.traceback | |
vp.print_cg_areas | |
vp.train | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment