Created
July 17, 2016 10:36
-
-
Save parashardhapola/595ea962f044834e787e76be1d60bd3a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import numpy as np | |
class CuffDiff(object): | |
def __init__(self, filename, up_log_fc_cutoff=1, down_log_fc_cutoff=-1, alpha=0.05): | |
self.filename = filename | |
self.upLogFCcutoff = up_log_fc_cutoff | |
self.downLogFCcutoff = down_log_fc_cutoff | |
self.alpha = alpha | |
self.conditions = self._get_conditions() | |
self.data_dict = self._read() | |
self.filtered_data_dict = self._filter() | |
self.totalGenes, self.filteredGenes = len(self.data_dict), len(self.filtered_data_dict) | |
self.upGenes, self.downGenes = self._get_diff_genes() | |
def _get_conditions(self): | |
with open(self.filename) as h: | |
h.next() | |
c = h.next().rstrip('\n').split('\t') | |
return {'1':c[4], '2':c[5]} | |
def _read(self): | |
data = {} | |
with open(self.filename) as h: | |
h.next() | |
for l in h: | |
c = l.rstrip('\n').split('\t') | |
if c[2] in data: | |
data[c[2]]['fpkm_cond1'].append(float(c[7])) | |
data[c[2]]['fpkm_cond2'].append(float(c[8])) | |
data[c[2]]['log2_fc'].append(float(c[9])) | |
data[c[2]]['p_val'].append(float(c[11])) | |
data[c[2]]['q_val'].append(float(c[12])) | |
else: | |
data[c[2]] = { | |
'fpkm_cond1': [float(c[7])], | |
'fpkm_cond2': [float(c[8])], | |
'log2_fc': [float(c[9])], | |
'p_val': [float(c[11])], | |
'q_val': [float(c[12])] | |
} | |
return data | |
def _filter(self): | |
data = {} | |
for gene in self.data_dict: | |
if np.sum(self.data_dict[gene]['fpkm_cond1']) + np.sum(self.data_dict[gene]['fpkm_cond2']) > 2: | |
data[gene] = self.data_dict[gene] | |
return data | |
def _get_diff_genes(self): | |
up_genes = {} | |
down_genes = {} | |
for gene in self.filtered_data_dict: | |
for i in range(len(self.filtered_data_dict[gene]['q_val'])): | |
if self.filtered_data_dict[gene]['q_val'][i] <= self.alpha: | |
f1 = max(self.filtered_data_dict[gene]['fpkm_cond1'][i], 1) | |
f2 = max(self.filtered_data_dict[gene]['fpkm_cond2'][i], 1) | |
if self.filtered_data_dict[gene]['log2_fc'][i] >= self.upLogFCcutoff: | |
up_genes[gene] = [f1, f2] | |
break | |
elif self.filtered_data_dict[gene]['log2_fc'][i] <= self.downLogFCcutoff: | |
down_genes[gene] = [f1, f2] | |
break | |
return up_genes, down_genes | |
def _write_output(self): | |
fpkms = ["\t".join(["Genes", self.conditions['1'], self.conditions['2']])] | |
for gene in self.upGenes: | |
fpkms.append("\t".join(map(str, [gene, np.log2(self.upGenes[gene][0]), np.log2(self.upGenes[gene][1])]))) | |
with open('up_genes.txt', 'w') as outfile: | |
outfile.write("\n".join(fpkms)) | |
fpkms = ["\t".join(["Genes", self.conditions['1'], self.conditions['2']])] | |
for gene in self.downGenes: | |
fpkms.append("\t".join(map(str, [gene, np.log2(self.downGenes[gene][0]), np.log2(self.downGenes[gene][1])]))) | |
with open('down_genes.txt', 'w') as outfile: | |
outfile.write("\n".join(fpkms)) | |
if __name__ == "__main__": | |
res = CuffDiff(sys.argv[1], up_log_fc_cutoff=np.log2(2), down_log_fc_cutoff=np.log2(0.5)) | |
print "Total %d genes processed." % res.totalGenes | |
print "Total genes after filtering low expressing genes: %d" % res.filteredGenes | |
print "Condition 1: %s; Condition 2: %s" % (res.conditions['1'], res.conditions['2']) | |
print "Number of up-regulated genes: %d" % len(res.upGenes) | |
print "Number of down-regulated genes: %d" % len(res.downGenes) | |
res._write_output() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment