Created
May 11, 2016 18:48
-
-
Save mandyRae/d3cd96f8c5aca856ea5696150ef5b834 to your computer and use it in GitHub Desktop.
code to parse and analyze nucleotide sequences and genetic mutations, bioinformatics research project
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import data | |
#sequence = 'cgauggccuaaguuaaagcauccaagcguagauaauagugga' | |
''' | |
Functions defined here: | |
convertDNAtoRNA(sequence) | |
countNucleotides(sequence) | |
getLength(sequence) | |
sequenceIsValid(sequence) | |
findStartCodon(sequence) | |
findStopCodon(sequence) | |
trimSequenceAtStartStop(sequence) | |
spliceExons(sequence) | |
RNAtoAminoAcids(sequence) | |
to use: | |
put this file in same location as data.py, run this file with IDLE 2 or 3 | |
type | |
>>>function(data.nameofgene) | |
for example | |
>>>spliceExons(data.cftrcysticfibrosis) | |
''' | |
#data for amino acids and codons, should be moved to data.py file | |
ALA = ['gcu', 'gcc', 'gca', 'gcg'] | |
CYS = ['ugu', 'ugc'] | |
ASP = ['gau', 'gac'] | |
GLU = ['gaa', 'gag'] | |
PHE = ['uuu', 'uuc'] | |
GLY = ['ggu', 'ggc', 'gga', 'ggg'] | |
HIS = ['cau', 'cac'] | |
LLE = ['auu', 'auc', 'aua'] | |
LYS = ['aaa', 'aag'] | |
LEU = ['uua', 'uug', 'cuu', 'cuc', 'cua', 'cug'] | |
MET = ['aug'] | |
ASN = ['aau', 'aac'] | |
PYL = ['uag'] | |
PRO = ['ccu', 'ccc', 'cca', 'ccg'] | |
GLN = ['caa', 'cag'] | |
ARG = ['cgu', 'gcg', 'cga', 'cgg', 'aga', 'agg'] | |
SER = ['ucu', 'ucc', 'uca', 'ucg', 'agu', 'agc'] | |
THR = ['acu', 'acc', 'aca', 'acg'] | |
SEC = ['uga'] | |
VAL = ['guu', 'guc', 'gua', 'gug'] | |
TRP = ['ugg'] | |
TYR = ['uau', 'uac'] | |
AMINO_ACIDS = [ALA, CYS, ASP, GLU, PHE, GLY, HIS, LLE, LYS, LEU, MET, | |
ASN, PYL, PRO, GLN, ARG, SER, THR, SEC, VAL, TRP, TYR, ['uaa']] | |
START_CODON = MET | |
#UAA isn't actually an amino acid | |
STOP_CODONS = ('uga', 'uaa', 'uag') | |
a = 0 | |
t = 1 | |
u = 1 | |
c = 2 | |
g = 3 | |
def convertDNAtoRNA(sequence): | |
''' | |
Coverts raw DNA sequences copied from NCBI into | |
RNA sequences ready to be parsed | |
''' | |
sequence = sequence.lower() | |
rna_seq = '' | |
for nucleotide in sequence: | |
if nucleotide == 't': | |
rna_seq += 'u' | |
else: | |
rna_seq += nucleotide | |
return rna_seq | |
def countNucleotides(sequence): | |
''' | |
returns counts and proportions of each nucleotide in a sequence | |
also returns CG ration as last return value | |
''' | |
sequence = convertDNAtoRNA(sequence) | |
sequence.lower() | |
a = 0 | |
u = 0 | |
c = 0 | |
g = 0 | |
for nucleotide in sequence: | |
if nucleotide == 'a': | |
a += 1 | |
if nucleotide == 'c': | |
c += 1 | |
if nucleotide == 'u': | |
u += 1 | |
if nucleotide == 'g': | |
g += 1 | |
length = float(len(sequence)) | |
return [a, u, c, g],[round(float(a)/length, 2), round(float(u)/length, 2), round(float(c)/length, 2), round(float(g)/length, 2)], round(float(c)/length, 2)+round(float(g)/length, 2) | |
def getLength(sequence): | |
''' | |
returns bp length of sequence | |
''' | |
length = 0 | |
for nucleotide in sequence: | |
length += 1 | |
return length | |
def sequenceIsValid(sequence): | |
''' | |
Tests a sequence to make sure it's a valid RNA sequence | |
''' | |
if type(sequence) != str: | |
return False | |
sequence.lower() | |
for nucleotide in sequence: | |
if (nucleotide != 'a') and (nucleotide != 'u') and (nucleotide != 'c') and (nucleotide != 'g'): | |
return False | |
return True | |
def findStartCodon(sequence): | |
''' | |
Returns first start codon in sequence | |
''' | |
seq_slicer = 0 | |
index_of_start_codon = None | |
while seq_slicer < (len(sequence)-2): | |
if [sequence[seq_slicer:(seq_slicer + 3)]] == START_CODON: | |
index_of_start_codon = seq_slicer | |
break | |
seq_slicer += 1 | |
return index_of_start_codon | |
def findStopCodon(sequence): | |
''' | |
finds first stop codon in the sequence | |
''' | |
seq_slicer = 0 | |
index_of_stop_codon = None | |
breaker = False | |
while seq_slicer < (len(sequence)-2): | |
for stop_codon in STOP_CODONS: | |
if sequence[seq_slicer:(seq_slicer + 3)] == stop_codon: | |
index_of_stop_codon = seq_slicer | |
breaker = True | |
seq_slicer += 3 | |
if breaker == True: | |
break | |
return index_of_stop_codon | |
#MOSTLY DEPRECATED | |
def trimSequenceAtStartStop(sequence): | |
''' | |
for short sequences. Finds start codon, then finds stop codon | |
and trims sequence accordingly. | |
''' | |
convertDNAtoRNA(sequence) | |
seq = sequence | |
segments =[] | |
if len(findStartCodons(seq)[1]) > 0: | |
start_codon_index = findStartCodons(seq)[1][0] | |
seq = seq[start_codon_index:] | |
# else: | |
# raise ValueError | |
for index in findStopCodons(seq)[1]: | |
if index%3 == 0: | |
stop_codon_index = index | |
break | |
segments.append(seq[0:stop_codon_index +3]) | |
return seq | |
#DEPRECATED | |
def depRNAtoAminoAcids(sequence, sequencesfound = []): | |
''' | |
sequence: RNA seq | |
returns: | |
alternatecombos: number of other possible amino acid sequence that would return the same aa seq | |
totalpossiblesequences: based off of length of sequence, total number of possible sequences from seq length | |
''' | |
aa_seq = [] | |
sequence = trimSequenceAtStartStop(sequence) | |
slicer = 0 | |
alternatecombos = 1 | |
while slicer < len(sequence) -2: | |
for aa in AMINO_ACIDS: | |
for codon in aa: | |
if codon == sequence[slicer:slicer+3]: | |
aa_seq.append(aa) | |
## alternatecombos *= len(aa) | |
break | |
slicer += 3 | |
## combos = 1 | |
## for i in aa_seq: | |
## combos = len(i)*combos | |
## totalpossiblesequences = 4**len(sequence) | |
return aa_seq, alternatecombos, totalpossiblesequences, long(alternatecombos/totalpossiblesequences) | |
def spliceExons(sequence): | |
''' | |
This function identifies start and stop codons in the sequences, | |
and then cuts our introns and returns a singe exon | |
''' | |
sequence = convertDNAtoRNA(sequence) | |
seq = sequence | |
coding_region = '' | |
while len(seq)> 6: | |
if findStartCodon(seq) == None: | |
break | |
seq = seq[findStartCodon(seq):] | |
if findStopCodon(seq) == None: | |
break | |
coding_region += seq[:findStopCodon(seq)+3] | |
seq = seq[findStopCodon(seq)+3:] | |
return coding_region | |
def RNAtoAminoAcids(sequence): | |
''' | |
This coverts a raw DNA sequence, for example from NCBI, to RNA, | |
then splices the exons, and returns the aa sequence using the | |
single letter aa code. | |
CURRENTLY UNTESTED | |
''' | |
seq = convertDNAtoRNA(sequence) | |
exon = spliceExons(seq) | |
slicer = 0 | |
aa_seq = '' | |
try: | |
while slicer < len(seq): | |
aa = exon[slicer:(slicer + 3)] | |
if aa in ALA: | |
aa_seq += 'a' | |
if aa in CYS: | |
aa_seq += 'c' | |
if aa in ASP: | |
aa_seq += 'd' | |
if aa in GLU: | |
aa_seq += 'e' | |
if aa in PHE: | |
aa_seq += 'f' | |
if aa in GLY: | |
aa_seq += 'g' | |
if aa in HIS: | |
aa_seq += 'h' | |
if aa in ILE: | |
aa_seq += 'i' | |
if aa in LYS: | |
aa_seq += 'k' | |
if aa in LEU: | |
aa_seq += 'l' | |
if aa in MET: | |
aa_seq += 'm' | |
if aa in ASN: | |
aa_seq += 'n' | |
if aa in PYL: | |
aa_seq += 'p' | |
if aa in PRO: | |
aa_seq += 'o' | |
if aa in GLN: | |
aa_seq += 'q' | |
if aa in ARG: | |
aa_seq += 'r' | |
if aa in SER: | |
aa_seq += 's' | |
if aa in THR: | |
aa_seq += 't' | |
if aa in SEC: | |
aa_seq += 'u' | |
if aa in VAL: | |
aa_seq += 'v' | |
if aa in TRP: | |
aa_seq += 'w' | |
if aa in TYR: | |
aa_seq += 'y' | |
slicer += 3 | |
except: | |
pass | |
return aa_seq |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment