Created
November 8, 2012 18:47
-
-
Save fransua/4040707 to your computer and use it in GitHub Desktop.
searchs for potential loops in single strand sequences
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
08 Nov 2012 | |
""" | |
from string import maketrans | |
REV = maketrans('ATGC', 'TACG') | |
def print_loops(seq, matches): | |
""" | |
nice way to print loops | |
st1 st1+len1 st1+len2 | |
| | | | |
xxxxmmmmmmmmmxmmmmmmmmmmmmmxxxxxxxxxxx | |
||||||||| ||||||||||||| x | |
xxxxmmmmmmmmmxmmmmmmmmmmmmmxxxxxxxxxxx | |
| | | | |
st2 st2+len1 st2+len2 | |
""" | |
print ' ' + '-'*80 | |
for i, m in enumerate(matches.items()): | |
print ' # {}\n'.format(i+1) | |
_print_loop(seq, m) | |
print ' ' + '-'*80 | |
def _print_loop(seq, match): | |
""" | |
nice way to print 1 loop | |
st1 st1+len1 st1+len2 | |
| | | | |
xxxxmmmmmmmmmxmmmmmmmmmmmmmxxxxxxxxxxx | |
||||||||| ||||||||||||| x | |
xxxxmmmmmmmmmxmmmmmmmmmmmmmxxxxxxxxxxx | |
| | | | |
st2 st2+len1 st2+len2 | |
""" | |
(st1, st2), lengths = match | |
len2 = lengths[-1] | |
loop_len = (len(seq) - (st2 + len2)) - (st1 + len2) | |
seq1 = seq [:st1 +len2 + loop_len/2] | |
seq2 = seq[::-1][:st2 + len2 + loop_len/2] | |
prev = ' '*4 | |
draw = prev + '{1:>{0}}'.format(max(len(seq1), len(seq2)), seq1) + '\n' | |
draw += prev + ' ' * max(st1, st2) | |
draw += '|'*lengths[0] | |
for i in xrange(1,len(lengths)): | |
draw += ' ' + '|' * (lengths[i]-lengths[i-1]-1) | |
draw += ' ' * (loop_len/2) + (seq[st1 + len2 + loop_len/2] if loop_len%2 else ')') + '\n' | |
draw += prev + '{1:>{0}}'.format(max(len(seq1), len(seq2)), seq2) + '\n' | |
print draw | |
def merge_not_exact_match_old(matches, gap = 1): | |
""" | |
find consecutive perfect matches separated by 1 base (gap value) and | |
merge them. | |
""" | |
while True: | |
for m in matches: | |
add = matches[m][-1] + gap | |
nxt = m[0] + add, m[1] + add | |
if not nxt in matches: continue | |
matches[m] = [_ for _ in matches[m]]+[_ + add for _ in matches[nxt]] | |
break | |
else: | |
break | |
del(matches[nxt]) | |
def merge_not_exact_match(matches, gap = 1): | |
""" | |
find consecutive perfect matches separated by 1 base (gap value) and | |
merge them. | |
""" | |
to_merge = [] | |
for m in matches: | |
add = matches[m][-1] + gap | |
nxt = m[0] + add, m[1] + add | |
if not nxt in matches: | |
continue | |
to_merge.append((m, nxt)) | |
to_del = [] | |
for m, nxt in to_merge: | |
matches[m] = [_ for _ in matches[m]]+[_ + add for _ in matches[nxt]] | |
to_del.append(nxt) | |
for m in to_del: | |
del(matches[m]) | |
def search_palindrome(seq, verbose=True): | |
""" | |
Find all complementary sequences on a simgle strand. | |
may be separated by at least 3 bases. | |
""" | |
def rev_matches(pattern, seq2, ind): | |
""" | |
search for all ocurrences of pattern in seq2 | |
""" | |
while ind != -1: | |
yield ind | |
ind = seq2.find(pattern, ind+1) | |
rev_seq = seq[::-1].translate(REV) | |
matches = {} | |
gap = 3 # this is fixed otherwise print will not work | |
for i in xrange(len(seq)-gap): | |
for j in xrange(i+gap, len(seq)): | |
pattern = seq[i:j] | |
seq2 = rev_seq[:-j-gap] | |
indx = seq2.find(pattern) | |
# not found | |
if indx == -1: | |
break | |
# previous bas pair also matching (already found) | |
if seq[i-1] is rev_seq[indx-1]: | |
if i!=0 and indx!=0: | |
break | |
# find all matchings in other part | |
for indx in rev_matches(pattern, seq2, indx): | |
matches[(i, indx)] = [j-i] | |
if verbose: | |
print '{0} possible loops found in sequences:\n'.format(len(matches)) | |
print '{0}\n'.format(seq) | |
print ' ' + '-'*80 | |
for i, m in enumerate(matches.items()): | |
print ' # {}\n'.format(i+1) | |
_print_loop(seq, m) | |
print ' ' + '-'*80 | |
return matches | |
def main(): | |
""" | |
main function... testing | |
""" | |
#from random import random | |
seq = 'CAACGTGGCAACGTGCTATACGTAGTACTGCACGTTCCCAGGTT'*300 | |
matches = search_palindrome(seq, verbose=False) | |
print len(matches) | |
merge_not_exact_match(matches) | |
print len(matches) | |
x = max(matches, key=lambda x:matches[x][-1]) | |
_print_loop(seq, (x, matches[x])) | |
#print_loops(seq, matches) | |
if __name__ == "__main__": | |
exit(main()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment