Created
November 25, 2012 21:43
-
-
Save nikaspran/4145533 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Bio.Blast.NCBIWWW | |
import Bio.Blast.NCBIXML | |
import Bio.Blast.Record | |
import Bio.Align.Applications | |
import Bio.SeqIO | |
from StringIO import StringIO | |
from Bio.SeqRecord import SeqRecord | |
from Bio.Seq import Seq | |
from Bio.Alphabet import generic_protein | |
from Bio.Align.Applications import MafftCommandline | |
from sys import maxint as MAX_INT | |
SEQUENCE_FILE = "py_results.faa" | |
SEQUENCE_FILE_FORMAT = "fasta" | |
SEQUENCE_WINDOW = 15 | |
SEQUENCE_NO = "4502027" | |
ENTREZ_QUERY = '"serum albumin"[Protein] AND mammals[Organism]' | |
MIN_POSITIVES = 80.0 | |
MAFFT_CMD = "mafft.bat" | |
REFERENCE_SEQUENCE_ID = "ALBU_HUMAN" | |
def filter_by_min_positives(blast_results, min_positives_percent): | |
correct_indices = [] | |
for index, sequence in enumerate(blast_results.alignments): | |
if sequence.hsps[0].positives * 100.0 / sequence.length >= min_positives_percent: | |
correct_indices.append(index) | |
filtered_results = Bio.Blast.Record.Blast() | |
filtered_results.multiple_alignment = blast_results.multiple_alignment | |
for index in correct_indices: | |
filtered_results.descriptions.append(blast_results.descriptions[index]) | |
filtered_results.alignments.append(blast_results.alignments[index]) | |
return filtered_results | |
def count_differences(seq1, seq2): | |
return len([i for i, (left, right) in enumerate(zip(seq1, seq2)) if left != right]) | |
def calculate_difference_extreme_indices(aligned_records, reference_record, sequence_window): | |
min_start, max_start = 0, 0 | |
min_diff, max_diff = MAX_INT, 0 | |
for start_index in range(len(reference_record) - sequence_window): | |
end_index = start_index + sequence_window | |
human_subseq = reference_record[start_index:end_index].seq | |
differences = 0 | |
for record in aligned_records: # no need to skip human record for there will be no differences | |
record_subseq = record[start_index:end_index].seq | |
differences += count_differences(human_subseq, record_subseq) | |
if differences < min_diff: | |
min_start = start_index | |
min_diff = differences | |
if differences > max_diff: | |
max_start = start_index | |
max_diff = differences | |
return min_start, max_start | |
def run_and_parse_blast(sequence, entrez_query): | |
blast_result_xml = Bio.Blast.NCBIWWW.qblast(program="blastp", database="swissprot", sequence=sequence, | |
entrez_query=entrez_query, hitlist_size=500, expect=100.0) | |
return Bio.Blast.NCBIXML.read(blast_result_xml) | |
def write_to_fasta(blast_results, filename): | |
sequence_records = [] | |
for index, alignment in enumerate(blast_results.alignments): | |
sequence = alignment.hsps[0] | |
sequence_records.append(SeqRecord(Seq(sequence.sbjct, generic_protein), id=alignment.title)) | |
Bio.SeqIO.write(sequence_records, filename, SEQUENCE_FILE_FORMAT) | |
def align_using_mafft(input): | |
mafft_cline = MafftCommandline(cmd=MAFFT_CMD, input=input) | |
stdout, stderr = mafft_cline() | |
return list(Bio.SeqIO.parse(StringIO(stdout), SEQUENCE_FILE_FORMAT)) | |
def find_first_record_by_id(sequences, id): | |
return filter(lambda sequence: id in sequence.id, sequences)[0] | |
### Main script: | |
blast_results = run_and_parse_blast(SEQUENCE_NO, ENTREZ_QUERY) | |
filtered_results = filter_by_min_positives(blast_results, MIN_POSITIVES); | |
write_to_fasta(filtered_results, SEQUENCE_FILE) | |
aligned_records = align_using_mafft(SEQUENCE_FILE) | |
reference_record = find_first_record_by_id(aligned_records, REFERENCE_SEQUENCE_ID) | |
min_start, max_start = calculate_difference_extreme_indices(aligned_records, reference_record, SEQUENCE_WINDOW) | |
print "Least different start:", min_start + 1, ", seq:", reference_record[min_start:min_start + SEQUENCE_WINDOW].seq | |
print "Most different start:", max_start + 1, ", seq:", reference_record[max_start:max_start + SEQUENCE_WINDOW].seq |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment