Created
September 2, 2015 19:43
-
-
Save alexcritschristoph/4484dae35cfcf2709578 to your computer and use it in GitHub Desktop.
Generates randomized contigs from a list of FASTA genomes (naive assembled metagenome simulator)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys
from random import randint

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
# Sampling parameters for the simulated contigs.
MIN_GENOME_LENGTH = 10000    # genomes of this length or shorter are skipped
MIN_FRAGMENT_SIZE = 1000     # smallest fragment that may be drawn (bp)
MAX_FRAGMENT_SIZE = 100000   # hard cap on fragment size (bp)
MAX_GENOME_FRACTION = 0.75   # a fragment never exceeds this share of its genome


def pick_fragment_bounds(genome_length):
    """Pick a random [start, end) slice for one fragment of a genome.

    The fragment size is drawn uniformly between MIN_FRAGMENT_SIZE and the
    smaller of MAX_FRAGMENT_SIZE and MAX_GENOME_FRACTION * genome_length;
    the start is then drawn uniformly so the fragment fits in the genome.

    NOTE(review): the original comment promised fragments "greater than
    10,000 bp", but the code has always used a 1,000 bp minimum; the code's
    behaviour is kept here -- confirm which one was intended.

    Args:
        genome_length: length of the source sequence in bp.

    Returns:
        A (start, end) tuple of slice indices, or None when the genome is
        not longer than MIN_GENOME_LENGTH and must be skipped.
    """
    if genome_length <= MIN_GENOME_LENGTH:
        return None
    largest = min(int(MAX_GENOME_FRACTION * genome_length), MAX_FRAGMENT_SIZE)
    # Size is drawn before start: keeps seeded runs identical to the
    # original script, and start depends on the chosen size anyway.
    size = randint(MIN_FRAGMENT_SIZE, largest)
    start = randint(0, genome_length - size)
    return start, start + size


def main(genome_dir):
    """Print one random fragment per genome found under genome_dir.

    Walks genome_dir recursively and treats every file as a FASTA genome.
    Each sufficiently long genome yields exactly one fragment, written to
    stdout as a FASTA record whose header is a running integer (">1",
    ">2", ...).

    SeqIO.read is used, so each file is expected to hold a single FASTA
    record; Biopython raises ValueError otherwise.

    Args:
        genome_dir: directory containing the FASTA genome files.
    """
    contig_id = 1
    for root, _subdirs, files in os.walk(genome_dir):
        for name in files:
            path = os.path.join(root, name)
            # Close the handle as soon as the record is parsed instead of
            # leaking one open file per genome.
            with open(path) as handle:
                record = SeqIO.read(handle, "fasta")
            bounds = pick_fragment_bounds(len(record.seq))
            if bounds is None:
                continue
            start, end = bounds
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(">" + str(contig_id))
            print(record.seq[start:end])
            contig_id += 1


if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("usage: %s <directory of FASTA genomes>" % sys.argv[0])
    main(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment