-
-
Save dgrtwo/3725741 to your computer and use it in GitHub Desktop.
| """ | |
| barcode_splitter.py | |
| Barcode splitter for fastq sequencing files, that matches using Levenshtein | |
| distance. | |
| USAGE: | |
| python barcode_splitter.py reads.fastq index_reads.fastq barcodes.txt | |
| Takes three files: a raw reads file, a barcode index reads file, where each | |
| barcode corresponds to a read in the raw file, and an catalog file in a | |
| format similar to: | |
| group1 TGACCA | |
| group2 GCCAAT | |
| group3 CAGATC | |
| This program matches each index to the closest barcode, including mutations, | |
| insertions, and deletions. It then splits the file according to those indices- | |
| in this case, into group1.fastq, group2.fastq, and group3.fastq. | |
| """ | |
| import sys | |
| import warnings | |
| import itertools | |
| import collections | |
| from Bio import SeqIO | |
| def levenshtein_distance(s1, s2): | |
| """ | |
| Python version of Levenshtein distance for compatability. The third-party | |
| library is much faster and recommended. Taken from recipe at: | |
| http://code.activestate.com/recipes/576874-levenshtein-distance/ | |
| """ | |
| l1 = len(s1) | |
| l2 = len(s2) | |
| matrix = [range(l1 + 1)] * (l2 + 1) | |
| for zz in xrange(l2 + 1): | |
| matrix[zz] = range(zz, zz + l1 + 1) | |
| for zz in range(0, l2): | |
| for sz in range(0, l1): | |
| if s1[sz] == s2[zz]: | |
| matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1, | |
| matrix[zz][sz + 1] + 1, matrix[zz][sz]) | |
| else: | |
| matrix[zz + 1][sz + 1] = min(matrix[zz + 1][sz] + 1, | |
| matrix[zz][sz + 1] + 1, matrix[zz][sz] + 1) | |
| return matrix[l2][l1] | |
| try: | |
| import Levenshtein | |
| levenshtein_distance = Levenshtein.distance | |
| except ImportError: | |
| warnings.warn("python-Levenshtein package not found. The package is " + | |
| "recommended as it makes barcode splitting much faster. " + | |
| "Using native Python Levenshtein distance function instead.") | |
| class BarcodeIndex: | |
| """Represents a set of indices, with the ability to find the closest one""" | |
| def __init__(self, index_file, max_distance=3): | |
| self.cache = {} | |
| with open(index_file) as inf: | |
| for l in inf: | |
| if l.startswith("#") or l == "": | |
| continue | |
| g, barcode = l.split("\t") | |
| self.cache[barcode] = g | |
| self.barcodes = self.cache.keys() | |
| self.groups = self.cache.values() | |
| self.max_distance = max_distance | |
| def find_barcode(self, barcode): | |
| """ | |
| match barcode and return the group it's supposed to be in. If | |
| there is none within the Levenshtein distance given, or if there | |
| is a tie, return None | |
| """ | |
| # if there's an exact match, return that | |
| exact = self.cache.get(barcode) | |
| if exact is not None: | |
| return exact | |
| # find the Levenshtein distance to each | |
| distances = [levenshtein_distance(barcode, b) for b in self.barcodes] | |
| best = min(distances) | |
| # check if there's a tie or the distance is too great: | |
| if best > self.max_distance or distances.count(best) > 1: | |
| return None | |
| # otherwise, return the best one, after caching it for future use | |
| ret = self.groups[distances.index(best)] | |
| self.cache[barcode] = ret | |
| return ret | |
| def split_reads(read_file, barcode_file, index_file): | |
| """ | |
| Given a fastq file of reads, a fastq file with corresponding barcodes, | |
| and a file with the barcode index. Create one fastq file for each group | |
| based on the closest matching barcode | |
| """ | |
| index = BarcodeIndex(index_file) | |
| counts = collections.defaultdict(int) | |
| # one output for each group | |
| outfs = dict([(g, open(g + ".fastq", "w")) | |
| for g in index.groups + ["Unassigned"]]) | |
| with open(read_file) as read_inf, open(barcode_file) as barcode_inf: | |
| for r, b in itertools.izip(SeqIO.parse(read_inf, "fastq"), | |
| SeqIO.parse(barcode_inf, "fastq")): | |
| group = index.find_barcode(str(b.seq)) | |
| group = group if group != None else "Unassigned" | |
| SeqIO.write([r], outfs[group], "fastq") | |
| counts[group] += 1 | |
| for k, v in sorted(counts.items(), key=lambda t: t[0] == "Unassigned"): | |
| print k, v | |
| for o in outfs.values(): | |
| o.close() | |
| if __name__ == "__main__": | |
| if len(sys.argv) != 4: | |
| print "USAGE: python barcode_splitter.py reads.fastq index.fastq", | |
| print "barcodes.txt" | |
| sys.exit(1) | |
| [read_file, barcode_file, index_file] = sys.argv[1:] | |
| split_reads(read_file, barcode_file, index_file) |
Hi David,
FR-W-S073878:~ vengurlekarp$ python barcode_splitter.py reads.fastq index_reads.fastq barcodes.txt /Users/vengurlekarp/Downloads/Data-2/Intensities/BaseCalls/1_S1_L001_R1_001.fastq barcode.txt
File "barcode_splitter.py", line 115
with open(read_file) as read_inf, open(barcode_file) as barcode_inf:
^
SyntaxError: invalid syntax
FR-W-S073878:~ vengurlekarp$
I ma getting this error when i use the usage command....This is a little urgent for me to get this to work...i am using your script for the first time.Kindly looking forward to your reply .
Sincerely,
Piyu
FR-W-S073878:~ vengurlekarp$ python barcode_splitter.py 1_S1_L001_R1_001.fastq barcode.txt
File "barcode_splitter.py", line 115
with open(read_file) as read_inf, open(barcode_file) as barcode_inf:
^
SyntaxError: invalid syntax
FR-W-S073878:~ vengurlekarp$
The with statement isn't part of the language until Python 2.6, but can be enabled under Python 2.5 - what version are you using @piyuveng1
EDIT: Just to make sure, the file I don't understand if 'index.fastq' from the usage. I do get 'barcodes.txt'.
Hi David,
I'm writing a custom tool in python for NGS work. I would love to use this Gist for the multiplexing handling. Could you please expand on what exactly is the index file? It is no clear to me how I can build this file or why you would need it.
Thanks,
Carlos