Last active
October 24, 2017 14:36
-
-
Save wckdouglas/5d4322b8e779f8f55431bdd8372e36e8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
chr1 566309 566361 - GTGATT | |
chr1 568081 568136 + TTGAAA,TTGAAA,TTGAAA | |
chr1 16840712 16840773 - TACTTA | |
chr1 17067025 17222642 + TGGCAG | |
chr1 45196732 45196802 - TCGCCT | |
chr1 161424755 161432231 - GCGTTG | |
chr1 161493635 161493697 - TGGTGG | |
chr1 173834762 173834818 - TTGTAA | |
chr1 173835773 173835846 - TCCACA | |
chr1 204475714 204475737 + TCAAGT |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import print_function | |
from itertools import combinations | |
from functools import partial | |
from collections import Counter | |
from networkx import Graph, connected_components | |
import sys | |
import argparse | |
from numba import jit | |
def get_opt(): | |
help_message = 'Demultiplexing UMI bed file with the follwing columns:\n'+\ | |
'1. chrom name\n2. start\n3. end\n4. strand\n5. collapsed barcodes delim by ","\n'+\ | |
'This file can be generated by the command: \n'+\ | |
' datamash -g 1,2,3,6 collapse 4 \n'+\ | |
'with barcode at column $I of a $BED_FILE \n'+\ | |
'The program internally used hamming distance matrix of the '+\ | |
'barcodes to generate connected network and cluster them' | |
parser = argparse.ArgumentParser(description=help_message) | |
parser.add_argument('-i','--infile', | |
help ='input bedfile, can be "-" or "/dev/stdin" for stdin (default: -).' +\ | |
'Format should be: ' + \ | |
'\n1. chrom name\n2. start\n3. end\n4. strand\n5. collapsed barcodes delim by ","', | |
default = '-') | |
parser.add_argument('-t','--threshold', | |
help='How many error between barcodes can be tolerated? (default = 1)', | |
type=int, default = 1) | |
args = parser.parse_args() | |
return args | |
@jit(nopython=True) | |
def hamming_barcode(barcode_pair): | |
a, b = barcode_pair | |
assert len(a) == len(b), 'Wrong barcode extraction' | |
hd = 0 | |
for i, j in zip(a, b): | |
if i != j: | |
hd += 1 | |
return hd | |
@jit() | |
def make_graph(comparison, threshold): | |
''' | |
Using a graph to connect all umi with <= threshold mismatches | |
''' | |
G = Graph() | |
for pair in comparison: | |
if hamming_barcode(pair) <= threshold: | |
G.add_edge(pair[0],pair[1]) | |
else: | |
G.add_node(pair[0]) | |
G.add_node(pair[1]) | |
return G | |
@jit() | |
def unique_barcode_from_graph(graph, barcodes): | |
''' | |
''' | |
unique_barcode = [] | |
for subgraph in connected_components(graph): | |
subgraph = list(subgraph) | |
if len(subgraph) == 1: | |
barcode_id = subgraph[0] | |
member_counts = barcodes[barcode_id] | |
barcode_id = barcode_id + '_' + str(member_counts) + '_members' | |
else: | |
member_counts = sum(barcodes[bc] for bc in subgraph) | |
barcode_id = subgraph[0] + '_' + str(member_counts) + '_members' | |
unique_barcode.append(barcode_id) | |
return unique_barcode | |
def demultiplex(barcodes, threshold): | |
comparison = combinations(barcodes.keys(),r=2) | |
graph = make_graph(comparison, threshold) | |
unique_barcode = unique_barcode_from_graph(graph, barcodes) | |
#print 'In: %i, out: %i' %(len(barcodes), len(unique_barcode)) | |
return unique_barcode | |
@jit() | |
def make_line(chrom, start, end, strand, barcode): | |
''' | |
Generate output bed line | |
''' | |
length = long(end) - long(start) | |
template = '{chrom}\t{start}\t{end}\t{barcode}\t{length}\t{strand}' \ | |
.format(chrom = chrom, | |
start = start, | |
end = end, | |
barcode = barcode, | |
length = length, | |
strand= strand) | |
print(template, file=sys.stdout) | |
return 0 | |
@jit() | |
def file_processing(file_handle, threshold): | |
''' | |
parsing bed line and extract distinct barcode, count the duplicates and out put bed line | |
''' | |
out_count = 0 | |
for i, line in enumerate(file_handle): | |
line = line.rstrip('\n') | |
fields = line.split('\t') | |
barcodes = fields[-1] | |
template = partial(make_line, fields[0], fields[1], fields[2], fields[3]) | |
if ',' not in barcodes: | |
barcode_id = barcodes + '_1_members' | |
template(barcode_id) | |
out_count += 1 | |
else: | |
barcodes = barcodes.split(',') | |
barcodes_counter = Counter(barcodes) | |
if len(barcodes_counter.keys()) == 1: | |
barcode_id = barcodes[0] + '_' + str(barcodes_counter[barcodes[0]]) + '_members' | |
template(barcode_id) | |
out_count += 1 | |
else: | |
barcode_ids = demultiplex(barcodes_counter, threshold) | |
for barcode_id in barcode_ids: | |
template(barcode_id) | |
out_count += 1 | |
if i % 10000000 == 0: | |
print('Parsed %i lines' %(i), file=sys.stderr) | |
print('Ouput: %i lines' %(out_count), file=sys.stderr) | |
return 0 | |
def main(): | |
args = get_opt() | |
filename = args.infile | |
threshold = args.threshold | |
if filename != '-' and filename != '/dev/stdin': | |
file_handle = open(filename, 'r') | |
sys.stderr.write('Using file %s\n' %filename) | |
else: | |
file_handle = sys.stdin | |
print('Using stdin...', file=sys.stderr) | |
file_processing(file_handle, threshold) | |
return 0 | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment