Created
September 22, 2014 18:33
-
-
Save annasa/eef7c30152ac296bb49b to your computer and use it in GitHub Desktop.
Input a sorted (chr and start pos) sam file and output all records except for optical duplicates
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
"RemoveOpticalDuplicates.py" | |
"Author: Anna Salzberg" | |
import sys | |
import string | |
import os | |
import time | |
import math | |
def convertStr(s): | |
"""Convert string to either int or float.""" | |
try: | |
ret = int(s) | |
except ValueError: | |
try : | |
ret = float(s); | |
except ValueError : | |
ret = -1; | |
return ret | |
class Parameter : | |
def __init__(self, argv) : | |
self.argv = argv; | |
self.inFN = ""; | |
self.opticalDuplicatePixelDistance = 100; | |
def getOpticalDuplicatePixelDistance(self) : | |
return self.opticalDuplicatePixelDistance | |
def getInFN(self) : | |
return self.inFN; | |
def getOutFN(self) : | |
return self.outFN; | |
# Prints usage | |
def usage(self): | |
print "Usage: python RemoveOpticalDuplicates.py [-d optical_duplciate_pixel_distance] <input file> " | |
print "where the input is a sam file sorted by chr position. Default optical_duplciate_pixel_distance: 100" | |
print "Example: python RemoveOpticalDuplicates.py sampleA.bwa.sorted.sam > sampleA.bwa.rmoptdup.sam" | |
sys.exit(1) | |
def checkArgs(self) : | |
i = 1; | |
argc = len(sys.argv); | |
errorMsg = ""; | |
while i < argc and errorMsg == "": | |
if self.argv[i] == '-d' : | |
i += 1; | |
if i < argc and not self.argv[i].startswith('-') : | |
self.opticalDuplicatePixelDistance = convertStr(self.argv[i]); | |
if self.opticalDuplicatePixelDistance < 1 : | |
errorMsg = 'Error: optical duplicate pixel distance must be an integer >= 1' | |
else : | |
errorMsg = 'Error: optical duplicate pixel distance expected after -d flag' | |
elif i == argc-1 : | |
self.inFN = self.argv[i]; | |
else : | |
errorMsg = 'Error: unknown flag: ' + self.argv[i]; | |
i += 1; | |
if errorMsg == "" : | |
if self.inFN == "" : | |
errorMsg = 'Error: insufficient arguments given' | |
if errorMsg <> "" : | |
print errorMsg; | |
self.usage(); | |
def removeOpticalDuplicates(param) : | |
inFN = param.getInFN(); | |
outFile = sys.stdout; | |
optDist = param.getOpticalDuplicatePixelDistance() | |
try: | |
inFile = open (inFN, 'r') | |
except: | |
print "Unable to open file " + inFN | |
sys.exit(-1) | |
tile_cigarToDupDict = {} | |
curChr = "" | |
curStartPos = "" | |
first = True; | |
line = inFile.readline(); | |
while (1) : | |
if not line: break | |
nextLine = inFile.readline(); | |
if not line.startswith("@") : | |
tokens = line.split(); | |
chr = tokens[2]; | |
startPos = tokens[3]; | |
mapq = tokens[4] | |
cigar = tokens[5]; | |
subtokens = tokens[0].split(":"); | |
tile = subtokens[4]; | |
x = subtokens[5] | |
y = subtokens[6] | |
tile_cigar = tile + "_" + cigar; | |
if first : | |
curChr = chr; | |
curStartPos = startPos | |
first = False; | |
if chr <> "*" and (chr == curChr and startPos == curStartPos) : | |
dups = tile_cigarToDupDict.get(tile_cigar, []); | |
dups.append([x, y, mapq, line]); | |
tile_cigarToDupDict[tile_cigar] = dups; | |
if chr <> "*" and (chr <> curChr or startPos <> curStartPos or not nextLine) : | |
for key in tile_cigarToDupDict.keys() : | |
dups = tile_cigarToDupDict[key]; | |
dups.sort(); | |
if len(dups) == 1 : | |
nondupline = dups[0][3] | |
outFile.write(nondupline); | |
elif len(dups) > 1 : | |
processed = [] | |
for k in range(0, len(dups)) : | |
processed.append(False) | |
for i in range(0, len(dups)) : | |
if not processed[i] : | |
xi = convertStr(dups[i][0]) | |
yi = convertStr(dups[i][1]) | |
mapqi = convertStr(dups[i][2]) | |
processed[i] = True; | |
best = i; | |
bestMapq = mapqi; | |
for j in range(i+1, len(dups)) : | |
if not processed[j] : | |
xj = convertStr(dups[j][0]) | |
yj = convertStr(dups[j][1]) | |
mapqj = convertStr(dups[j][2]) | |
if abs(xj-xi) > optDist : break; | |
if abs(yj-yi) <= optDist : | |
processed[j] = True | |
if mapqj > bestMapq : | |
best = j; | |
bestMapq = mapqj; | |
bestline = dups[best][3] | |
outFile.write(bestline); | |
tile_cigarToDupDict = {} | |
tile_cigarToDupDict[tile_cigar] = [[x, y, mapq, line]] | |
if chr <> "*" and not nextLine and (chr <> curChr or startPos <> curStartPos) : | |
outFile.write(line); | |
if chr == "*" : | |
outFile.write(line); | |
curChr = chr | |
curStartPos = startPos; | |
line = nextLine; | |
inFile.close(); | |
outFile.close(); | |
# Execute as application | |
if __name__ == '__main__' : | |
param = Parameter(sys.argv) | |
param.checkArgs(); | |
removeOpticalDuplicates(param) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@annasa
I am following up on a possible bug you reported to Picard last October. Due to a resource shortage we were not able to handle it at the time, but we are finally able to do so now. I realize this is probably too late to help with the particular problem you encountered, however we would like to check whether this was a real bug, and if so, fix the code accordingly.
Would you be able to tell me if you were able to resolve the problem? If not, would you be willing to share a snippet of data for us to reproduce the issue locally and debug?
If so, please let us know on this discussion thread: broadinstitute/picard#134
Thanks!