-
-
Save davidliwei/2323462 to your computer and use it in GitHub Desktop.
| #!/usr/bin/env python | |
| ''' | |
| Automatically estimate insert size of the paired-end reads for a given SAM/BAM file. | |
| Usage: getinsertsize.py <SAM file> or samtools view <BAM file> | getinsertsize.py - | |
| Author: Wei Li | |
| Copyright (c) <2015> <Wei Li> | |
| Permission is hereby granted, free of charge, to any person obtaining a copy | |
| of this software and associated documentation files (the "Software"), to deal | |
| in the Software without restriction, including without limitation the rights | |
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| copies of the Software, and to permit persons to whom the Software is | |
| furnished to do so, subject to the following conditions: | |
| The above copyright notice and this permission notice shall be included in | |
| all copies or substantial portions of the Software. | |
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| THE SOFTWARE. | |
| ''' | |
| from __future__ import print_function | |
| import sys; | |
| import pydoc; | |
| import os; | |
| import re; | |
| import fileinput; | |
| import math; | |
| import argparse; | |
| parser=argparse.ArgumentParser(description='Automatically estimate the insert size of the paired-end reads for a given SAM/BAM file.'); | |
| parser.add_argument('SAMFILE',type=argparse.FileType('r'),help='Input SAM file (use - from standard input)'); | |
| parser.add_argument('--span-distribution-file','-s',type=argparse.FileType('w'),help='Write the distribution of the paired-end read span into a text file with name SPAN_DISTRIBUTION_FILE. This text file is tab-delimited, each line containing two numbers: the span and the number of such paired-end reads.'); | |
| parser.add_argument('--read-distribution-file','-r',type=argparse.FileType('w'),help='Write the distribution of the paired-end read length into a text file with name READ_DISTRIBUTION_FILE. This text file is tab-delimited, each line containing two numbers: the read length and the number of such paired-end reads.'); | |
| args=parser.parse_args(); | |
| plrdlen={}; | |
| plrdspan={}; | |
| def getmeanval(dic,maxbound=-1): | |
| nsum=0; n=0; | |
| for (k,v) in dic.items(): | |
| if maxbound!=-1 and k>maxbound: | |
| continue; | |
| nsum=nsum+k*v; | |
| n=n+v; | |
| meanv=nsum*1.0/n; | |
| nsum=0; n=0; | |
| for (k,v) in dic.items(): | |
| if maxbound!=-1 and k>maxbound: | |
| continue; | |
| nsum=nsum+(k-meanv)*(k-meanv)*v; | |
| n=n+v; | |
| varv=math.sqrt(nsum*1.0/(n-1)); | |
| return (meanv,varv); | |
| objmrl=re.compile('([0-9]+)M$'); | |
| objmtj=re.compile('NH:i:(\d+)'); | |
| nline=0; | |
| for lines in args.SAMFILE: | |
| field=lines.strip().split(); | |
| nline=nline+1; | |
| if nline%1000000==0: | |
| print(str(nline/1000000)+'M...',file=sys.stderr); | |
| if len(field)<12: | |
| continue; | |
| try: | |
| mrl=objmrl.match(field[5]); | |
| if mrl==None: # ignore non-perfect reads | |
| continue; | |
| readlen=int(mrl.group(1)); | |
| if readlen in plrdlen.keys(): | |
| plrdlen[readlen]=plrdlen[readlen]+1; | |
| else: | |
| plrdlen[readlen]=1; | |
| if field[6]!='=': | |
| continue; | |
| dist=int(field[8]); | |
| if dist<=0: # ignore neg dist | |
| continue; | |
| mtj=objmtj.search(lines); | |
| # if mtj==None: | |
| # continue; | |
| # if int(mtj.group(1))!=1: | |
| # continue; | |
| #print(field[0]+' '+str(dist)); | |
| if dist in plrdspan.keys(): | |
| plrdspan[dist]=plrdspan[dist]+1; | |
| else: | |
| plrdspan[dist]=1; | |
| except ValueError: | |
| continue; | |
| #print(str(plrdlen)); | |
| #print(str(plrdspan)); | |
| # get the maximum value | |
| readlenval=getmeanval(plrdlen); | |
| print('Read length: mean '+str(readlenval[0])+', STD='+str(readlenval[1])); | |
| # print('Possible read lengths and their counts:'); | |
| # print(str(plrdlen)); | |
| if args.span_distribution_file is not None: | |
| for k in sorted(plrdspan.keys()): | |
| print(str(k)+'\t'+str(plrdspan[k]),file=args.span_distribution_file); | |
| if args.read_distribution_file is not None: | |
| for k in sorted(plrdlen.keys()): | |
| print(str(k)+'\t'+str(plrdlen[k]),file=args.read_distribution_file); | |
| if len(plrdspan)==0: | |
| print('No qualified paired-end reads found. Are they single-end reads?'); | |
| else: | |
| maxv=max(plrdspan,key=plrdspan.get); | |
| spanval=getmeanval(plrdspan,maxbound=maxv*3); | |
| print('Read span: mean '+str(spanval[0])+', STD='+str(spanval[1])); | |
| # print('maxv:'+str(maxv)); |
Am trying it, thank you...
Hello,
Thank you for this tool !
I am feeding getinsertsize.py with overlapping paired-end reads. I get two metrics: "read size" and "read span". Is the "read span" the insert size + the 2x the number of cycles? I am asking because I find it confusing that in the first case "read" really means sequenced read, but in the second it would mean the distance between first base sequenced for reads 1 and 2 right? The outer distance in some sense.
Hello,
Thank you for this tool !
I am feeding getinsertsize.py with overlapping paired-end reads. I get two metrics: "read size" and "read span". Is the "read span" the insert size + the 2x the number of cycles? I am asking because I find it confusing that in the first case "read" really means sequenced read, but in the second it would mean the distance between first base sequenced for reads 1 and 2 right? The outer distance in some sense.
I have the same question
Read his original blog post and comments there for more details.
Awesome! The span/length writeouts are super useful for potting distribution graphs - thanks so much :)