Skip to content

Instantly share code, notes, and snippets.

@budsonjelmont
Forked from davidliwei/gtf2bed.py
Last active October 15, 2019 19:37
Show Gist options
  • Save budsonjelmont/5a08c6b30e6f65e881315e730d4f05d6 to your computer and use it in GitHub Desktop.
Save budsonjelmont/5a08c6b30e6f65e881315e730d4f05d6 to your computer and use it in GitHub Desktop.
Converting Cufflinks predictions (.GTF) into .BED annotations
#!/usr/bin/env python3
'''
gtf2bed.py converts GTF file to BED file.
Usage: gtf2bed.py {OPTIONS} [.GTF file]
History
Nov.5th 2012:
1. Allow conversion from general GTF files (instead of only Cufflinks supports).
2. If multiple identical transcript_id exist, transcript_id will be appended a string like "_DUP#" to separate.
'''
import sys;
import re;
if len(sys.argv)<2:
print('This script converts .GTF into .BED annotations.\n');
print('Usage: gtf2bed {OPTIONS} [.GTF file]\n');
print('Options:');
print('-c color\tSpecify the color of the track. This is a RGB value represented as "r,g,b". Default 255,0,0 (red)');
print('\nNote:');
print('1\tOnly "exon" and "transcript" are recognized in the feature field (3rd field).');
print('2\tIn the attribute list of .GTF file, the script tries to find "gene_id", "transcript_id" and "FPKM" attribute, and convert them as name and score field in .BED file.');
print('Author: Wei Li (li.david.wei AT gmail.com)');
sys.exit();
color='255,0,0'
for i in range(len(sys.argv)):
if sys.argv[i]=='-c':
color=sys.argv[i+1];
allids={};
def printbedline(estart,eend,cdsstart,cdsend,field,nline):
try:
estp=estart[0]-1;
eedp=eend[-1];
# If no CDS was found, set cdsstart = cdsend = eedp
if cdsstart==float('-inf'):
cdsstart=eedp
cdsend=eedp
# use regular expression to get transcript_id, gene_id and expression level
geneid=re.findall(r'gene_id \"([\w\.]+)\"',field[8])
transid=re.findall(r'transcript_id \"([\w\.]+)\"',field[8])
fpkmval=re.findall(r'FPKM \"([\d\.]+)\"',field[8])
if len(geneid)==0:
print('Warning: no gene_id field ',file=sys.stderr);
else:
geneid=geneid[0];
if len(transid)==0:
print('Warning: no transcript_id field',file=sys.stderr);
transid='Trans_'+str(nline);
else:
transid=transid[0];
if transid in allids.keys():
transid2=transid+'_DUP'+str(allids[transid]);
allids[transid]=allids[transid]+1;
transid=transid2;
else:
allids[transid]=1;
if len(fpkmval)==0:
#print('Warning: no FPKM field',file=sys.stderr);
fpkmval='100';
else:
fpkmval=fpkmval[0];
fpkmint=round(float(fpkmval));
print(field[0]+'\t'+str(estp)+'\t'+str(eedp)+'\t'+transid+'\t'+str(fpkmint)+'\t'+field[6]+'\t'+str(cdsstart)+'\t'+str(cdsend)+'\t'+color+'\t'+str(len(estart))+'\t',end='');
seglen=[eend[i]-estart[i]+1 for i in range(len(estart))];
segstart=[estart[i]-estart[0] for i in range(len(estart))];
strl=str(seglen[0]);
for i in range(1,len(seglen)):
strl+=','+str(seglen[i]);
strs=str(segstart[0]);
for i in range(1,len(segstart)):
strs+=','+str(segstart[i]);
print(strl+'\t'+strs);
except ValueError:
print('Error: non-number fields at line '+str(nline),file=sys.stderr);
cdsstart=cdsstart=float('-inf');
cdsend=cdsend=float('inf');
estart=[];
eend=[];
# read lines one to one
nline=0;
prevfield=[];
prevtransid='';
for lines in open(sys.argv[-1]):
field=lines.strip().split('\t');
nline=nline+1;
if len(field)<9:
print('Error: the GTF should has at least 9 fields at line '+str(nline),file=sys.stderr);
continue;
if field[1]!='Cufflinks':
pass;
#print('Warning: the second field is expected to be \'Cufflinks\' at line '+str(nline),file=sys.stderr);
if field[2]!='exon' and field[2] !='transcript' and field[2] != 'CDS':
#print('Error: the third filed is expected to be \'exon\' or \'transcript\' at line '+str(nline),file=sys.stderr);
continue;
transid=re.findall(r'transcript_id \"([\w\.]+)\"',field[8]);
if len(transid)>0:
transid=transid[0];
else:
transid='';
if field[2]=='transcript' or (prevtransid != '' and transid!='' and transid != prevtransid):
#print('prev:'+prevtransid+', current:'+transid);
# A new transcript record, write
if len(estart)!=0:
printbedline(estart,eend,cdsstart,cdsend,prevfield,nline);
cdsstart=float('-inf');
cdsend=float('inf');
estart=[];
eend=[];
prevfield=field;
prevtransid=transid;
if field[2]=='exon':
try:
est=int(field[3]);
eed=int(field[4]);
estart+=[est];
eend+=[eed];
except ValueError:
print('Error: non-number fields at line '+str(nline),file=sys.stderr);
if field[2]=='CDS':
try:
cdsst=int(field[3]);
cdsed=int(field[4]);
except ValueError:
print('Error: non-number fields at line '+str(nline),file=sys.stderr);
if cdsstart==float('-inf') or cdsst < cdsstart:
cdsstart = cdsst
if cdsend==float('inf') or cdsed > cdsend:
cdsend = cdsed
# the last record
if len(estart)!=0:
printbedline(estart,eend,cdsstart,cdsend,field,nline);
@budsonjelmont
Copy link
Author

Added additional handling for annotating CDS start & end (as thickStart & thickEnd .Bed attributes) based on the format they're given in in the GENCODE .GTF files.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment