Skip to content

Instantly share code, notes, and snippets.

@zhoujj2013
Created July 16, 2014 02:59
Show Gist options
  • Save zhoujj2013/024ef0789c1f5538d568 to your computer and use it in GitHub Desktop.
Save zhoujj2013/024ef0789c1f5538d568 to your computer and use it in GitHub Desktop.
gtf2gff3.pl: convert ensembl gtf to gff3 format, just for JBrowser
#/usr/bin/perl -w
use strict;
use Data::Dumper;
## Written by [email protected]
## convert ensembl gtf to gff3 format, just for JBrowser
my ($gtf_f, $chr_list_f) = @ARGV; # ensembl gtf file, gtf from different source are different.
my %chr;
open IN,"$chr_list_f" || die $!;
while(<IN>){
chomp;
my @t = split /\t/;
$chr{$t[0]} = $t[1];
}
close IN;
my %gtf;
&read_gtf($gtf_f, \%gtf, \%chr);
#print Dumper(\%gtf);
# output gff file
print "##gff-version 3\n";
foreach my $chr (sort keys %gtf){
foreach my $gid (keys %{$gtf{$chr}}){
# output the gene records
my $attr = pop @{$gtf{$chr}{$gid}{'gene'}};
print "####\n";
print join "\t",@{$gtf{$chr}{$gid}{'gene'}};
print "\tID=$gid\n";
foreach my $tid (keys %{$gtf{$chr}{$gid}}){
next if($tid eq "gene");
# output transcript record
$gtf{$chr}{$gid}{$tid}{'trans'}->[2] = "mRNA";
my $attr_trans = pop @{$gtf{$chr}{$gid}{$tid}{'trans'}};
my $trans_name = $1 if($attr_trans =~ /transcript_name "([^"]+)"/);
print join "\t",@{$gtf{$chr}{$gid}{$tid}{'trans'}};
print "\tID=$tid;Name=$trans_name;Parent=$gid";
print "\n";
# output CDS, exon, three_prime_UTR, five_prime_UTR
my $flag = $gtf{$chr}{$gid}{$tid}{'flag'};
my @flag_sorted = sort {$a->[3] <=> $b->[3]} @$flag;
my $cds_seq = 0;
foreach my $f (@flag_sorted){
my $f_attr = pop(@$f);
if($f->[2] eq "UTR" && $cds_seq == 0){
if($f->[6] eq "+"){
$f->[2] = "five_prime_UTR";
}elsif($f->[6] eq "-"){
$f->[2] = "three_prime_UTR";
}
print join "\t",@$f;
print "\tParent=$tid;Name=$trans_name\n";
}elsif($f->[2] eq "UTR" && $cds_seq == 1){
if($f->[6] eq "-"){
$f->[2] = "five_prime_UTR";
}elsif($f->[6] eq "+"){
$f->[2] = "three_prime_UTR";
}
print join "\t",@$f;
print "\tParent=$tid;Name=$trans_name\n";
}else{
if($f->[2] eq "CDS"){
$cds_seq = 1;
}
print join "\t",@$f;
print "\tParent=$tid;Name=$trans_name\n";
}
}
}
}
}
# read in gtf file
sub read_gtf{
my ($f, $gtf, $c) = @_;
open IN,"$f" || die $!;
while(<IN>){
next if(/^#/);
chomp;
my @t = split /\t/;
next unless($c->{$t[0]});
my $chr = $c->{$t[0]};
$t[0] = $chr;
my $start = $t[3];
my $end = $t[4];
my $strand = $t[6];
($t[3],$t[4]) = ($t[4],$t[3]) if($t[3]>$t[4]);
my ($geneid,$transid) = ("","");
$geneid = $1 if($t[8] =~ /gene_id "([^"]+)";/);
$transid = $1 if($t[8] =~ /transcript_id "([^"]+)";/ && $t[2] ne "gene");
if($t[2] eq "gene"){
$gtf{$chr}{$geneid}{'gene'} = \@t;
}elsif($t[2] eq "transcript"){
$gtf{$chr}{$geneid}{$transid}{'trans'} = \@t;
}else{
push @{$gtf{$chr}{$geneid}{$transid}{'flag'}},\@t;
}
}
close IN;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment