Created
July 16, 2014 02:59
-
-
Save zhoujj2013/024ef0789c1f5538d568 to your computer and use it in GitHub Desktop.
gtf2gff3.pl: convert ensembl gtf to gff3 format, just for JBrowser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#/usr/bin/perl -w | |
use strict; | |
use Data::Dumper; | |
## Written by [email protected] | |
## convert ensembl gtf to gff3 format, just for JBrowser | |
my ($gtf_f, $chr_list_f) = @ARGV; # ensembl gtf file, gtf from different source are different. | |
my %chr; | |
open IN,"$chr_list_f" || die $!; | |
while(<IN>){ | |
chomp; | |
my @t = split /\t/; | |
$chr{$t[0]} = $t[1]; | |
} | |
close IN; | |
my %gtf; | |
&read_gtf($gtf_f, \%gtf, \%chr); | |
#print Dumper(\%gtf); | |
# output gff file | |
print "##gff-version 3\n"; | |
foreach my $chr (sort keys %gtf){ | |
foreach my $gid (keys %{$gtf{$chr}}){ | |
# output the gene records | |
my $attr = pop @{$gtf{$chr}{$gid}{'gene'}}; | |
print "####\n"; | |
print join "\t",@{$gtf{$chr}{$gid}{'gene'}}; | |
print "\tID=$gid\n"; | |
foreach my $tid (keys %{$gtf{$chr}{$gid}}){ | |
next if($tid eq "gene"); | |
# output transcript record | |
$gtf{$chr}{$gid}{$tid}{'trans'}->[2] = "mRNA"; | |
my $attr_trans = pop @{$gtf{$chr}{$gid}{$tid}{'trans'}}; | |
my $trans_name = $1 if($attr_trans =~ /transcript_name "([^"]+)"/); | |
print join "\t",@{$gtf{$chr}{$gid}{$tid}{'trans'}}; | |
print "\tID=$tid;Name=$trans_name;Parent=$gid"; | |
print "\n"; | |
# output CDS, exon, three_prime_UTR, five_prime_UTR | |
my $flag = $gtf{$chr}{$gid}{$tid}{'flag'}; | |
my @flag_sorted = sort {$a->[3] <=> $b->[3]} @$flag; | |
my $cds_seq = 0; | |
foreach my $f (@flag_sorted){ | |
my $f_attr = pop(@$f); | |
if($f->[2] eq "UTR" && $cds_seq == 0){ | |
if($f->[6] eq "+"){ | |
$f->[2] = "five_prime_UTR"; | |
}elsif($f->[6] eq "-"){ | |
$f->[2] = "three_prime_UTR"; | |
} | |
print join "\t",@$f; | |
print "\tParent=$tid;Name=$trans_name\n"; | |
}elsif($f->[2] eq "UTR" && $cds_seq == 1){ | |
if($f->[6] eq "-"){ | |
$f->[2] = "five_prime_UTR"; | |
}elsif($f->[6] eq "+"){ | |
$f->[2] = "three_prime_UTR"; | |
} | |
print join "\t",@$f; | |
print "\tParent=$tid;Name=$trans_name\n"; | |
}else{ | |
if($f->[2] eq "CDS"){ | |
$cds_seq = 1; | |
} | |
print join "\t",@$f; | |
print "\tParent=$tid;Name=$trans_name\n"; | |
} | |
} | |
} | |
} | |
} | |
# read in gtf file | |
sub read_gtf{ | |
my ($f, $gtf, $c) = @_; | |
open IN,"$f" || die $!; | |
while(<IN>){ | |
next if(/^#/); | |
chomp; | |
my @t = split /\t/; | |
next unless($c->{$t[0]}); | |
my $chr = $c->{$t[0]}; | |
$t[0] = $chr; | |
my $start = $t[3]; | |
my $end = $t[4]; | |
my $strand = $t[6]; | |
($t[3],$t[4]) = ($t[4],$t[3]) if($t[3]>$t[4]); | |
my ($geneid,$transid) = ("",""); | |
$geneid = $1 if($t[8] =~ /gene_id "([^"]+)";/); | |
$transid = $1 if($t[8] =~ /transcript_id "([^"]+)";/ && $t[2] ne "gene"); | |
if($t[2] eq "gene"){ | |
$gtf{$chr}{$geneid}{'gene'} = \@t; | |
}elsif($t[2] eq "transcript"){ | |
$gtf{$chr}{$geneid}{$transid}{'trans'} = \@t; | |
}else{ | |
push @{$gtf{$chr}{$geneid}{$transid}{'flag'}},\@t; | |
} | |
} | |
close IN; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment