Created
October 29, 2012 12:22
-
-
Save nathanhaigh/3973241 to your computer and use it in GitHub Desktop.
Convert an SSPACE evidence file into an AGP 2.0 file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# | |
# Usage: sspace_evidence2agp.pl formattedcontigs.fasta < final.evidence > out.agp 2> out.stderr | |
# e.g. sspace_evidence2agp.pl intermediate_results/standard_output.formattedcontigs.fasta < standard_output.final.evidence > standard_output.agp 2> standard_output.agp.stderr | |
# | |
# What this script does: | |
# 1) Uses the *.final.evidence file created by SSPACE to generate an AGP v2.0 file. | |
# 2) Uses information in the *.formattedcontigs.fasta file to recover the original contig | |
# names. | |
# 3) Non-positive length gaps are output as component_type=U and gap length 100, as per the | |
# specifications. Notification of this is sent to STDERR so you can examine possible overlap | |
# and joining of contigs either side of such a gap. | |
# | |
# I have tried to follow recommendations as set out at: | |
# http://www.ncbi.nlm.nih.gov/projects/genome/assembly/agp/AGP_Specification_v2.0.shtml | |
# AGP validation can be performed at: | |
# http://www.ncbi.nlm.nih.gov/projects/genome/assembly/agp/agp_validate.cgi | |
use strict; | |
use warnings; | |
my $scaffold; | |
my $current_start_bp; | |
my $part_number; | |
my $n_scaffolds = 0; | |
my $component_type; | |
my $contigs = shift @ARGV; | |
my %tig2orig; | |
open(CONTIGS, '<', $contigs) or die "Couldn't open formattedcontigs.fasta file: $!\n"; | |
while(<CONTIGS>) { | |
next unless /^>/; | |
chomp; | |
my @fields = split '\|'; | |
$fields[0] =~ s/^>//; | |
$fields[-1] =~ s/^seed://; | |
$tig2orig{$fields[0]} = $fields[-1]; | |
} | |
close CONTIGS; | |
print "##agp-version 2.0\n"; | |
while(<>){ | |
chomp; | |
if(/^\s*$/ || eof()) { | |
# clear current values in objects | |
#exit if $n_scaffolds == 500; | |
} else { | |
my @fields = split '\|'; | |
#print "@fields\n"; | |
if($fields[0] =~ s/^>(.+?)$//) { | |
# sort of evidence for a new scaffold | |
$n_scaffolds++; | |
$scaffold = $1; | |
$current_start_bp = 1; | |
$part_number = 1; | |
} elsif($fields[0] =~ /^(f|r)_(.+?)$/) { | |
my $orientation = $1; | |
my $tig = $2; | |
my $length = $fields[1]; $length =~ s/^size//; | |
$component_type = 'W'; | |
# output this fragment in AGP format | |
printf "%s\t%d\t%d\t%d\t%s\t%s\t1\t%d\t%s\n", | |
$scaffold, | |
$current_start_bp, | |
$current_start_bp + $length - 1, | |
$part_number++, | |
$component_type, | |
$tig2orig{'con'.$tig}, | |
$length, | |
$orientation eq 'f' ? '+' : '-' | |
; | |
$current_start_bp += $length; | |
if(scalar@fields > 2) { | |
# this contig is also followed by a gap | |
my $n_links = $fields[2]; $n_links =~ s/^links//; | |
my $n_gaps = $fields[3]; $n_gaps =~ s/^gaps//; | |
$component_type = 'N'; | |
if($n_gaps < 1) { | |
print STDERR "Gap with non-positive length ($n_gaps) found at part number $part_number in object $scaffold. Setting the type and size to U and 100 respectively\n"; | |
$component_type = 'U'; | |
$n_gaps = 100; | |
#next; | |
} | |
# output the gap | |
printf "%s\t%d\t%d\t%d\t%s\t%d\tscaffold\tyes\tpaired-ends\n", | |
$scaffold, | |
$current_start_bp, | |
$current_start_bp + $n_gaps - 1, | |
$part_number++, | |
$component_type, | |
$n_gaps | |
; | |
$current_start_bp += $n_gaps; | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment