Created
July 1, 2014 15:56
-
-
Save zhoujj2013/199b29fe6e0c34afe7b8 to your computer and use it in GitHub Desktop.
Standardize GenBank FASTA Format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Standardize GenBank FASTA headers.
#
# Usage: perl this_script.pl input.fasta > output.fasta
#
# Reads a FASTA file (e.g. one downloaded from Batch Entrez,
# http://www.ncbi.nlm.nih.gov/sites/batchentrez), rewrites each header line
# and prints every record to STDOUT as a header line followed by the sequence
# joined onto a single line.
use strict;
use warnings;

my $f = shift;    # input fasta file downloaded from Batch Entrez
defined $f or die "Usage: $0 <input.fasta>\n";

# Three-argument open with a lexical handle. The low-precedence "or" makes
# the failure check apply to open() itself; with "||" it would bind to the
# file name and a failed open would go unnoticed.
open my $in, '<', $f or die "Cannot open '$f': $!\n";

# Discard everything up to and including the first ">" so that the next
# line-wise read returns the first header without its leading ">".
{
    local $/ = ">";
    <$in>;
}

while (my $id = <$in>) {
    chomp $id;
    $id =~ s/\r\z//;    # tolerate CRLF line endings

    # replace "gi|634859302|gb|KJ524455.1|" > "gi_634859302|gb|KJ524455.1|"
    $id =~ s/\Agi\|/gi_/;

    # replace "gi_634859302|gb|KJ524455.1|" > "gi_634859302 KJ524455.1|"
    # you can add more database tags to this list
    for my $db (qw(gb ref dbj)) {
        $id =~ s/\|\Q$db\E\|/ /g;
    }

    # replace "gi_634859302 KJ524455.1| desc" > "gi_634859302 KJ524455.1 desc"
    # (only a "|" that is directly followed by a space is removed)
    $id =~ s/\| / /g;

    # get seq: read up to the next record's ">" (or end of file)
    my $seq;
    {
        local $/ = ">";
        $seq = <$in>;
        $seq = '' unless defined $seq;    # header with no sequence at EOF
        chomp $seq;                       # strips the trailing ">" separator
    }
    $seq =~ tr/\r\n//d;                   # join the sequence onto one line

    # stdout
    print ">$id\n$seq\n";
}

close $in;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment