Last active
July 5, 2023 20:11
-
-
Save rknx/6ecfa30969dcb46a09a04c7bab4715d9 to your computer and use it in GitHub Desktop.
Fasta assembly metrics by assem_mets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##/usr/bin/env bash | |
# ------------------------------------------------------------------------------ | |
# Script: Assembly metrics | |
# Author: Anuj Sharma | |
# Github: rknx/ | |
# Email: [email protected] | |
# Description: This script is used to calculate various assembly metrics from | |
# a single multiFASTA file, often a genome assembly. | |
# Last modified: <2022/08/04> | |
# Copyright (C) 2022 Anuj Sharma ([email protected]) | |
# Permission to copy and modify is granted under the GPL license | |
# ------------------------------------------------------------------------------ | |
# --- Default values ----------------------------------------------------------- | |
VERSION=0.4.0 | |
XLOG= | |
XTMP= | |
# --- Usage -------------------------------------------------------------------- | |
read -r -d '' USAGE << EOF | |
USAGE | |
$(basename "$0") [options] [--] ... [infile] | |
DESCRIPTION | |
This script computes several assembly metrics for a multiFASTA file. | |
IMPLEMENTATION | |
version $(basename ${0%.sh}) v$VERSION | |
author Anuj Sharma | |
copyright Copyright © 2022 Anuj Sharma ([email protected]) | |
license GNU General Public License | |
OPTIONS | |
-h | --help | |
Display help text. | |
-v | --version | |
Display software version. | |
-o | --outfile [] | |
Specify output file. | |
-s | --short | |
Do not display individual contig information. | |
-V | --verbose | |
Write debug messages to logfile. | |
EOF | |
# --- Options processing ------------------------------------------------------- | |
# Return help if there are no arguments | |
if [ $# == 0 ] ; then echo "$USAGE"; exit 1; fi | |
# Pre-process arguments | |
ARGS=() | |
ENDOPT= | |
while [[ $# -gt 0 ]]; do | |
arg="$1"; shift | |
case "${ENDOPT}${arg}" in | |
--) ARGS+=("$arg"); ENDOPT=1 ;; | |
--*=*) ARGS+=("${arg%%=*}" "${arg#*=}") ;; | |
--*) ARGS+=("$arg") ;; | |
-*) for i in $(seq 2 ${#arg}); do ARGS+=("-${arg:i-1:1}"); done ;; | |
*) ARGS+=("$arg") ;; | |
esac | |
done | |
# Apply pre-processed options | |
set -- "${ARGS[@]}" | |
# Parse options | |
ENDOPT= | |
SHORT= | |
INFILES=() | |
while [[ $# -gt 0 ]]; do | |
case "${ENDOPT}${1}" in | |
# -x|--xxxxx) fun ;; # option | |
# -x|--xxxxx) shift; var="$1" ;; # one time argument with value | |
# -x|--xxxxx) shift; multivar+=("$1") ;; # repeated argument with values | |
-h|--help) echo "$USAGE" >&2; exit 0 ;; | |
-v|--version) echo "$(basename ${0%.sh}) v$VERSION" >&2; exit 0 ;; | |
-V|--verbose) export XLOG=1 ;; | |
-o|--outfile) shift; export OUTFILE="$1" ;; | |
-s|--short) export SHORT=1;; | |
--) ENDOPT=1 ;; | |
-*) echo "ERROR: Unrecognized argument: $1" >&2; | |
echo "$USAGE" >&2; exit 1 ;; | |
*) INFILE+=("$1") ;; | |
esac | |
shift | |
done | |
# Helper function for logging and outfile | |
source "$(dirname $(which "$0"))/logging.sh" | |
# Check if input is provided | |
[[ -z "$INFILE" ]] && \ | |
__ts "No input FASTA file for computing metrics." && \ | |
exit 1 | |
# Check if multiple input is provided | |
[[ "${#INFILE[@]}" -gt "1" ]] && \ | |
__ts "Multiple input files provided. Only first one is used." && \ | |
INFILE=${INFILE[0]} | |
# Input information | |
__tsv "Processing file: $INFILE." | |
echo "Input file: $INFILE" | |
# Get lengths of contigs | |
__tsv "Calculating contig lengths." | |
arr=( $( | |
awk ' | |
BEGIN{RS=">";ORS="\n";FS="\n"} | |
NR>1{$1 = ""; print gsub(/[aAtTuUwWgGcCsSrRyYkKmMbBdDhHvVnN]/, "")} | |
' "$INFILE" | \ | |
sort -nr | |
) ) | |
__tsv "Finished contig length." | |
# Get length of assembly | |
size=$((${arr[@]/%/+}0)) | |
# Basic assembly information | |
__tsv "Calculating general assembly information." | |
echo -e "\n---- Assembly overview -------------------------" | |
echo "Total assembly length: $size" | |
echo "Number of contigs: ${#arr[@]}" | |
echo "Largest contig size: ${arr[0]}" | |
echo "Smallest contig size: ${arr[-1]}" | |
__tsv "Calculating GC percent." | |
grep -v ">" "$INFILE" | tr -d '\n' | \ | |
awk '{ | |
gcs = gsub(/[gGcCsS]/, "") | |
atuw = gsub(/[aAtTuUwW]/, "") | |
printf "GC content: %.2f%%\n", gcs * 100 / ( gcs + atuw ) | |
}' | |
__tsv "Finished general assembly information." | |
# Assembly quality (N50 etc.) | |
__tsv "Calculating assembly quality information." | |
echo -e "\n---- Assembly quality --------------------------" | |
for i in 50 75 90 | |
do | |
__tsv "Processing N$i." | |
awk -v t="$size" -vRS=" " -vf="$i" '{d++}(c+=$1)>=t*f/100 \ | |
{printf "N%s: %s\tL%s: %s\n", f, $1, f, d; exit}' <<< ${arr[@]} | |
done | |
__tsv "Finished assembly quality information." | |
# End if short output | |
[[ "$SHORT" -eq 1 ]] && \ | |
__tsv "Short output complete." && \ | |
exit 1 | |
# Contig size distribution | |
__tsv "Calculating contig size distribution." | |
echo -e "\n---- Contig size distribution ------------------" | |
for i in {3..6} | |
do | |
awk -vRS=" " -vf=$(( 10**$i )) ' | |
$1 < f { printf ">%-10s\t%s contigs\t%s bp\n", f, d+0, c+0; exit } | |
{ d++; c+=$1 } | |
' <<< ${arr[@]} | |
done | |
__tsv "Finished contig size distribition." | |
# Individual contig information | |
__tsv "Calculating contig specific values." | |
echo -e "\n---- Contig information ------------------------" | |
awk -vRS=">" -vORS="\n" -vFS="\n" ' | |
BEGIN {print "Name", "Length", "%GC"} | |
NR>1 { | |
split($1, j, " "); $1 = "" | |
atuw=gsub(/[aAtTuUwW]/, "") | |
gcs=gsub(/[gGcCsS]/, "") | |
rykmbdhvn=gsub(/[rRyYkKmMbBdDhHvVnN]/, "") | |
printf "%s %d %.2f\n", j[1], atuw+gcs+rykmbdhvn, gcs/(gcs+atuw)*100 | |
} | |
' "$INFILE" | column -t | |
__tsv "Finished contig specific values." | |
__tsv "Assembly metric all done for $INFILE." |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Error on N50 calculation in v0.3.0 is fixed in v0.4.0