Instantly share code, notes, and snippets.
Created
May 14, 2026 19:06
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
-
Save tomkinsc/47f9e767edf5c7a2bc9871256cc361af to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| set -euo pipefail | |
| # ============================ | |
| # Defaults | |
| # ============================ | |
| READLEN=150 | |
| INSERT_MEAN=350 | |
| INSERT_SD=40 | |
| ERROR_RATE=0.0003 # see: https://drive5.com/usearch/manual/quality_score.html | |
| # NOTE on wgsim divergence/substitutions: | |
| # wgsim's -r parameter applies substitutions to the *reference sequence* first, | |
| # producing ONE substituted haplotype (a single substituted "template"), and then | |
| # samples reads from that template. This is *not* per-read random substitutions. | |
| # If you want per-read diversity / a quasispecies model, wgsim -r is not that. | |
| HOST_SUB_RATE=0.0 | |
| VIRAL_DIVERGENCE=0.05 | |
| HOST_INDEL_RATE=0.0 | |
| VIRAL_INDEL_RATE=0.0 | |
| TOTAL_PAIRS=2000000 | |
| VIRAL_FRACTION=0.02 | |
| RANDOM_SEED=42 | |
| OUT_PREFIX="" # optional; derived later if empty | |
| DETERMINISTIC_PREFIX=0 # if 1, omit timestamp in auto-derived prefix | |
| # Source tags to embed in FASTQ read headers | |
| HOST_TAG="host" | |
| VIRAL_TAG="virus" | |
| # Options | |
| KEEP_TMP=0 | |
| OUT_DIR="." # output directory (default: current dir) | |
| NO_PARALLEL=0 # if 1, run everything serially | |
| USE_GNU_SHUF=0 # if 1, shuffle using GNU shuf | |
| # ============================ | |
| # Usage / help | |
| # ============================ | |
| usage() { | |
| cat <<EOF | |
| Usage: | |
| $(basename "$0") \\ | |
| --viral-fasta VIRUS.fa \\ | |
| --host-fasta HOST.fa \\ | |
| [--out-dir DIR] \\ | |
| [--out-prefix PREFIX | --deterministic-prefix] \\ | |
| [--keep-tmp] [--no-parallel] [--use-gnu-shuf] [options] | |
| Required arguments: | |
| --viral-fasta FILE Viral reference FASTA | |
| --host-fasta FILE Host reference FASTA | |
| Output control: | |
| --out-dir DIR Directory for outputs (default: .). Created if missing. | |
| --out-prefix PREFIX Output FASTQ prefix (overrides auto-derivation) | |
| --deterministic-prefix If deriving prefix automatically, omit timestamp | |
| --keep-tmp Preserve temp directory (prints its path at the end) | |
| Repro / debugging: | |
| --no-parallel Disable backgrounding of steps (serial execution) | |
| Shuffling: | |
| --use-gnu-shuf Use GNU shuf for shuffling paired records | |
| (If the script estimates awk shuffle may exceed available memory, it will auto-enable GNU shuf.) | |
| Mixture: | |
| --total-pairs N Total read pairs (default: ${TOTAL_PAIRS}) | |
| --viral-fraction F Fraction viral [0–1] (default: ${VIRAL_FRACTION}) | |
| Read simulation: | |
| --readlen N Read length (default: ${READLEN}) | |
| --insert-mean N Mean insert size (default: ${INSERT_MEAN}) | |
| --insert-sd N Insert size SD (default: ${INSERT_SD}) | |
| --error-rate F Sequencing error rate (default: ${ERROR_RATE}) | |
| --viral-divergence F Viral divergence applied via wgsim -r (default: ${VIRAL_DIVERGENCE}) | |
| --host-sub-rate F Host substitution rate via wgsim -r (default: ${HOST_SUB_RATE}) | |
| --seed N RNG seed (default: ${RANDOM_SEED}) | |
| Other: | |
| -h, --help Show this help and exit | |
| Outputs: | |
| DIR/PREFIX_R1.fq.gz | |
| DIR/PREFIX_R2.fq.gz | |
| Notes: | |
| - wgsim -r creates a single substituted haplotype (template) before read sampling; | |
| it does NOT apply substitutions independently per read. | |
| - FASTQ headers are re-written to encode source: | |
| @host|<wgsim_name> and @virus|<wgsim_name> | |
| - Fisher–Yates shuffle in awk is deterministic with --seed but loads all records in RAM. | |
| GNU shuf path is disk-based and reproducible here via --random-source generated from --seed. | |
| EOF | |
| } | |
| # ============================ | |
| # No args → help | |
| # ============================ | |
| if [[ $# -eq 0 ]]; then | |
| usage | |
| exit 1 | |
| fi | |
| # ============================ | |
| # Argument parsing | |
| # ============================ | |
| while [[ $# -gt 0 ]]; do | |
| case "$1" in | |
| --viral-fasta) VIRAL_FA="$2"; shift 2 ;; | |
| --host-fasta) HOST_FA="$2"; shift 2 ;; | |
| --out-dir) OUT_DIR="$2"; shift 2 ;; | |
| --out-prefix) OUT_PREFIX="$2"; shift 2 ;; | |
| --deterministic-prefix) DETERMINISTIC_PREFIX=1; shift 1 ;; | |
| --keep-tmp) KEEP_TMP=1; shift 1 ;; | |
| --no-parallel) NO_PARALLEL=1; shift 1 ;; | |
| --use-gnu-shuf) USE_GNU_SHUF=1; shift 1 ;; | |
| --total-pairs) TOTAL_PAIRS="$2"; shift 2 ;; | |
| --viral-fraction)VIRAL_FRACTION="$2"; shift 2 ;; | |
| --readlen) READLEN="$2"; shift 2 ;; | |
| --insert-mean) INSERT_MEAN="$2"; shift 2 ;; | |
| --insert-sd) INSERT_SD="$2"; shift 2 ;; | |
| --error-rate) ERROR_RATE="$2"; shift 2 ;; | |
| --viral-divergence) VIRAL_DIVERGENCE="$2"; shift 2 ;; | |
| --host-sub-rate) HOST_SUB_RATE="$2"; shift 2 ;; | |
| --seed) RANDOM_SEED="$2"; shift 2 ;; | |
| -h|--help) | |
| usage | |
| exit 0 | |
| ;; | |
| *) | |
| echo "ERROR: Unknown argument: $1" >&2 | |
| usage | |
| exit 1 | |
| ;; | |
| esac | |
| done | |
| # ============================ | |
| # Validation | |
| # ============================ | |
| : "${VIRAL_FA:?Missing --viral-fasta}" | |
| : "${HOST_FA:?Missing --host-fasta}" | |
| # ============================ | |
| # Memory-based auto-selection of shuffle mode | |
| # ============================ | |
| # Estimate bytes needed for awk-based shuffle: | |
| # ((TOTAL_PAIRS * ((READLEN*2)+60)) * 2) | |
| # (A rough proxy for in-memory storage of paired-record lines + overhead.) | |
| AWK_SHUF_EST_BYTES=$(( (TOTAL_PAIRS * ((READLEN * 2) + 60)) * 2 )) | |
| get_mem_available_bytes() { | |
| # Linux: prefer MemAvailable from /proc/meminfo (kB) | |
| if [[ -r /proc/meminfo ]]; then | |
| local kb | |
| kb="$(awk '/^MemAvailable:/ {print $2; exit}' /proc/meminfo || true)" | |
| if [[ -n "${kb:-}" ]]; then | |
| echo $(( kb * 1024 )) | |
| return 0 | |
| fi | |
| fi | |
| # macOS: approximate "available" via vm_stat (free + inactive + speculative + purgeable) * page_size | |
| if command -v vm_stat >/dev/null 2>&1; then | |
| local page_size | |
| page_size="$(vm_stat | awk -F' ' '/page size of/ {gsub(/[^0-9]/,"",$8); print $8; exit}')" | |
| if [[ -z "${page_size:-}" ]]; then | |
| page_size="$(vm_stat | awk -F' ' '/page size of/ {for(i=1;i<=NF;i++) if($i ~ /^[0-9]+$/){print $i; exit}}')" | |
| fi | |
| page_size="${page_size:-4096}" | |
| local free inactive speculative purgeable | |
| free="$(vm_stat | awk -F': *' '/^Pages free/ {gsub(/\./,"",$2); print $2; exit}')" | |
| inactive="$(vm_stat | awk -F': *' '/^Pages inactive/ {gsub(/\./,"",$2); print $2; exit}')" | |
| speculative="$(vm_stat | awk -F': *' '/^Pages speculative/ {gsub(/\./,"",$2); print $2; exit}')" | |
| purgeable="$(vm_stat | awk -F': *' '/^Pages purgeable/ {gsub(/\./,"",$2); print $2; exit}')" | |
| free="${free:-0}" | |
| inactive="${inactive:-0}" | |
| speculative="${speculative:-0}" | |
| purgeable="${purgeable:-0}" | |
| echo $(( (free + inactive + speculative + purgeable) * page_size )) | |
| return 0 | |
| fi | |
| return 1 | |
| } | |
| if mem_avail_bytes="$(get_mem_available_bytes)"; then | |
| if [[ "$USE_GNU_SHUF" -eq 0 && "$AWK_SHUF_EST_BYTES" -gt "$mem_avail_bytes" ]]; then | |
| USE_GNU_SHUF=1 | |
| echo "WARNING: Estimated awk-shuffle memory requirement exceeds available memory." >&2 | |
| echo " Estimated (bytes): ${AWK_SHUF_EST_BYTES}" >&2 | |
| echo " Available (bytes): ${mem_avail_bytes}" >&2 | |
| echo " Auto-enabling: --use-gnu-shuf" >&2 | |
| fi | |
| else | |
| echo "WARNING: Could not determine available memory; leaving shuffle mode unchanged." >&2 | |
| fi | |
| # ============================ | |
| # Check required executables | |
| # ============================ | |
| need_cmd() { command -v "$1" >/dev/null 2>&1; } | |
| missing=() | |
| for c in wgsim awk paste cut tr gzip python3 mktemp basename cat mv mkdir; do | |
| need_cmd "$c" || missing+=("$c") | |
| done | |
| if [[ -z "$OUT_PREFIX" && "$DETERMINISTIC_PREFIX" -eq 0 ]]; then | |
| need_cmd date || missing+=(date) | |
| fi | |
| if [[ "$USE_GNU_SHUF" -eq 1 ]]; then | |
| need_cmd shuf || missing+=(shuf) | |
| fi | |
| if (( ${#missing[@]} > 0 )); then | |
| echo "ERROR: Missing required executables on PATH:" >&2 | |
| for c in "${missing[@]}"; do echo " - $c" >&2; done | |
| exit 1 | |
| fi | |
| # ============================ | |
| # Helpers | |
| # ============================ | |
| strip_ext() { | |
| local f | |
| f="$(basename "$1")" | |
| f="${f%.fa}" | |
| f="${f%.fasta}" | |
| f="${f%.fna}" | |
| f="${f%.fas}" | |
| echo "$f" | |
| } | |
| tag_fastq_headers() { | |
| local tag="$1" | |
| local in_fq="$2" | |
| local out_fq="$3" | |
| awk -v tag="$tag" ' | |
| (NR % 4) == 1 { | |
| if ($0 ~ /^@/) sub(/^@/, "@" tag "|") | |
| else $0 = "@" tag "|" $0 | |
| } | |
| { print } | |
| ' "$in_fq" > "$out_fq" | |
| } | |
| wait_pids() { | |
| local label="$1"; shift | |
| local -a pids=( "$@" ) | |
| local pid rc=0 | |
| for pid in "${pids[@]}"; do | |
| [[ -z "$pid" ]] && continue | |
| if ! wait "$pid"; then | |
| echo "ERROR: Background job failed during: ${label} (pid=${pid})" >&2 | |
| rc=1 | |
| fi | |
| done | |
| return "$rc" | |
| } | |
| # Fix A: set PID into a named variable (avoid command substitution capturing stdout) | |
| run_maybe_bg() { | |
| # Usage: run_maybe_bg pid_varname command arg1 arg2 ... | |
| local __pidvar="$1"; shift | |
| if [[ "$NO_PARALLEL" -eq 1 ]]; then | |
| "$@" | |
| printf -v "$__pidvar" '%s' "" | |
| else | |
| "$@" & | |
| printf -v "$__pidvar" '%s' "$!" | |
| fi | |
| } | |
| assert_first_line_prefix() { | |
| local file="$1" | |
| local expected="$2" | |
| local first | |
| first="$(awk 'NR==1{print; exit}' "$file")" | |
| if [[ "${first}" != "${expected}"* ]]; then | |
| echo "ERROR: Sanity check failed for $file" >&2 | |
| echo " Expected first line to start with: ${expected}" >&2 | |
| echo " Observed first line: ${first}" >&2 | |
| exit 1 | |
| fi | |
| } | |
| make_shuf_random_source() { | |
| local n_lines="$1" | |
| local seed="$2" | |
| local out_file="$3" | |
| python3 - <<EOF | |
| import random | |
| n_lines = int(${n_lines}) | |
| seed = int(${seed}) | |
| nbytes = n_lines * 8 + 1024 * 1024 | |
| random.seed(seed) | |
| with open(${out_file!r}, "wb") as f: | |
| chunk = 1024 * 1024 | |
| remaining = nbytes | |
| while remaining > 0: | |
| m = min(chunk, remaining) | |
| f.write(bytes(random.randrange(256) for _ in range(m))) | |
| remaining -= m | |
| EOF | |
| } | |
| # ============================ | |
| # Create output directory | |
| # ============================ | |
| mkdir -p "$OUT_DIR" | |
| # ============================ | |
| # Derive output prefix if needed | |
| # ============================ | |
| if [[ -z "$OUT_PREFIX" ]]; then | |
| OUT_PREFIX_BASE="simulated_$(strip_ext "$VIRAL_FA")__spiked_into__$(strip_ext "$HOST_FA")"\ | |
| "__tp${TOTAL_PAIRS}__vf${VIRAL_FRACTION}__vdiv${VIRAL_DIVERGENCE}__hdiv${HOST_SUB_RATE}"\ | |
| "__rl${READLEN}__err${ERROR_RATE}" | |
| if [[ "$DETERMINISTIC_PREFIX" -eq 1 ]]; then | |
| OUT_PREFIX="${OUT_PREFIX_BASE}" | |
| else | |
| OUT_PREFIX="${OUT_PREFIX_BASE}__$(date +'%Y%m%d%H%M')" | |
| fi | |
| fi | |
| OUT_R1="${OUT_DIR%/}/${OUT_PREFIX}_R1.fq" | |
| OUT_R2="${OUT_DIR%/}/${OUT_PREFIX}_R2.fq" | |
| # ============================ | |
| # Python float math | |
| # ============================ | |
| VIRAL_PAIRS="$( | |
| python3 - <<EOF | |
| total = ${TOTAL_PAIRS} | |
| frac = ${VIRAL_FRACTION} | |
| print(round(total * frac)) | |
| EOF | |
| )" | |
| HOST_PAIRS=$(( TOTAL_PAIRS - VIRAL_PAIRS )) | |
| echo "Simulation plan:" | |
| echo " Viral FASTA: $VIRAL_FA" | |
| echo " Host FASTA: $HOST_FA" | |
| echo " Viral pairs: $VIRAL_PAIRS (div=${VIRAL_DIVERGENCE})" | |
| echo " Host pairs: $HOST_PAIRS (sub=${HOST_SUB_RATE})" | |
| echo " Output dir: $OUT_DIR" | |
| echo " Output prefix: $OUT_PREFIX" | |
| echo " Parallel: $([[ "$NO_PARALLEL" -eq 1 ]] && echo no || echo yes)" | |
| echo " Shuffle mode: $([[ "$USE_GNU_SHUF" -eq 1 ]] && echo "GNU shuf" || echo "awk Fisher–Yates")" | |
| echo | |
| TMPDIR="$(mktemp -d)" | |
| cleanup() { | |
| if [[ "$KEEP_TMP" -eq 1 ]]; then | |
| return 0 | |
| fi | |
| rm -rf "$TMPDIR" | |
| } | |
| trap cleanup EXIT | |
| # ============================ | |
| # Viral + host reads (maybe parallel) | |
| # ============================ | |
| run_maybe_bg pid_vwgsim \ | |
| wgsim \ | |
| -h \ | |
| -e "$ERROR_RATE" \ | |
| -r "$VIRAL_DIVERGENCE" \ | |
| -R "$VIRAL_INDEL_RATE" \ | |
| -X "$VIRAL_INDEL_RATE" \ | |
| -d "$INSERT_MEAN" \ | |
| -s "$INSERT_SD" \ | |
| -N "$VIRAL_PAIRS" \ | |
| -1 "$READLEN" \ | |
| -2 "$READLEN" \ | |
| -S "$RANDOM_SEED" \ | |
| "$VIRAL_FA" \ | |
| "$TMPDIR/viral_R1.fq" "$TMPDIR/viral_R2.fq" | |
| run_maybe_bg pid_hwgsim \ | |
| wgsim \ | |
| -e "$ERROR_RATE" \ | |
| -r "$HOST_SUB_RATE" \ | |
| -R "$HOST_INDEL_RATE" \ | |
| -X "$HOST_INDEL_RATE" \ | |
| -d "$INSERT_MEAN" \ | |
| -s "$INSERT_SD" \ | |
| -N "$HOST_PAIRS" \ | |
| -1 "$READLEN" \ | |
| -2 "$READLEN" \ | |
| -S "$((RANDOM_SEED+1))" \ | |
| "$HOST_FA" \ | |
| "$TMPDIR/host_R1.fq" "$TMPDIR/host_R2.fq" | |
| wait_pids "wgsim simulation" "${pid_vwgsim:-}" "${pid_hwgsim:-}" | |
| # ============================ | |
| # Re-header FASTQs (maybe parallel) | |
| # ============================ | |
| run_maybe_bg pid_vr1 tag_fastq_headers "$VIRAL_TAG" "$TMPDIR/viral_R1.fq" "$TMPDIR/viral_R1.tagged.fq" | |
| run_maybe_bg pid_vr2 tag_fastq_headers "$VIRAL_TAG" "$TMPDIR/viral_R2.fq" "$TMPDIR/viral_R2.tagged.fq" | |
| run_maybe_bg pid_hr1 tag_fastq_headers "$HOST_TAG" "$TMPDIR/host_R1.fq" "$TMPDIR/host_R1.tagged.fq" | |
| run_maybe_bg pid_hr2 tag_fastq_headers "$HOST_TAG" "$TMPDIR/host_R2.fq" "$TMPDIR/host_R2.tagged.fq" | |
| wait_pids "FASTQ re-headering" "${pid_vr1:-}" "${pid_vr2:-}" "${pid_hr1:-}" "${pid_hr2:-}" | |
| mv "$TMPDIR/viral_R1.tagged.fq" "$TMPDIR/viral_R1.fq" | |
| mv "$TMPDIR/viral_R2.tagged.fq" "$TMPDIR/viral_R2.fq" | |
| mv "$TMPDIR/host_R1.tagged.fq" "$TMPDIR/host_R1.fq" | |
| mv "$TMPDIR/host_R2.tagged.fq" "$TMPDIR/host_R2.fq" | |
| assert_first_line_prefix "$TMPDIR/viral_R1.fq" "@${VIRAL_TAG}|" | |
| assert_first_line_prefix "$TMPDIR/viral_R2.fq" "@${VIRAL_TAG}|" | |
| assert_first_line_prefix "$TMPDIR/host_R1.fq" "@${HOST_TAG}|" | |
| assert_first_line_prefix "$TMPDIR/host_R2.fq" "@${HOST_TAG}|" | |
| # ============================ | |
| # Mix host + virus (maybe parallel) | |
| # ============================ | |
| run_maybe_bg pid_cat1 bash -c "cat '$TMPDIR/host_R1.fq' '$TMPDIR/viral_R1.fq' > '$TMPDIR/mix_R1.fq'" | |
| run_maybe_bg pid_cat2 bash -c "cat '$TMPDIR/host_R2.fq' '$TMPDIR/viral_R2.fq' > '$TMPDIR/mix_R2.fq'" | |
| wait_pids "FASTQ concatenation" "${pid_cat1:-}" "${pid_cat2:-}" | |
| # ============================ | |
| # Shuffle paired records | |
| # ============================ | |
| paste -d '\t' \ | |
| <(paste - - - - < "$TMPDIR/mix_R1.fq") \ | |
| <(paste - - - - < "$TMPDIR/mix_R2.fq") \ | |
| > "$TMPDIR/paired.tsv" | |
| if [[ "$USE_GNU_SHUF" -eq 1 ]]; then | |
| make_shuf_random_source "$TOTAL_PAIRS" "$RANDOM_SEED" "$TMPDIR/shuf.random.bin" | |
| shuf --random-source="$TMPDIR/shuf.random.bin" "$TMPDIR/paired.tsv" > "$TMPDIR/shuf.tsv" | |
| else | |
| awk -v seed="$RANDOM_SEED" ' | |
| BEGIN { srand(seed) } | |
| { a[NR]=$0 } | |
| END { | |
| for(i=NR;i>1;i--){ | |
| j=int(rand()*i)+1 | |
| tmp=a[i]; a[i]=a[j]; a[j]=tmp | |
| } | |
| for(i=1;i<=NR;i++) print a[i] | |
| } | |
| ' "$TMPDIR/paired.tsv" > "$TMPDIR/shuf.tsv" | |
| fi | |
| # ============================ | |
| # Split TSV back to FASTQ (maybe parallel) | |
| # ============================ | |
| run_maybe_bg pid_out1 bash -c "cut -f1-4 '$TMPDIR/shuf.tsv' | tr '\t' '\n' > '$OUT_R1'" | |
| run_maybe_bg pid_out2 bash -c "cut -f5-8 '$TMPDIR/shuf.tsv' | tr '\t' '\n' > '$OUT_R2'" | |
| wait_pids "R1/R2 splitting" "${pid_out1:-}" "${pid_out2:-}" | |
| run_maybe_bg pid_gzip_out1 gzip -9 -f "$OUT_R1" | |
| run_maybe_bg pid_gzip_out2 gzip -9 -f "$OUT_R2" | |
| wait_pids "compressing output" "${pid_gzip_out1:-}" "${pid_gzip_out2:-}" | |
| echo "Done:" | |
| echo " ${OUT_R1}.gz" | |
| echo " ${OUT_R2}.gz" | |
| if [[ "$KEEP_TMP" -eq 1 ]]; then | |
| echo "Temp directory preserved:" | |
| echo " $TMPDIR" | |
| fi |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment