Skip to content

Instantly share code, notes, and snippets.

@tomkinsc
Created May 14, 2026 19:06
Show Gist options
  • Select an option

  • Save tomkinsc/47f9e767edf5c7a2bc9871256cc361af to your computer and use it in GitHub Desktop.

Select an option

Save tomkinsc/47f9e767edf5c7a2bc9871256cc361af to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
set -euo pipefail
# ============================
# Defaults
# ============================
READLEN=150
INSERT_MEAN=350
INSERT_SD=40
ERROR_RATE=0.0003 # see: https://drive5.com/usearch/manual/quality_score.html
# NOTE on wgsim divergence/substitutions:
# wgsim's -r parameter applies substitutions to the *reference sequence* first,
# producing ONE substituted haplotype (a single substituted "template"), and then
# samples reads from that template. This is *not* per-read random substitutions.
# If you want per-read diversity / a quasispecies model, wgsim -r is not that.
HOST_SUB_RATE=0.0
VIRAL_DIVERGENCE=0.05
HOST_INDEL_RATE=0.0
VIRAL_INDEL_RATE=0.0
TOTAL_PAIRS=2000000
VIRAL_FRACTION=0.02
RANDOM_SEED=42
OUT_PREFIX="" # optional; derived later if empty
DETERMINISTIC_PREFIX=0 # if 1, omit timestamp in auto-derived prefix
# Source tags to embed in FASTQ read headers
HOST_TAG="host"
VIRAL_TAG="virus"
# Options
KEEP_TMP=0
OUT_DIR="." # output directory (default: current dir)
NO_PARALLEL=0 # if 1, run everything serially
USE_GNU_SHUF=0 # if 1, shuffle using GNU shuf
# ============================
# Usage / help
# ============================
usage() {
cat <<EOF
Usage:
$(basename "$0") \\
--viral-fasta VIRUS.fa \\
--host-fasta HOST.fa \\
[--out-dir DIR] \\
[--out-prefix PREFIX | --deterministic-prefix] \\
[--keep-tmp] [--no-parallel] [--use-gnu-shuf] [options]
Required arguments:
--viral-fasta FILE Viral reference FASTA
--host-fasta FILE Host reference FASTA
Output control:
--out-dir DIR Directory for outputs (default: .). Created if missing.
--out-prefix PREFIX Output FASTQ prefix (overrides auto-derivation)
--deterministic-prefix If deriving prefix automatically, omit timestamp
--keep-tmp Preserve temp directory (prints its path at the end)
Repro / debugging:
--no-parallel Disable backgrounding of steps (serial execution)
Shuffling:
--use-gnu-shuf Use GNU shuf for shuffling paired records
(If the script estimates awk shuffle may exceed available memory, it will auto-enable GNU shuf.)
Mixture:
--total-pairs N Total read pairs (default: ${TOTAL_PAIRS})
--viral-fraction F Fraction viral [0–1] (default: ${VIRAL_FRACTION})
Read simulation:
--readlen N Read length (default: ${READLEN})
--insert-mean N Mean insert size (default: ${INSERT_MEAN})
--insert-sd N Insert size SD (default: ${INSERT_SD})
--error-rate F Sequencing error rate (default: ${ERROR_RATE})
--viral-divergence F Viral divergence applied via wgsim -r (default: ${VIRAL_DIVERGENCE})
--host-sub-rate F Host substitution rate via wgsim -r (default: ${HOST_SUB_RATE})
--seed N RNG seed (default: ${RANDOM_SEED})
Other:
-h, --help Show this help and exit
Outputs:
DIR/PREFIX_R1.fq.gz
DIR/PREFIX_R2.fq.gz
Notes:
- wgsim -r creates a single substituted haplotype (template) before read sampling;
it does NOT apply substitutions independently per read.
- FASTQ headers are re-written to encode source:
@host|<wgsim_name> and @virus|<wgsim_name>
- Fisher–Yates shuffle in awk is deterministic with --seed but loads all records in RAM.
GNU shuf path is disk-based and reproducible here via --random-source generated from --seed.
EOF
}
# ============================
# No args → help
# ============================
if [[ $# -eq 0 ]]; then
usage
exit 1
fi
# ============================
# Argument parsing
# ============================
while [[ $# -gt 0 ]]; do
case "$1" in
--viral-fasta) VIRAL_FA="$2"; shift 2 ;;
--host-fasta) HOST_FA="$2"; shift 2 ;;
--out-dir) OUT_DIR="$2"; shift 2 ;;
--out-prefix) OUT_PREFIX="$2"; shift 2 ;;
--deterministic-prefix) DETERMINISTIC_PREFIX=1; shift 1 ;;
--keep-tmp) KEEP_TMP=1; shift 1 ;;
--no-parallel) NO_PARALLEL=1; shift 1 ;;
--use-gnu-shuf) USE_GNU_SHUF=1; shift 1 ;;
--total-pairs) TOTAL_PAIRS="$2"; shift 2 ;;
--viral-fraction)VIRAL_FRACTION="$2"; shift 2 ;;
--readlen) READLEN="$2"; shift 2 ;;
--insert-mean) INSERT_MEAN="$2"; shift 2 ;;
--insert-sd) INSERT_SD="$2"; shift 2 ;;
--error-rate) ERROR_RATE="$2"; shift 2 ;;
--viral-divergence) VIRAL_DIVERGENCE="$2"; shift 2 ;;
--host-sub-rate) HOST_SUB_RATE="$2"; shift 2 ;;
--seed) RANDOM_SEED="$2"; shift 2 ;;
-h|--help)
usage
exit 0
;;
*)
echo "ERROR: Unknown argument: $1" >&2
usage
exit 1
;;
esac
done
# ============================
# Validation
# ============================
: "${VIRAL_FA:?Missing --viral-fasta}"
: "${HOST_FA:?Missing --host-fasta}"
# ============================
# Memory-based auto-selection of shuffle mode
# ============================
# Estimate bytes needed for awk-based shuffle:
# ((TOTAL_PAIRS * ((READLEN*2)+60)) * 2)
# (A rough proxy for in-memory storage of paired-record lines + overhead.)
AWK_SHUF_EST_BYTES=$(( (TOTAL_PAIRS * ((READLEN * 2) + 60)) * 2 ))
get_mem_available_bytes() {
# Linux: prefer MemAvailable from /proc/meminfo (kB)
if [[ -r /proc/meminfo ]]; then
local kb
kb="$(awk '/^MemAvailable:/ {print $2; exit}' /proc/meminfo || true)"
if [[ -n "${kb:-}" ]]; then
echo $(( kb * 1024 ))
return 0
fi
fi
# macOS: approximate "available" via vm_stat (free + inactive + speculative + purgeable) * page_size
if command -v vm_stat >/dev/null 2>&1; then
local page_size
page_size="$(vm_stat | awk -F' ' '/page size of/ {gsub(/[^0-9]/,"",$8); print $8; exit}')"
if [[ -z "${page_size:-}" ]]; then
page_size="$(vm_stat | awk -F' ' '/page size of/ {for(i=1;i<=NF;i++) if($i ~ /^[0-9]+$/){print $i; exit}}')"
fi
page_size="${page_size:-4096}"
local free inactive speculative purgeable
free="$(vm_stat | awk -F': *' '/^Pages free/ {gsub(/\./,"",$2); print $2; exit}')"
inactive="$(vm_stat | awk -F': *' '/^Pages inactive/ {gsub(/\./,"",$2); print $2; exit}')"
speculative="$(vm_stat | awk -F': *' '/^Pages speculative/ {gsub(/\./,"",$2); print $2; exit}')"
purgeable="$(vm_stat | awk -F': *' '/^Pages purgeable/ {gsub(/\./,"",$2); print $2; exit}')"
free="${free:-0}"
inactive="${inactive:-0}"
speculative="${speculative:-0}"
purgeable="${purgeable:-0}"
echo $(( (free + inactive + speculative + purgeable) * page_size ))
return 0
fi
return 1
}
if mem_avail_bytes="$(get_mem_available_bytes)"; then
if [[ "$USE_GNU_SHUF" -eq 0 && "$AWK_SHUF_EST_BYTES" -gt "$mem_avail_bytes" ]]; then
USE_GNU_SHUF=1
echo "WARNING: Estimated awk-shuffle memory requirement exceeds available memory." >&2
echo " Estimated (bytes): ${AWK_SHUF_EST_BYTES}" >&2
echo " Available (bytes): ${mem_avail_bytes}" >&2
echo " Auto-enabling: --use-gnu-shuf" >&2
fi
else
echo "WARNING: Could not determine available memory; leaving shuffle mode unchanged." >&2
fi
# ============================
# Check required executables
# ============================
need_cmd() { command -v "$1" >/dev/null 2>&1; }
missing=()
for c in wgsim awk paste cut tr gzip python3 mktemp basename cat mv mkdir; do
need_cmd "$c" || missing+=("$c")
done
if [[ -z "$OUT_PREFIX" && "$DETERMINISTIC_PREFIX" -eq 0 ]]; then
need_cmd date || missing+=(date)
fi
if [[ "$USE_GNU_SHUF" -eq 1 ]]; then
need_cmd shuf || missing+=(shuf)
fi
if (( ${#missing[@]} > 0 )); then
echo "ERROR: Missing required executables on PATH:" >&2
for c in "${missing[@]}"; do echo " - $c" >&2; done
exit 1
fi
# ============================
# Helpers
# ============================
strip_ext() {
local f
f="$(basename "$1")"
f="${f%.fa}"
f="${f%.fasta}"
f="${f%.fna}"
f="${f%.fas}"
echo "$f"
}
tag_fastq_headers() {
local tag="$1"
local in_fq="$2"
local out_fq="$3"
awk -v tag="$tag" '
(NR % 4) == 1 {
if ($0 ~ /^@/) sub(/^@/, "@" tag "|")
else $0 = "@" tag "|" $0
}
{ print }
' "$in_fq" > "$out_fq"
}
wait_pids() {
local label="$1"; shift
local -a pids=( "$@" )
local pid rc=0
for pid in "${pids[@]}"; do
[[ -z "$pid" ]] && continue
if ! wait "$pid"; then
echo "ERROR: Background job failed during: ${label} (pid=${pid})" >&2
rc=1
fi
done
return "$rc"
}
# Fix A: set PID into a named variable (avoid command substitution capturing stdout)
run_maybe_bg() {
# Usage: run_maybe_bg pid_varname command arg1 arg2 ...
local __pidvar="$1"; shift
if [[ "$NO_PARALLEL" -eq 1 ]]; then
"$@"
printf -v "$__pidvar" '%s' ""
else
"$@" &
printf -v "$__pidvar" '%s' "$!"
fi
}
assert_first_line_prefix() {
local file="$1"
local expected="$2"
local first
first="$(awk 'NR==1{print; exit}' "$file")"
if [[ "${first}" != "${expected}"* ]]; then
echo "ERROR: Sanity check failed for $file" >&2
echo " Expected first line to start with: ${expected}" >&2
echo " Observed first line: ${first}" >&2
exit 1
fi
}
make_shuf_random_source() {
local n_lines="$1"
local seed="$2"
local out_file="$3"
python3 - <<EOF
import random
n_lines = int(${n_lines})
seed = int(${seed})
nbytes = n_lines * 8 + 1024 * 1024
random.seed(seed)
with open(${out_file!r}, "wb") as f:
chunk = 1024 * 1024
remaining = nbytes
while remaining > 0:
m = min(chunk, remaining)
f.write(bytes(random.randrange(256) for _ in range(m)))
remaining -= m
EOF
}
# ============================
# Create output directory
# ============================
mkdir -p "$OUT_DIR"
# ============================
# Derive output prefix if needed
# ============================
if [[ -z "$OUT_PREFIX" ]]; then
OUT_PREFIX_BASE="simulated_$(strip_ext "$VIRAL_FA")__spiked_into__$(strip_ext "$HOST_FA")"\
"__tp${TOTAL_PAIRS}__vf${VIRAL_FRACTION}__vdiv${VIRAL_DIVERGENCE}__hdiv${HOST_SUB_RATE}"\
"__rl${READLEN}__err${ERROR_RATE}"
if [[ "$DETERMINISTIC_PREFIX" -eq 1 ]]; then
OUT_PREFIX="${OUT_PREFIX_BASE}"
else
OUT_PREFIX="${OUT_PREFIX_BASE}__$(date +'%Y%m%d%H%M')"
fi
fi
OUT_R1="${OUT_DIR%/}/${OUT_PREFIX}_R1.fq"
OUT_R2="${OUT_DIR%/}/${OUT_PREFIX}_R2.fq"
# ============================
# Python float math
# ============================
VIRAL_PAIRS="$(
python3 - <<EOF
total = ${TOTAL_PAIRS}
frac = ${VIRAL_FRACTION}
print(round(total * frac))
EOF
)"
HOST_PAIRS=$(( TOTAL_PAIRS - VIRAL_PAIRS ))
echo "Simulation plan:"
echo " Viral FASTA: $VIRAL_FA"
echo " Host FASTA: $HOST_FA"
echo " Viral pairs: $VIRAL_PAIRS (div=${VIRAL_DIVERGENCE})"
echo " Host pairs: $HOST_PAIRS (sub=${HOST_SUB_RATE})"
echo " Output dir: $OUT_DIR"
echo " Output prefix: $OUT_PREFIX"
echo " Parallel: $([[ "$NO_PARALLEL" -eq 1 ]] && echo no || echo yes)"
echo " Shuffle mode: $([[ "$USE_GNU_SHUF" -eq 1 ]] && echo "GNU shuf" || echo "awk Fisher–Yates")"
echo
TMPDIR="$(mktemp -d)"
cleanup() {
if [[ "$KEEP_TMP" -eq 1 ]]; then
return 0
fi
rm -rf "$TMPDIR"
}
trap cleanup EXIT
# ============================
# Viral + host reads (maybe parallel)
# ============================
run_maybe_bg pid_vwgsim \
wgsim \
-h \
-e "$ERROR_RATE" \
-r "$VIRAL_DIVERGENCE" \
-R "$VIRAL_INDEL_RATE" \
-X "$VIRAL_INDEL_RATE" \
-d "$INSERT_MEAN" \
-s "$INSERT_SD" \
-N "$VIRAL_PAIRS" \
-1 "$READLEN" \
-2 "$READLEN" \
-S "$RANDOM_SEED" \
"$VIRAL_FA" \
"$TMPDIR/viral_R1.fq" "$TMPDIR/viral_R2.fq"
run_maybe_bg pid_hwgsim \
wgsim \
-e "$ERROR_RATE" \
-r "$HOST_SUB_RATE" \
-R "$HOST_INDEL_RATE" \
-X "$HOST_INDEL_RATE" \
-d "$INSERT_MEAN" \
-s "$INSERT_SD" \
-N "$HOST_PAIRS" \
-1 "$READLEN" \
-2 "$READLEN" \
-S "$((RANDOM_SEED+1))" \
"$HOST_FA" \
"$TMPDIR/host_R1.fq" "$TMPDIR/host_R2.fq"
wait_pids "wgsim simulation" "${pid_vwgsim:-}" "${pid_hwgsim:-}"
# ============================
# Re-header FASTQs (maybe parallel)
# ============================
run_maybe_bg pid_vr1 tag_fastq_headers "$VIRAL_TAG" "$TMPDIR/viral_R1.fq" "$TMPDIR/viral_R1.tagged.fq"
run_maybe_bg pid_vr2 tag_fastq_headers "$VIRAL_TAG" "$TMPDIR/viral_R2.fq" "$TMPDIR/viral_R2.tagged.fq"
run_maybe_bg pid_hr1 tag_fastq_headers "$HOST_TAG" "$TMPDIR/host_R1.fq" "$TMPDIR/host_R1.tagged.fq"
run_maybe_bg pid_hr2 tag_fastq_headers "$HOST_TAG" "$TMPDIR/host_R2.fq" "$TMPDIR/host_R2.tagged.fq"
wait_pids "FASTQ re-headering" "${pid_vr1:-}" "${pid_vr2:-}" "${pid_hr1:-}" "${pid_hr2:-}"
mv "$TMPDIR/viral_R1.tagged.fq" "$TMPDIR/viral_R1.fq"
mv "$TMPDIR/viral_R2.tagged.fq" "$TMPDIR/viral_R2.fq"
mv "$TMPDIR/host_R1.tagged.fq" "$TMPDIR/host_R1.fq"
mv "$TMPDIR/host_R2.tagged.fq" "$TMPDIR/host_R2.fq"
assert_first_line_prefix "$TMPDIR/viral_R1.fq" "@${VIRAL_TAG}|"
assert_first_line_prefix "$TMPDIR/viral_R2.fq" "@${VIRAL_TAG}|"
assert_first_line_prefix "$TMPDIR/host_R1.fq" "@${HOST_TAG}|"
assert_first_line_prefix "$TMPDIR/host_R2.fq" "@${HOST_TAG}|"
# ============================
# Mix host + virus (maybe parallel)
# ============================
run_maybe_bg pid_cat1 bash -c "cat '$TMPDIR/host_R1.fq' '$TMPDIR/viral_R1.fq' > '$TMPDIR/mix_R1.fq'"
run_maybe_bg pid_cat2 bash -c "cat '$TMPDIR/host_R2.fq' '$TMPDIR/viral_R2.fq' > '$TMPDIR/mix_R2.fq'"
wait_pids "FASTQ concatenation" "${pid_cat1:-}" "${pid_cat2:-}"
# ============================
# Shuffle paired records
# ============================
paste -d '\t' \
<(paste - - - - < "$TMPDIR/mix_R1.fq") \
<(paste - - - - < "$TMPDIR/mix_R2.fq") \
> "$TMPDIR/paired.tsv"
if [[ "$USE_GNU_SHUF" -eq 1 ]]; then
make_shuf_random_source "$TOTAL_PAIRS" "$RANDOM_SEED" "$TMPDIR/shuf.random.bin"
shuf --random-source="$TMPDIR/shuf.random.bin" "$TMPDIR/paired.tsv" > "$TMPDIR/shuf.tsv"
else
awk -v seed="$RANDOM_SEED" '
BEGIN { srand(seed) }
{ a[NR]=$0 }
END {
for(i=NR;i>1;i--){
j=int(rand()*i)+1
tmp=a[i]; a[i]=a[j]; a[j]=tmp
}
for(i=1;i<=NR;i++) print a[i]
}
' "$TMPDIR/paired.tsv" > "$TMPDIR/shuf.tsv"
fi
# ============================
# Split TSV back to FASTQ (maybe parallel)
# ============================
run_maybe_bg pid_out1 bash -c "cut -f1-4 '$TMPDIR/shuf.tsv' | tr '\t' '\n' > '$OUT_R1'"
run_maybe_bg pid_out2 bash -c "cut -f5-8 '$TMPDIR/shuf.tsv' | tr '\t' '\n' > '$OUT_R2'"
wait_pids "R1/R2 splitting" "${pid_out1:-}" "${pid_out2:-}"
run_maybe_bg pid_gzip_out1 gzip -9 -f "$OUT_R1"
run_maybe_bg pid_gzip_out2 gzip -9 -f "$OUT_R2"
wait_pids "compressing output" "${pid_gzip_out1:-}" "${pid_gzip_out2:-}"
echo "Done:"
echo " ${OUT_R1}.gz"
echo " ${OUT_R2}.gz"
if [[ "$KEEP_TMP" -eq 1 ]]; then
echo "Temp directory preserved:"
echo " $TMPDIR"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment