Skip to content

Instantly share code, notes, and snippets.

@chmouel
Created May 26, 2026 13:14
Show Gist options
  • Select an option

  • Save chmouel/ad15c891e011c88feed344b6fa0b7a3f to your computer and use it in GitHub Desktop.

Select an option

Save chmouel/ad15c891e011c88feed344b6fa0b7a3f to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
#
# Reproduce the resolver goroutine leak fixed by tektoncd/pipeline#10098.
#
# The default mode creates many PipelineRuns whose TaskRef is resolved by the
# git resolver against an in-cluster HTTP server that accepts requests and never
# replies. With a short git resolver fetch-timeout, the resolver reconciler's
# timeout case wins, then the worker goroutine blocks forever while sending on
# an unbuffered result channel in vulnerable builds.
#
# This is intended for disposable Tekton test clusters.
#
# Handoff note:
# This harness reproduces the resolver goroutine leak fixed by upstream
# Tekton PR #10098. It drives many PipelineRuns through remote resolution,
# enables resolver pprof, samples goroutine profiles, and writes goroutine
# counts over time.
#
# Deterministic default repro:
#
# hack/reproduce-resolver-goroutine-leak.sh \
# --total 3000 \
# --batch-size 200 \
# --sample-seconds 900
#
# The default mode uses the git resolver against an in-cluster HTTP endpoint
# that accepts connections but never replies. The script lowers the git resolver
# timeout, so the resolver reconciler hits the timeout/cancel path reliably.
#
# Cluster resolver / API latency repro:
#
# MODE=cluster hack/reproduce-resolver-goroutine-leak.sh \
# --total 2000 \
# --chaos-mesh-apiserver-delay 75s
#
# Cluster mode creates many PipelineRuns with taskRef.resolver: cluster. When
# --chaos-mesh-apiserver-delay is given and Chaos Mesh is not installed, the
# script installs it with Helm using OpenShift defaults for CRI-O. It then
# injects latency on traffic from the resolver pods to kubernetes.default.svc,
# simulating a slow Kubernetes API path.
#
# Expected validation output:
#
# resolver-leak-pprof-*/goroutine-counts.tsv
# resolver-leak-pprof-*/goroutine-*.txt
#
# On a vulnerable build, goroutine counts should climb roughly linearly and not
# recover. Profiles should show blocked channel sends around the resolver
# reconciler. On a fixed build, counts should plateau after the workload drains.
set -euo pipefail

# Every setting below keeps a value already present in the environment and
# otherwise falls back to the default shown. Command-line flags parsed later
# may overwrite them.

# Tooling and workload selection.
: "${KUBECTL:=kubectl}"
: "${MODE:=git-timeout}"

# Namespaces and the resolver deployment under test.
: "${LOAD_NAMESPACE:=resolver-leak-repro}"
: "${TEKTON_NAMESPACE:=tekton-pipelines}"
: "${RESOLVERS_NAMESPACE:=tekton-pipelines-resolvers}"
: "${RESOLVERS_DEPLOYMENT:=tekton-pipelines-remote-resolvers}"

# Load generation.
: "${TOTAL:=1200}"
: "${BATCH_SIZE:=100}"
: "${BATCH_SLEEP_SECONDS:=1}"
: "${GIT_TIMEOUT:=750ms}"

# pprof sampling.
: "${SAMPLE_SECONDS:=600}"
: "${SAMPLE_INTERVAL_SECONDS:=30}"
: "${PPROF_LOCAL_PORT:=18008}"
: "${PPROF_REMOTE_PORT:=8008}"
: "${OUT_DIR:=resolver-leak-pprof-$(date +%Y%m%d-%H%M%S)}"
: "${CLEANUP:=false}"

# Chaos Mesh (optional API-server latency injection).
: "${CHAOS_MESH_APISERVER_DELAY:=}"
: "${CHAOS_MESH_APISERVER_TARGET:=kubernetes.default.svc}"
: "${INSTALL_CHAOS_MESH:=false}"
: "${CHAOS_MESH_NAMESPACE:=chaos-mesh}"
: "${CHAOS_MESH_RELEASE:=chaos-mesh}"
: "${CHAOS_MESH_VERSION:=}"
: "${CHAOS_MESH_RUNTIME:=crio}"
: "${CHAOS_MESH_SOCKET_PATH:=/var/run/crio/crio.sock}"
# Print the command-line help to stdout. Defaults shown in the text are the
# current values of the corresponding settings at the time of the call.
usage() {
  local -a help_lines=(
    "Usage: $0 [options]"
    "Options:"
    "--mode git-timeout|cluster Workload mode. Default: ${MODE}"
    "--namespace NAME Namespace for generated workload. Default: ${LOAD_NAMESPACE}"
    "--total N Total PipelineRuns to create. Default: ${TOTAL}"
    "--batch-size N PipelineRuns per kubectl apply batch. Default: ${BATCH_SIZE}"
    "--batch-sleep-seconds N Sleep between batches. Default: ${BATCH_SLEEP_SECONDS}"
    "--git-timeout DURATION git-resolver fetch-timeout. Default: ${GIT_TIMEOUT}"
    "--sample-seconds N pprof sampling window. Default: ${SAMPLE_SECONDS}"
    "--sample-interval-seconds N pprof sampling interval. Default: ${SAMPLE_INTERVAL_SECONDS}"
    "--out-dir DIR Directory for pprof samples. Default: ${OUT_DIR}"
    "--install-chaos-mesh Force install/upgrade Chaos Mesh with Helm before applying chaos."
    "--chaos-mesh-apiserver-delay DURATION Apply Chaos Mesh egress delay; installs Chaos Mesh if missing."
    "--chaos-mesh-version VERSION Install a specific Chaos Mesh chart version."
    "--chaos-mesh-runtime RUNTIME Runtime passed to Helm. Default: ${CHAOS_MESH_RUNTIME}"
    "--chaos-mesh-socket-path PATH Runtime socket passed to Helm. Default: ${CHAOS_MESH_SOCKET_PATH}"
    "--cleanup Delete the workload namespace before exit."
    "-h, --help Show this help."
    "Environment overrides:"
    "KUBECTL, TEKTON_NAMESPACE, RESOLVERS_NAMESPACE, RESOLVERS_DEPLOYMENT,"
    "PPROF_LOCAL_PORT, PPROF_REMOTE_PORT, CHAOS_MESH_APISERVER_TARGET,"
    "CHAOS_MESH_NAMESPACE, CHAOS_MESH_RELEASE."
    "Examples:"
    "$0 --total 3000 --batch-size 200 --sample-seconds 900"
    "MODE=cluster $0 --total 2000 --chaos-mesh-apiserver-delay 75s"
  )
  printf '%s\n' "${help_lines[@]}"
}
# Parse command-line flags. Each flag that takes a value is mapped to the
# setting it overrides; boolean flags are applied directly. A value flag given
# without its value is reported as a usage error (exit 2) rather than tripping
# `set -u` on "$2" with an opaque "unbound variable" message.
while [[ $# -gt 0 ]]; do
  value_var=""
  case "$1" in
    --mode) value_var="MODE" ;;
    --namespace) value_var="LOAD_NAMESPACE" ;;
    --total) value_var="TOTAL" ;;
    --batch-size) value_var="BATCH_SIZE" ;;
    --batch-sleep-seconds) value_var="BATCH_SLEEP_SECONDS" ;;
    --git-timeout) value_var="GIT_TIMEOUT" ;;
    --sample-seconds) value_var="SAMPLE_SECONDS" ;;
    --sample-interval-seconds) value_var="SAMPLE_INTERVAL_SECONDS" ;;
    --out-dir) value_var="OUT_DIR" ;;
    --chaos-mesh-apiserver-delay) value_var="CHAOS_MESH_APISERVER_DELAY" ;;
    --chaos-mesh-version) value_var="CHAOS_MESH_VERSION" ;;
    --chaos-mesh-runtime) value_var="CHAOS_MESH_RUNTIME" ;;
    --chaos-mesh-socket-path) value_var="CHAOS_MESH_SOCKET_PATH" ;;
    --install-chaos-mesh)
      INSTALL_CHAOS_MESH="true"
      ;;
    --cleanup)
      CLEANUP="true"
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac

  # Boolean flag: nothing more to consume.
  if [[ -z "${value_var}" ]]; then
    shift
    continue
  fi

  if [[ $# -lt 2 ]]; then
    echo "option $1 requires a value" >&2
    usage >&2
    exit 2
  fi
  printf -v "${value_var}" '%s' "$2"
  shift 2
done
unset value_var

# Only the two implemented workload modes are accepted.
if [[ "${MODE}" != "git-timeout" && "${MODE}" != "cluster" ]]; then
  echo "--mode must be git-timeout or cluster" >&2
  exit 2
fi
# Abort the script (exit 1) unless the given command is available on PATH.
# Arguments: $1 - command name to look up
need() {
  local tool="$1"
  command -v "${tool}" >/dev/null 2>&1 && return 0
  printf '%s\n' "required command not found: ${tool}" >&2
  exit 1
}
# Write one timestamped line to stdout: "[<local time with UTC offset>] <args>".
# Arguments: all arguments are joined with spaces to form the message.
log() {
  local stamp
  # -1 asks printf's %(...)T for the current time.
  printf -v stamp '%(%Y-%m-%dT%H:%M:%S%z)T' -1
  printf '[%s] %s\n' "${stamp}" "$*"
}
# Merge-patch a ConfigMap if it exists; otherwise log that it was skipped.
# Arguments:
#   $1 - namespace of the ConfigMap
#   $2 - ConfigMap name
#   $3 - JSON merge patch body
patch_cm_data() {
  local ns="$1" cm="$2" body="$3"

  if ! "${KUBECTL}" -n "${ns}" get configmap "${cm}" >/dev/null 2>&1; then
    log "configmap ${ns}/${cm} not found; skipping patch"
    return
  fi
  "${KUBECTL}" -n "${ns}" patch configmap "${cm}" --type merge -p "${body}" >/dev/null
}
# Succeed when the cluster serves the security.openshift.io
# securitycontextconstraints resource, which this script treats as the marker
# for OpenShift.
# Returns: 0 if the resource is listed, non-zero otherwise (including when the
#          kubectl query itself fails).
is_openshift() {
  local api_resources
  # Capture the listing before matching. Piping straight into `grep -q` under
  # `set -o pipefail` can report failure when grep exits on the first match and
  # the producer is killed by SIGPIPE.
  api_resources="$("${KUBECTL}" api-resources --api-group=security.openshift.io 2>/dev/null)" || return 1
  grep -q '^securitycontextconstraints' <<<"${api_resources}"
}
# Succeed when the Chaos Mesh NetworkChaos CRD can be fetched from the cluster.
# All kubectl output is discarded; only the exit status is used.
chaos_mesh_installed() {
  local crd="networkchaos.chaos-mesh.org"
  "${KUBECTL}" get crd "${crd}" &>/dev/null
}
# Install or upgrade Chaos Mesh with Helm and wait for it to become ready.
# Globals (read): CHAOS_MESH_RELEASE, CHAOS_MESH_NAMESPACE, CHAOS_MESH_RUNTIME,
#                 CHAOS_MESH_SOCKET_PATH, CHAOS_MESH_VERSION, KUBECTL
# Requires: helm on PATH (checked via need). On OpenShift, `oc` is used when
#           available to grant the privileged SCC to the chaos-daemon service
#           account.
install_chaos_mesh() {
  need helm

  # Build the whole helm command line in one array. The array is never empty,
  # so expanding it is safe under `set -u` on bash releases older than 4.4,
  # where "${empty_array[@]}" is treated as an unbound variable.
  local -a helm_args=(
    upgrade --install "${CHAOS_MESH_RELEASE}" chaos-mesh/chaos-mesh
    --namespace "${CHAOS_MESH_NAMESPACE}"
    --create-namespace
    --set "chaosDaemon.runtime=${CHAOS_MESH_RUNTIME}"
    --set "chaosDaemon.socketPath=${CHAOS_MESH_SOCKET_PATH}"
  )
  if [[ -n "${CHAOS_MESH_VERSION}" ]]; then
    helm_args+=(--version "${CHAOS_MESH_VERSION}")
  fi

  log "installing/upgrading Chaos Mesh release ${CHAOS_MESH_RELEASE} in namespace ${CHAOS_MESH_NAMESPACE}"
  helm repo add chaos-mesh https://charts.chaos-mesh.org >/dev/null
  helm repo update chaos-mesh >/dev/null
  helm "${helm_args[@]}"

  if is_openshift; then
    if command -v oc >/dev/null 2>&1; then
      log "granting OpenShift privileged SCC to Chaos Mesh chaos-daemon service account"
      oc adm policy add-scc-to-user privileged -n "${CHAOS_MESH_NAMESPACE}" -z chaos-daemon >/dev/null
    else
      log "OpenShift detected, but oc is not available; grant privileged SCC manually if chaos-daemon is blocked"
    fi
  fi

  log "waiting for Chaos Mesh controllers"
  "${KUBECTL}" -n "${CHAOS_MESH_NAMESPACE}" rollout status deployment/chaos-controller-manager --timeout=180s
  "${KUBECTL}" -n "${CHAOS_MESH_NAMESPACE}" rollout status daemonset/chaos-daemon --timeout=180s
  "${KUBECTL}" wait --for=condition=Established crd/networkchaos.chaos-mesh.org --timeout=60s
}
# EXIT-trap handler: stop the pprof port-forward if one was started and, when
# CLEANUP is "true", remove the NetworkChaos object and the workload namespace.
cleanup() {
  local forwarder="${PORT_FORWARD_PID:-}"

  # Best effort: the port-forward may already be gone.
  [[ -z "${forwarder}" ]] || kill "${forwarder}" >/dev/null 2>&1 || true

  [[ "${CLEANUP}" == "true" ]] || return 0

  # The NetworkChaos kind may not exist on this cluster, so failures here are
  # deliberately ignored.
  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" delete networkchaos resolver-api-delay \
    --ignore-not-found >/dev/null 2>&1 || true
  "${KUBECTL}" delete namespace "${LOAD_NAMESPACE}" --ignore-not-found >/dev/null
}
# From here on, any exit path runs cleanup.
trap cleanup EXIT

# Verify the external tools the script shells out to.
for required_tool in curl awk "${KUBECTL}"; do
  need "${required_tool}"
done
unset required_tool

mkdir -p "${OUT_DIR}"
log "using context: $("${KUBECTL}" config current-context)"
log "writing pprof samples to ${OUT_DIR}"

# Install Chaos Mesh when explicitly requested, or when an API-server delay was
# requested and the NetworkChaos CRD is not present yet.
case "${INSTALL_CHAOS_MESH}" in
  true)
    install_chaos_mesh
    ;;
  *)
    if [[ -n "${CHAOS_MESH_APISERVER_DELAY}" ]] && ! chaos_mesh_installed; then
      log "Chaos Mesh NetworkChaos CRD is missing; installing Chaos Mesh with sane OpenShift defaults"
      install_chaos_mesh
    fi
    ;;
esac
# Turn on runtime profiling for the resolvers and make sure the git and cluster
# resolvers plus beta API fields are enabled. Missing ConfigMaps are skipped by
# patch_cm_data.
log "enabling resolver profiling and resolver feature flags"
patch_cm_data "${RESOLVERS_NAMESPACE}" config-observability \
  '{"data":{"runtime-profiling":"enabled","metrics-protocol":"prometheus"}}'
patch_cm_data "${RESOLVERS_NAMESPACE}" resolvers-feature-flags \
  '{"data":{"enable-git-resolver":"true","enable-cluster-resolver":"true"}}'
patch_cm_data "${TEKTON_NAMESPACE}" feature-flags \
  '{"data":{"enable-api-fields":"beta"}}'

# Only the git-timeout workload needs the shortened fetch timeout.
case "${MODE}" in
  git-timeout)
    log "setting git resolver fetch-timeout=${GIT_TIMEOUT}"
    printf -v git_resolver_patch \
      '{"data":{"fetch-timeout":"%s","default-cache-mode":"never"}}' "${GIT_TIMEOUT}"
    patch_cm_data "${RESOLVERS_NAMESPACE}" git-resolver-config "${git_resolver_patch}"
    unset git_resolver_patch
    ;;
esac
# Optionally delay egress traffic from the resolver pods to the API server
# target using a Chaos Mesh NetworkChaos object. The manifest below must keep
# its YAML nesting: flattened, every key would land at the top level and the
# object would be rejected.
if [[ -n "${CHAOS_MESH_APISERVER_DELAY}" ]]; then
  if ! chaos_mesh_installed; then
    echo "Chaos Mesh NetworkChaos CRD is still missing after install attempt; cannot apply API server delay" >&2
    exit 1
  fi
  log "applying Chaos Mesh API egress delay ${CHAOS_MESH_APISERVER_DELAY} to ${CHAOS_MESH_APISERVER_TARGET}"
  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" apply -f - <<YAML >/dev/null
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: resolver-api-delay
spec:
  action: delay
  mode: all
  selector:
    namespaces:
      - ${RESOLVERS_NAMESPACE}
    labelSelectors:
      app: tekton-pipelines-resolvers
  direction: to
  externalTargets:
    - ${CHAOS_MESH_APISERVER_TARGET}
  delay:
    latency: "${CHAOS_MESH_APISERVER_DELAY}"
    correlation: "100"
    jitter: "0ms"
YAML
fi
# The resolver deployment must exist; without it there is nothing to profile.
if ! "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" get deployment "${RESOLVERS_DEPLOYMENT}" >/dev/null 2>&1; then
  echo "resolver deployment not found: ${RESOLVERS_NAMESPACE}/${RESOLVERS_DEPLOYMENT}" >&2
  exit 1
fi

# Restart and wait for the rollout so the ConfigMap changes are picked up.
log "restarting ${RESOLVERS_NAMESPACE}/${RESOLVERS_DEPLOYMENT} so profiling/config changes are definitely active"
"${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" rollout restart deployment "${RESOLVERS_DEPLOYMENT}" >/dev/null
"${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" rollout status deployment "${RESOLVERS_DEPLOYMENT}" --timeout=180s
# Create (or reuse) the workload namespace. Rendering with --dry-run and piping
# into apply keeps this step idempotent across re-runs.
log "creating workload namespace ${LOAD_NAMESPACE}"
"${KUBECTL}" create namespace "${LOAD_NAMESPACE}" --dry-run=client -o yaml \
  | "${KUBECTL}" apply -f - >/dev/null

# Deploy the per-mode fixture the PipelineRuns will resolve against. The
# manifests (and the Python program embedded in the Deployment) depend on their
# indentation, so the here-docs must keep the nesting shown.
if [[ "${MODE}" == "git-timeout" ]]; then
  # HTTP server that accepts GET/POST requests and then sleeps for an hour
  # without answering, so resolver fetches run into their timeout.
  log "deploying hanging HTTP endpoint used by git resolver"
  "${KUBECTL}" -n "${LOAD_NAMESPACE}" apply -f - <<'YAML' >/dev/null
apiVersion: apps/v1
kind: Deployment
metadata:
  name: hang-git
spec:
  replicas: 1
  selector:
    matchLabels:
      app: hang-git
  template:
    metadata:
      labels:
        app: hang-git
    spec:
      containers:
        - name: server
          image: registry.access.redhat.com/ubi9/python-311:latest
          command:
            - python3
            - -c
            - |
              from http.server import ThreadingHTTPServer, BaseHTTPRequestHandler
              import time
              class Handler(BaseHTTPRequestHandler):
                  def do_GET(self):
                      time.sleep(3600)
                  def do_POST(self):
                      time.sleep(3600)
                  def log_message(self, fmt, *args):
                      pass
              ThreadingHTTPServer(("", 8080), Handler).serve_forever()
          ports:
            - containerPort: 8080
---
apiVersion: v1
kind: Service
metadata:
  name: hang-git
spec:
  selector:
    app: hang-git
  ports:
    - name: http
      port: 8080
      targetPort: 8080
YAML
  "${KUBECTL}" -n "${LOAD_NAMESPACE}" rollout status deployment hang-git --timeout=180s
elif [[ "${MODE}" == "cluster" ]]; then
  # Trivial Task that the cluster resolver is asked to look up.
  log "creating cluster resolver target Task"
  "${KUBECTL}" -n "${LOAD_NAMESPACE}" apply -f - <<'YAML' >/dev/null
apiVersion: tekton.dev/v1
kind: Task
metadata:
  name: leak-repro-noop
spec:
  steps:
    - name: nop
      image: registry.access.redhat.com/ubi9/ubi-minimal:latest
      script: |
        #!/usr/bin/env sh
        true
YAML
  if [[ -z "${CHAOS_MESH_APISERVER_DELAY}" ]]; then
    log "cluster mode requires external API latency injection; without it, requests usually resolve before timeout"
  fi
fi
# Start a background kubectl port-forward to the resolver deployment's pprof
# port and wait until the goroutine endpoint answers.
# Globals (written): PORT_FORWARD_PID - PID of the background port-forward
# Outputs: port-forward stdout/stderr go to ${OUT_DIR}/port-forward.log
# Exits the script with status 1 if the endpoint is still unreachable after
# 30 probes spaced one second apart.
start_port_forward() {
  local probe_url="http://127.0.0.1:${PPROF_LOCAL_PORT}/debug/pprof/goroutine?debug=1"
  local forward_log="${OUT_DIR}/port-forward.log"
  local attempt

  log "starting pprof port-forward localhost:${PPROF_LOCAL_PORT} -> ${RESOLVERS_DEPLOYMENT}:${PPROF_REMOTE_PORT}"
  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" port-forward "deployment/${RESOLVERS_DEPLOYMENT}" \
    "${PPROF_LOCAL_PORT}:${PPROF_REMOTE_PORT}" >"${forward_log}" 2>&1 &
  PORT_FORWARD_PID="$!"

  for ((attempt = 1; attempt <= 30; attempt++)); do
    curl -fsS "${probe_url}" >/dev/null 2>&1 && return 0
    sleep 1
  done

  echo "pprof endpoint did not become reachable; see ${OUT_DIR}/port-forward.log" >&2
  exit 1
}
# Periodically download the resolver's goroutine profile and record the total
# goroutine count.
# Globals (read): OUT_DIR, PPROF_LOCAL_PORT, SAMPLE_SECONDS,
#                 SAMPLE_INTERVAL_SECONDS
# Outputs: ${OUT_DIR}/goroutine-<elapsed>s.txt per successful sample, and
#          ${OUT_DIR}/goroutine-counts.tsv (also echoed to stdout) with one
#          "elapsed<TAB>count" row per sample.
# Returns: 1 if SAMPLE_INTERVAL_SECONDS is not positive, 0 otherwise.
#
# A sample that cannot be fetched is recorded as "unknown" and sampling
# continues, so one dropped port-forward connection does not discard the rest
# of the observation window.
sample_goroutines() {
  local elapsed=0
  local sample_file="${OUT_DIR}/goroutine-counts.tsv"
  local url="http://127.0.0.1:${PPROF_LOCAL_PORT}/debug/pprof/goroutine?debug=1"
  local profile count

  # A zero or negative interval would never advance "elapsed".
  if ! ((SAMPLE_INTERVAL_SECONDS > 0)); then
    echo "sample interval must be a positive number of seconds: ${SAMPLE_INTERVAL_SECONDS}" >&2
    return 1
  fi

  printf 'elapsed_seconds\tgoroutines\n' >"${sample_file}"
  while ((elapsed <= SAMPLE_SECONDS)); do
    profile="${OUT_DIR}/goroutine-${elapsed}s.txt"
    # Reset per sample so a failed fetch never reports a stale number.
    count=""
    if curl -fsS "${url}" -o "${profile}"; then
      # The debug=1 profile starts with "goroutine profile: total N".
      count="$(awk '/^goroutine profile: total / {print $4; exit}' "${profile}")" || count=""
    else
      echo "goroutine sample at ${elapsed}s failed; recording unknown" >&2
    fi
    printf '%s\t%s\n' "${elapsed}" "${count:-unknown}" | tee -a "${sample_file}"

    elapsed=$((elapsed + SAMPLE_INTERVAL_SECONDS))
    # No need to wait after the final sample.
    if ((elapsed <= SAMPLE_SECONDS)); then
      sleep "${SAMPLE_INTERVAL_SECONDS}"
    fi
  done
}
# Create PipelineRuns numbered start..end (inclusive) with one kubectl call.
# Globals (read): MODE, LOAD_NAMESPACE, KUBECTL
# Arguments:
#   $1 - first sequence number of the batch
#   $2 - last sequence number of the batch
#
# The objects use metadata.generateName, which `kubectl apply` rejects
# ("cannot use generate name with apply"), so the batch is submitted with
# `kubectl create`. The manifests must keep their YAML nesting.
create_batch() {
  local start="$1"
  local end="$2"
  local i

  # Nothing to submit for an empty range.
  ((start <= end)) || return 0

  {
    for ((i = start; i <= end; i++)); do
      if [[ "${MODE}" == "git-timeout" ]]; then
        cat <<YAML
apiVersion: tekton.dev/v1
kind: PipelineRun
metadata:
  generateName: leak-repro-git-${i}-
  namespace: ${LOAD_NAMESPACE}
  labels:
    leak-repro.tekton.dev/run: "true"
    leak-repro.tekton.dev/mode: git-timeout
spec:
  pipelineSpec:
    tasks:
      - name: resolve-slow-git-task
        taskRef:
          resolver: git
          params:
            - name: url
              value: http://hang-git.${LOAD_NAMESPACE}.svc.cluster.local:8080/repo.git
            - name: revision
              value: main
            - name: pathInRepo
              value: task/noop/0.1/noop.yaml
            - name: cache
              value: never
---
YAML
      else
        cat <<YAML
apiVersion: tekton.dev/v1
kind: PipelineRun
metadata:
  generateName: leak-repro-cluster-${i}-
  namespace: ${LOAD_NAMESPACE}
  labels:
    leak-repro.tekton.dev/run: "true"
    leak-repro.tekton.dev/mode: cluster
spec:
  pipelineSpec:
    tasks:
      - name: resolve-cluster-task
        taskRef:
          resolver: cluster
          params:
            - name: kind
              value: task
            - name: namespace
              value: ${LOAD_NAMESPACE}
            - name: name
              value: leak-repro-noop
            - name: cache
              value: never
---
YAML
      fi
    done
  } | "${KUBECTL}" create -f - >/dev/null
}
# Open the pprof tunnel first so sampling can begin as soon as load is applied.
start_port_forward

# Submit the PipelineRuns in fixed-size batches, pausing between batches; the
# final batch is clamped to TOTAL.
log "creating ${TOTAL} PipelineRuns in batches of ${BATCH_SIZE}"
created=0
until ((created >= TOTAL)); do
  start=$((created + 1))
  end=$((created + BATCH_SIZE))
  ((end <= TOTAL)) || end="${TOTAL}"

  create_batch "${start}" "${end}"
  created="${end}"
  log "created ${created}/${TOTAL}"
  sleep "${BATCH_SLEEP_SECONDS}"
done

# Observe the resolver's goroutine count over the sampling window.
log "sampling goroutine profile for ${SAMPLE_SECONDS}s"
sample_goroutines

log "done. Goroutine counts: ${OUT_DIR}/goroutine-counts.tsv"
log "inspect blocked senders with: grep -R \"chan send\" ${OUT_DIR}/goroutine-*.txt | head"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment