chmouel · May 26, 2026 13:14
diff --git a/reproduce-resolver-goroutine-leak.sh b/reproduce-resolver-goroutine-leak.sh
 #!/usr/bin/env bash
 #
 # Reproduce the resolver goroutine leak fixed by tektoncd/pipeline#10098.
 #
 # The default mode creates many PipelineRuns whose TaskRef is resolved by the
 # git resolver against an in-cluster HTTP server that accepts requests and never
 # replies. With a short git resolver fetch-timeout, the resolver reconciler's
 # timeout case wins, then the worker goroutine blocks forever while sending on
 # an unbuffered result channel in vulnerable builds.
 #
 # This is intended for disposable Tekton test clusters.
 #
 # Handoff note:
 # This harness reproduces the resolver goroutine leak fixed by upstream
 # Tekton PR #10098. It drives many PipelineRuns through remote resolution,
 # enables resolver pprof, samples goroutine profiles, and writes goroutine
 # counts over time.
 #
 # Deterministic default repro:
 #
 #   hack/reproduce-resolver-goroutine-leak.sh \
 #     --total 3000 \
 #     --batch-size 200 \
 #     --sample-seconds 900
 #
 # The default mode uses the git resolver against an in-cluster HTTP endpoint
 # that accepts connections but never replies. The script lowers the git resolver
 # timeout, so the resolver reconciler hits the timeout/cancel path reliably.
 #
 # Cluster resolver / API latency repro:
 #
 #   MODE=cluster hack/reproduce-resolver-goroutine-leak.sh \
 #     --total 2000 \
 #     --chaos-mesh-apiserver-delay 75s
 #
 # Cluster mode creates many PipelineRuns with taskRef.resolver: cluster. If
 # Chaos Mesh is not installed, the script installs it with Helm using OpenShift
 # defaults for CRI-O. It then injects latency from the resolver pod to
 # kubernetes.default.svc, simulating a slow Kubernetes API path.
 #
 # Expected validation output:
 #
 #   resolver-leak-pprof-*/goroutine-counts.tsv
 #   resolver-leak-pprof-*/goroutine-*.txt
 #
 # On a vulnerable build, goroutine counts should climb roughly linearly and not
 # recover. Profiles should show blocked channel sends around the resolver
 # reconciler. On a fixed build, counts should plateau after the workload drains.

 # Strict mode: stop on command failures, unset variables and failed pipeline
 # stages.
 set -euo pipefail

 # Settings. A non-empty value inherited from the environment wins; otherwise
 # the default written here is assigned.

 # Client binary and workload mode.
 : "${KUBECTL:=kubectl}"
 : "${MODE:=git-timeout}"

 # Namespaces and the resolver deployment that gets restarted and profiled.
 : "${LOAD_NAMESPACE:=resolver-leak-repro}"
 : "${TEKTON_NAMESPACE:=tekton-pipelines}"
 : "${RESOLVERS_NAMESPACE:=tekton-pipelines-resolvers}"
 : "${RESOLVERS_DEPLOYMENT:=tekton-pipelines-remote-resolvers}"

 # Load shape.
 : "${TOTAL:=1200}"
 : "${BATCH_SIZE:=100}"
 : "${BATCH_SLEEP_SECONDS:=1}"
 : "${GIT_TIMEOUT:=750ms}"

 # pprof sampling and port-forwarding.
 : "${SAMPLE_SECONDS:=600}"
 : "${SAMPLE_INTERVAL_SECONDS:=30}"
 : "${PPROF_LOCAL_PORT:=18008}"
 : "${PPROF_REMOTE_PORT:=8008}"
 : "${OUT_DIR:=resolver-leak-pprof-$(date +%Y%m%d-%H%M%S)}"
 : "${CLEANUP:=false}"

 # Chaos Mesh installation and API latency injection.
 : "${CHAOS_MESH_APISERVER_DELAY:=}"
 : "${CHAOS_MESH_APISERVER_TARGET:=kubernetes.default.svc}"
 : "${INSTALL_CHAOS_MESH:=false}"
 : "${CHAOS_MESH_NAMESPACE:=chaos-mesh}"
 : "${CHAOS_MESH_RELEASE:=chaos-mesh}"
 : "${CHAOS_MESH_VERSION:=}"
 : "${CHAOS_MESH_RUNTIME:=crio}"
 : "${CHAOS_MESH_SOCKET_PATH:=/var/run/crio/crio.sock}"

 # Print the command-line help to stdout: the supported options, the
 # environment overrides and two example invocations. The here-document is
 # unquoted, so every "Default:" column shows the value the variable holds at
 # the moment usage is called, and $0 expands to the script path.
 usage() {
  cat <<EOF
 Usage: $0 [options]

 Options:
  --mode git-timeout|cluster            Workload mode. Default: ${MODE}
  --namespace NAME                      Namespace for generated workload. Default: ${LOAD_NAMESPACE}
  --total N                             Total PipelineRuns to create. Default: ${TOTAL}
  --batch-size N                        PipelineRuns per kubectl apply batch. Default: ${BATCH_SIZE}
  --batch-sleep-seconds N               Sleep between batches. Default: ${BATCH_SLEEP_SECONDS}
  --git-timeout DURATION                git-resolver fetch-timeout. Default: ${GIT_TIMEOUT}
  --sample-seconds N                    pprof sampling window. Default: ${SAMPLE_SECONDS}
  --sample-interval-seconds N           pprof sampling interval. Default: ${SAMPLE_INTERVAL_SECONDS}
  --out-dir DIR                         Directory for pprof samples. Default: ${OUT_DIR}
  --install-chaos-mesh                  Force install/upgrade Chaos Mesh with Helm before applying chaos.
  --chaos-mesh-apiserver-delay DURATION Apply Chaos Mesh egress delay; installs Chaos Mesh if missing.
  --chaos-mesh-version VERSION          Install a specific Chaos Mesh chart version.
  --chaos-mesh-runtime RUNTIME          Runtime passed to Helm. Default: ${CHAOS_MESH_RUNTIME}
  --chaos-mesh-socket-path PATH         Runtime socket passed to Helm. Default: ${CHAOS_MESH_SOCKET_PATH}
  --cleanup                             Delete the workload namespace before exit.
  -h, --help                            Show this help.

 Environment overrides:
  KUBECTL, TEKTON_NAMESPACE, RESOLVERS_NAMESPACE, RESOLVERS_DEPLOYMENT,
  PPROF_LOCAL_PORT, PPROF_REMOTE_PORT, CHAOS_MESH_APISERVER_TARGET,
  CHAOS_MESH_NAMESPACE, CHAOS_MESH_RELEASE.

 Examples:
  $0 --total 3000 --batch-size 200 --sample-seconds 900
  MODE=cluster $0 --total 2000 --chaos-mesh-apiserver-delay 75s
 EOF
 }

 # Parse command-line options. Each option overrides the variable of the same
 # purpose; unknown options print the usage text and exit with status 2.
 while [[ $# -gt 0 ]]; do
  # Options that consume a value: fail with a clear message instead of the
  # bare "unbound variable" abort that "$2" triggers under `set -u` when the
  # option is the last word on the command line.
  case "$1" in
    --mode | --namespace | --total | --batch-size | --batch-sleep-seconds | \
      --git-timeout | --sample-seconds | --sample-interval-seconds | \
      --out-dir | --chaos-mesh-apiserver-delay | --chaos-mesh-version | \
      --chaos-mesh-runtime | --chaos-mesh-socket-path)
      if [[ $# -lt 2 ]]; then
        echo "option $1 requires a value" >&2
        usage >&2
        exit 2
      fi
      ;;
  esac

  case "$1" in
    --mode)
      MODE="$2"
      shift 2
      ;;
    --namespace)
      LOAD_NAMESPACE="$2"
      shift 2
      ;;
    --total)
      TOTAL="$2"
      shift 2
      ;;
    --batch-size)
      BATCH_SIZE="$2"
      shift 2
      ;;
    --batch-sleep-seconds)
      BATCH_SLEEP_SECONDS="$2"
      shift 2
      ;;
    --git-timeout)
      GIT_TIMEOUT="$2"
      shift 2
      ;;
    --sample-seconds)
      SAMPLE_SECONDS="$2"
      shift 2
      ;;
    --sample-interval-seconds)
      SAMPLE_INTERVAL_SECONDS="$2"
      shift 2
      ;;
    --out-dir)
      OUT_DIR="$2"
      shift 2
      ;;
    --install-chaos-mesh)
      INSTALL_CHAOS_MESH="true"
      shift
      ;;
    --chaos-mesh-apiserver-delay)
      CHAOS_MESH_APISERVER_DELAY="$2"
      shift 2
      ;;
    --chaos-mesh-version)
      CHAOS_MESH_VERSION="$2"
      shift 2
      ;;
    --chaos-mesh-runtime)
      CHAOS_MESH_RUNTIME="$2"
      shift 2
      ;;
    --chaos-mesh-socket-path)
      CHAOS_MESH_SOCKET_PATH="$2"
      shift 2
      ;;
    --cleanup)
      CLEANUP="true"
      shift
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "unknown argument: $1" >&2
      usage >&2
      exit 2
      ;;
  esac
 done

 if [[ "${MODE}" != "git-timeout" && "${MODE}" != "cluster" ]]; then
  echo "--mode must be git-timeout or cluster" >&2
  exit 2
 fi

 # check_int FLAG VALUE MINIMUM: exit 2 unless VALUE is a plain decimal integer
 # (no sign, no leading zeros) that is at least MINIMUM.
 check_int() {
  local int_re='^(0|[1-9][0-9]*)$'
  if [[ ! "$2" =~ ${int_re} ]] || [[ "$2" -lt "$3" ]]; then
    echo "$1 must be an integer >= $3 (got '$2')" >&2
    exit 2
  fi
 }

 # These values feed integer comparisons and shell arithmetic later on. A zero
 # sampling interval would never advance the sampling loop, so it is rejected
 # here. BATCH_SLEEP_SECONDS is only handed to sleep and is left unchecked.
 check_int --total "${TOTAL}" 0
 check_int --batch-size "${BATCH_SIZE}" 1
 check_int --sample-seconds "${SAMPLE_SECONDS}" 0
 check_int --sample-interval-seconds "${SAMPLE_INTERVAL_SECONDS}" 1

 # need CMD: return 0 when `command -v` can resolve CMD; otherwise report the
 # missing command on stderr and terminate the script with status 1.
 need() {
  command -v "$1" >/dev/null 2>&1 && return 0
  echo "required command not found: $1" >&2
  exit 1
 }

 # log MESSAGE...: print "[<timestamp>] MESSAGE" to stdout, with the arguments
 # joined by spaces and the timestamp formatted as %Y-%m-%dT%H:%M:%S%z.
 log() {
  local stamp
  # printf's %(...)T conversion only exists in bash >= 4.2; older shells
  # (for example the bash 3.2 shipped with macOS) get the timestamp from date.
  if ((BASH_VERSINFO[0] > 4 || (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] >= 2))); then
    printf -v stamp '%(%Y-%m-%dT%H:%M:%S%z)T' -1
  else
    stamp="$(date '+%Y-%m-%dT%H:%M:%S%z')"
  fi
  printf '[%s] %s\n' "${stamp}" "$*"
 }

 # patch_cm_data NAMESPACE NAME PATCH: merge-patch ConfigMap NAMESPACE/NAME
 # with the JSON document PATCH. A ConfigMap that cannot be fetched is not an
 # error: the function logs that the patch was skipped and returns.
 patch_cm_data() {
  local cm_namespace="$1"
  local cm_name="$2"
  local merge_patch="$3"

  if ! "${KUBECTL}" -n "${cm_namespace}" get configmap "${cm_name}" >/dev/null 2>&1; then
    log "configmap ${cm_namespace}/${cm_name} not found; skipping patch"
    return
  fi

  "${KUBECTL}" -n "${cm_namespace}" patch configmap "${cm_name}" --type merge -p "${merge_patch}" >/dev/null
 }

 # is_openshift: succeed when the API group security.openshift.io lists a
 # resource whose name starts with "securitycontextconstraints"; fail when it
 # does not or when the kubectl query itself fails.
 is_openshift() {
  local resources
  # Capture the listing before matching. Piping straight into `grep -q` lets
  # grep exit on the first match, and under `set -o pipefail` a kubectl that
  # is then killed by SIGPIPE would turn a positive match into a failure.
  resources="$("${KUBECTL}" api-resources --api-group=security.openshift.io 2>/dev/null)" || return 1
  grep -q '^securitycontextconstraints' <<<"${resources}"
 }

 # chaos_mesh_installed: succeed when the CRD networkchaos.chaos-mesh.org can
 # be fetched from the cluster. All kubectl output is discarded; the return
 # status is kubectl's own exit status.
 chaos_mesh_installed() {
  local -a crd_query=(get crd networkchaos.chaos-mesh.org)
  "${KUBECTL}" "${crd_query[@]}" &>/dev/null
 }

 # install_chaos_mesh: install or upgrade the Chaos Mesh Helm release and wait
 # until its controller deployment, daemon set and NetworkChaos CRD are ready.
 # Requires helm. Reads CHAOS_MESH_RELEASE, CHAOS_MESH_NAMESPACE,
 # CHAOS_MESH_RUNTIME, CHAOS_MESH_SOCKET_PATH and, optionally,
 # CHAOS_MESH_VERSION. On OpenShift the chaos-daemon service account is granted
 # the privileged SCC when `oc` is available.
 install_chaos_mesh() {
  need helm

  # Build the whole argument list in one array that is never empty. Expanding
  # a separate, possibly empty "${version_args[@]}" aborts with "unbound
  # variable" under `set -u` on bash releases older than 4.4.
  local -a helm_args=(
    upgrade --install "${CHAOS_MESH_RELEASE}" chaos-mesh/chaos-mesh
    --namespace "${CHAOS_MESH_NAMESPACE}"
    --create-namespace
    --set "chaosDaemon.runtime=${CHAOS_MESH_RUNTIME}"
    --set "chaosDaemon.socketPath=${CHAOS_MESH_SOCKET_PATH}"
  )
  if [[ -n "${CHAOS_MESH_VERSION}" ]]; then
    helm_args+=(--version "${CHAOS_MESH_VERSION}")
  fi

  log "installing/upgrading Chaos Mesh release ${CHAOS_MESH_RELEASE} in namespace ${CHAOS_MESH_NAMESPACE}"
  helm repo add chaos-mesh https://charts.chaos-mesh.org >/dev/null
  helm repo update chaos-mesh >/dev/null
  helm "${helm_args[@]}"

  if is_openshift; then
    if command -v oc >/dev/null 2>&1; then
      log "granting OpenShift privileged SCC to Chaos Mesh chaos-daemon service account"
      oc adm policy add-scc-to-user privileged -n "${CHAOS_MESH_NAMESPACE}" -z chaos-daemon >/dev/null
    else
      log "OpenShift detected, but oc is not available; grant privileged SCC manually if chaos-daemon is blocked"
    fi
  fi

  log "waiting for Chaos Mesh controllers"
  "${KUBECTL}" -n "${CHAOS_MESH_NAMESPACE}" rollout status deployment/chaos-controller-manager --timeout=180s
  "${KUBECTL}" -n "${CHAOS_MESH_NAMESPACE}" rollout status daemonset/chaos-daemon --timeout=180s
  "${KUBECTL}" wait --for=condition=Established crd/networkchaos.chaos-mesh.org --timeout=60s
 }

 # cleanup: exit handler. Stops the background port-forward when its PID was
 # recorded, ignoring a failed kill. When CLEANUP is "true" it also deletes the
 # resolver-api-delay NetworkChaos (best effort) and the workload namespace.
 cleanup() {
  [[ -z "${PORT_FORWARD_PID:-}" ]] || kill "${PORT_FORWARD_PID}" >/dev/null 2>&1 || true

  if [[ "${CLEANUP}" != "true" ]]; then
    return 0
  fi

  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" delete networkchaos resolver-api-delay --ignore-not-found >/dev/null 2>&1 || true
  "${KUBECTL}" delete namespace "${LOAD_NAMESPACE}" --ignore-not-found >/dev/null
 }
 # Run cleanup on every exit path of the script.
 trap cleanup EXIT

 # Fail early when a required client tool is missing.
 need curl
 need awk
 need "${KUBECTL}"

 # Directory that receives the pprof samples.
 mkdir -p "${OUT_DIR}"

 log "using context: $("${KUBECTL}" config current-context)"
 log "writing pprof samples to ${OUT_DIR}"

 # Install Chaos Mesh when it was requested explicitly, or when an API delay
 # was requested and the NetworkChaos CRD is not present yet.
 if [[ "${INSTALL_CHAOS_MESH}" == "true" ]]; then
  install_chaos_mesh
 elif [[ -n "${CHAOS_MESH_APISERVER_DELAY}" ]] && ! chaos_mesh_installed; then
  log "Chaos Mesh NetworkChaos CRD is missing; installing Chaos Mesh with sane OpenShift defaults"
  install_chaos_mesh
 fi

 # Merge-patch the ConfigMaps that turn on runtime profiling, the git and
 # cluster resolvers, and beta API fields. patch_cm_data skips any ConfigMap
 # that does not exist.
 log "enabling resolver profiling and resolver feature flags"
 patch_cm_data "${RESOLVERS_NAMESPACE}" config-observability \
  '{"data":{"runtime-profiling":"enabled","metrics-protocol":"prometheus"}}'
 patch_cm_data "${RESOLVERS_NAMESPACE}" resolvers-feature-flags \
  '{"data":{"enable-git-resolver":"true","enable-cluster-resolver":"true"}}'
 patch_cm_data "${TEKTON_NAMESPACE}" feature-flags \
  '{"data":{"enable-api-fields":"beta"}}'

 # git-timeout mode only: shorten the git resolver fetch timeout and set its
 # default cache mode to "never".
 if [[ "${MODE}" == "git-timeout" ]]; then
  log "setting git resolver fetch-timeout=${GIT_TIMEOUT}"
  patch_cm_data "${RESOLVERS_NAMESPACE}" git-resolver-config \
    "{\"data\":{\"fetch-timeout\":\"${GIT_TIMEOUT}\",\"default-cache-mode\":\"never\"}}"
 fi

 # Restart the resolvers BEFORE injecting chaos so the ConfigMap changes are
 # picked up by fresh pods, and so the NetworkChaos below is created while the
 # pods that will serve the workload already exist.
 # NOTE(review): Chaos Mesh is understood to pick its target pods when the
 # experiment starts; applying it ahead of the restart would then delay only
 # pods that are about to be replaced. Confirm for the Chaos Mesh version in
 # use. This ordering is correct either way.
 if "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" get deployment "${RESOLVERS_DEPLOYMENT}" >/dev/null 2>&1; then
  log "restarting ${RESOLVERS_NAMESPACE}/${RESOLVERS_DEPLOYMENT} so profiling/config changes are definitely active"
  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" rollout restart deployment "${RESOLVERS_DEPLOYMENT}" >/dev/null
  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" rollout status deployment "${RESOLVERS_DEPLOYMENT}" --timeout=180s
 else
  echo "resolver deployment not found: ${RESOLVERS_NAMESPACE}/${RESOLVERS_DEPLOYMENT}" >&2
  exit 1
 fi

 # Optional: delay all egress traffic from the resolver pods to the API server
 # target by a fixed latency.
 if [[ -n "${CHAOS_MESH_APISERVER_DELAY}" ]]; then
  if ! chaos_mesh_installed; then
    echo "Chaos Mesh NetworkChaos CRD is still missing after install attempt; cannot apply API server delay" >&2
    exit 1
  fi
  log "applying Chaos Mesh API egress delay ${CHAOS_MESH_APISERVER_DELAY} to ${CHAOS_MESH_APISERVER_TARGET}"
  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" apply -f - <<YAML >/dev/null
 apiVersion: chaos-mesh.org/v1alpha1
 kind: NetworkChaos
 metadata:
  name: resolver-api-delay
 spec:
  action: delay
  mode: all
  selector:
    namespaces:
    - ${RESOLVERS_NAMESPACE}
    labelSelectors:
      app: tekton-pipelines-resolvers
  direction: to
  externalTargets:
  - ${CHAOS_MESH_APISERVER_TARGET}
  delay:
    latency: "${CHAOS_MESH_APISERVER_DELAY}"
    correlation: "100"
    jitter: "0ms"
 YAML
 fi

 # Create the workload namespace. Rendering it client-side and piping it into
 # apply keeps this step from failing when the namespace already exists.
 log "creating workload namespace ${LOAD_NAMESPACE}"
 "${KUBECTL}" create namespace "${LOAD_NAMESPACE}" --dry-run=client -o yaml | "${KUBECTL}" apply -f - >/dev/null

 # Mode-specific fixtures.
 if [[ "${MODE}" == "git-timeout" ]]; then
  # Deployment + Service "hang-git": a Python HTTP server on port 8080 whose
  # GET and POST handlers sleep for an hour, so requests are accepted but not
  # answered. The quoted here-document delimiter keeps the manifest literal.
  log "deploying hanging HTTP endpoint used by git resolver"
  "${KUBECTL}" -n "${LOAD_NAMESPACE}" apply -f - <<'YAML' >/dev/null
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: hang-git
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: hang-git
  template:
    metadata:
      labels:
        app: hang-git
    spec:
      containers:
      - name: server
        image: registry.access.redhat.com/ubi9/python-311:latest
        command:
        - python3
        - -c
        - |
          from http.server import ThreadingHTTPServer, BaseHTTPRequestHandler
          import time
          class Handler(BaseHTTPRequestHandler):
              def do_GET(self):
                  time.sleep(3600)
              def do_POST(self):
                  time.sleep(3600)
              def log_message(self, fmt, *args):
                  pass
          ThreadingHTTPServer(("", 8080), Handler).serve_forever()
        ports:
        - containerPort: 8080
 ---
 apiVersion: v1
 kind: Service
 metadata:
  name: hang-git
 spec:
  selector:
    app: hang-git
  ports:
  - name: http
    port: 8080
    targetPort: 8080
 YAML
  # Block until the hanging endpoint is rolled out (at most 180s).
  "${KUBECTL}" -n "${LOAD_NAMESPACE}" rollout status deployment hang-git --timeout=180s
 elif [[ "${MODE}" == "cluster" ]]; then
  # Task "leak-repro-noop": the target that the generated cluster-resolver
  # PipelineRuns reference by name.
  log "creating cluster resolver target Task"
  "${KUBECTL}" -n "${LOAD_NAMESPACE}" apply -f - <<'YAML' >/dev/null
 apiVersion: tekton.dev/v1
 kind: Task
 metadata:
  name: leak-repro-noop
 spec:
  steps:
  - name: nop
    image: registry.access.redhat.com/ubi9/ubi-minimal:latest
    script: |
      #!/usr/bin/env sh
      true
 YAML
  # Only a warning: the run continues without API latency injection.
  if [[ -z "${CHAOS_MESH_APISERVER_DELAY}" ]]; then
    log "cluster mode requires external API latency injection; without it, requests usually resolve before timeout"
  fi
 fi

 start_port_forward() {
  log "starting pprof port-forward localhost:${PPROF_LOCAL_PORT} -> ${RESOLVERS_DEPLOYMENT}:${PPROF_REMOTE_PORT}"
  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" port-forward "deployment/${RESOLVERS_DEPLOYMENT}" \
    "${PPROF_LOCAL_PORT}:${PPROF_REMOTE_PORT}" >"${OUT_DIR}/port-forward.log" 2>&1 &
  PORT_FORWARD_PID="$!"

  for _ in $(seq 1 30); do
    if curl -fsS "http://127.0.0.1:${PPROF_LOCAL_PORT}/debug/pprof/goroutine?debug=1" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done

  echo "pprof endpoint did not become reachable; see ${OUT_DIR}/port-forward.log" >&2
  exit 1
 }

 # sample_goroutines: every SAMPLE_INTERVAL_SECONDS, from 0 up to and including
 # SAMPLE_SECONDS, download the text goroutine profile through the local
 # port-forward into ${OUT_DIR}/goroutine-<elapsed>s.txt and append
 # "<elapsed>\t<count>" to ${OUT_DIR}/goroutine-counts.tsv (each row is also
 # echoed to stdout). <count> is the total from the profile's first
 # "goroutine profile: total N" line, or "unknown" when the download fails or
 # no such line is found. Returns 1 without sampling when the interval is not
 # positive.
 sample_goroutines() {
  local elapsed=0
  local sample_file="${OUT_DIR}/goroutine-counts.tsv"
  local pprof_url="http://127.0.0.1:${PPROF_LOCAL_PORT}/debug/pprof/goroutine?debug=1"
  local profile count

  # elapsed would never advance with a zero interval.
  if [[ "${SAMPLE_INTERVAL_SECONDS}" -le 0 ]]; then
    echo "sample interval must be a positive number of seconds: ${SAMPLE_INTERVAL_SECONDS}" >&2
    return 1
  fi

  printf 'elapsed_seconds\tgoroutines\n' >"${sample_file}"
  while [[ "${elapsed}" -le "${SAMPLE_SECONDS}" ]]; do
    profile="${OUT_DIR}/goroutine-${elapsed}s.txt"
    count=""
    # One failed download must not abort the whole run under `set -e`; the
    # sample is recorded as "unknown" and sampling continues.
    if curl -fsS --max-time 30 "${pprof_url}" -o "${profile}"; then
      count="$(awk '/^goroutine profile: total / {print $4; exit}' "${profile}")"
    else
      echo "goroutine sample at ${elapsed}s failed; recording it as unknown" >&2
    fi
    printf '%s\t%s\n' "${elapsed}" "${count:-unknown}" | tee -a "${sample_file}"
    elapsed=$((elapsed + SAMPLE_INTERVAL_SECONDS))
    # No pointless wait after the final sample.
    if [[ "${elapsed}" -le "${SAMPLE_SECONDS}" ]]; then
      sleep "${SAMPLE_INTERVAL_SECONDS}"
    fi
  done
 }

 # create_batch START END: generate one PipelineRun manifest per index in
 # START..END (inclusive) and submit them to the cluster in a single kubectl
 # call. In git-timeout mode the embedded task is resolved by the git resolver
 # against the hang-git service; otherwise it is resolved by the cluster
 # resolver against the leak-repro-noop Task. Reads MODE, LOAD_NAMESPACE and
 # KUBECTL.
 create_batch() {
  local start="$1"
  local end="$2"
  local i slug mode_label task resolver
  local -a params

  # Everything that differs between the two modes; params holds name/value
  # pairs for the resolver.
  if [[ "${MODE}" == "git-timeout" ]]; then
    slug="git"
    mode_label="git-timeout"
    task="resolve-slow-git-task"
    resolver="git"
    params=(
      url "http://hang-git.${LOAD_NAMESPACE}.svc.cluster.local:8080/repo.git"
      revision "main"
      pathInRepo "task/noop/0.1/noop.yaml"
      cache "never"
    )
  else
    slug="cluster"
    mode_label="cluster"
    task="resolve-cluster-task"
    resolver="cluster"
    params=(
      kind "task"
      namespace "${LOAD_NAMESPACE}"
      name "leak-repro-noop"
      cache "never"
    )
  fi

  # The manifests use metadata.generateName, which `kubectl apply` rejects
  # ("cannot use generate name with apply"), so they are submitted with
  # `kubectl create`.
  for ((i = start; i <= end; i++)); do
    printf '%s\n' \
      'apiVersion: tekton.dev/v1' \
      'kind: PipelineRun' \
      'metadata:' \
      "  generateName: leak-repro-${slug}-${i}-" \
      "  namespace: ${LOAD_NAMESPACE}" \
      '  labels:' \
      '    leak-repro.tekton.dev/run: "true"' \
      "    leak-repro.tekton.dev/mode: ${mode_label}" \
      'spec:' \
      '  pipelineSpec:' \
      '    tasks:' \
      "    - name: ${task}" \
      '      taskRef:' \
      "        resolver: ${resolver}" \
      '        params:'
    # printf reuses the format for each name/value pair.
    printf '        - name: %s\n          value: %s\n' "${params[@]}"
    printf '%s\n' '---'
  done | "${KUBECTL}" create -f - >/dev/null
 }

 # Open the pprof tunnel before any load is generated.
 start_port_forward

 # Submit the PipelineRuns in consecutive index ranges of at most BATCH_SIZE,
 # pausing BATCH_SLEEP_SECONDS after each batch.
 log "creating ${TOTAL} PipelineRuns in batches of ${BATCH_SIZE}"
 created=0
 until ((created >= TOTAL)); do
  start=$((created + 1))
  end=$((created + BATCH_SIZE))
  # Clamp the last batch to the requested total.
  ((end <= TOTAL)) || end="${TOTAL}"
  create_batch "${start}" "${end}"
  created="${end}"
  log "created ${created}/${TOTAL}"
  sleep "${BATCH_SLEEP_SECONDS}"
 done

 # Record goroutine counts for the configured window, then point at the output.
 log "sampling goroutine profile for ${SAMPLE_SECONDS}s"
 sample_goroutines

 log "done. Goroutine counts: ${OUT_DIR}/goroutine-counts.tsv"
 log "inspect blocked senders with: grep -R \"chan send\" ${OUT_DIR}/goroutine-*.txt | head"
	#!/usr/bin/env bash
	#
	# Reproduce the resolver goroutine leak fixed by tektoncd/pipeline#10098.
	#
	# The default mode creates many PipelineRuns whose TaskRef is resolved by the
	# git resolver against an in-cluster HTTP server that accepts requests and never
	# replies. With a short git resolver fetch-timeout, the resolver reconciler's
	# timeout case wins, then the worker goroutine blocks forever while sending on
	# an unbuffered result channel in vulnerable builds.
	#
	# This is intended for disposable Tekton test clusters.
	#
	# Handoff note:
	# This harness reproduces the resolver goroutine leak fixed by upstream
	# Tekton PR #10098. It drives many PipelineRuns through remote resolution,
	# enables resolver pprof, samples goroutine profiles, and writes goroutine
	# counts over time.
	#
	# Deterministic default repro:
	#
	# hack/reproduce-resolver-goroutine-leak.sh \
	# --total 3000 \
	# --batch-size 200 \
	# --sample-seconds 900
	#
	# The default mode uses the git resolver against an in-cluster HTTP endpoint
	# that accepts connections but never replies. The script lowers the git resolver
	# timeout, so the resolver reconciler hits the timeout/cancel path reliably.
	#
	# Cluster resolver / API latency repro:
	#
	# MODE=cluster hack/reproduce-resolver-goroutine-leak.sh \
	# --total 2000 \
	# --chaos-mesh-apiserver-delay 75s
	#
	# Cluster mode creates many PipelineRuns with taskRef.resolver: cluster. If
	# Chaos Mesh is not installed, the script installs it with Helm using OpenShift
	# defaults for CRI-O. It then injects latency from the resolver pod to
	# kubernetes.default.svc, simulating a slow Kubernetes API path.
	#
	# Expected validation output:
	#
	# resolver-leak-pprof-*/goroutine-counts.tsv
	# resolver-leak-pprof-*/goroutine-*.txt
	#
	# On a vulnerable build, goroutine counts should climb roughly linearly and not
	# recover. Profiles should show blocked channel sends around the resolver
	# reconciler. On a fixed build, counts should plateau after the workload drains.

	# Strict mode: stop on command failures, unset variables and failed pipeline
	# stages.
	set -euo pipefail

	# Settings. A non-empty value inherited from the environment wins; otherwise
	# the default written here is assigned.

	# Client binary and workload mode.
	: "${KUBECTL:=kubectl}"
	: "${MODE:=git-timeout}"

	# Namespaces and the resolver deployment that gets restarted and profiled.
	: "${LOAD_NAMESPACE:=resolver-leak-repro}"
	: "${TEKTON_NAMESPACE:=tekton-pipelines}"
	: "${RESOLVERS_NAMESPACE:=tekton-pipelines-resolvers}"
	: "${RESOLVERS_DEPLOYMENT:=tekton-pipelines-remote-resolvers}"

	# Load shape.
	: "${TOTAL:=1200}"
	: "${BATCH_SIZE:=100}"
	: "${BATCH_SLEEP_SECONDS:=1}"
	: "${GIT_TIMEOUT:=750ms}"

	# pprof sampling and port-forwarding.
	: "${SAMPLE_SECONDS:=600}"
	: "${SAMPLE_INTERVAL_SECONDS:=30}"
	: "${PPROF_LOCAL_PORT:=18008}"
	: "${PPROF_REMOTE_PORT:=8008}"
	: "${OUT_DIR:=resolver-leak-pprof-$(date +%Y%m%d-%H%M%S)}"
	: "${CLEANUP:=false}"

	# Chaos Mesh installation and API latency injection.
	: "${CHAOS_MESH_APISERVER_DELAY:=}"
	: "${CHAOS_MESH_APISERVER_TARGET:=kubernetes.default.svc}"
	: "${INSTALL_CHAOS_MESH:=false}"
	: "${CHAOS_MESH_NAMESPACE:=chaos-mesh}"
	: "${CHAOS_MESH_RELEASE:=chaos-mesh}"
	: "${CHAOS_MESH_VERSION:=}"
	: "${CHAOS_MESH_RUNTIME:=crio}"
	: "${CHAOS_MESH_SOCKET_PATH:=/var/run/crio/crio.sock}"

	# Print the command-line help to stdout: the supported options, the
	# environment overrides and two example invocations. The here-document is
	# unquoted, so every "Default:" column shows the value the variable holds
	# when usage is called. `<<-` strips the leading tab of each line, which
	# also lets the tab-indented EOF terminator close the document.
	usage() {
	  cat <<-EOF
	Usage: $0 [options]

	Options:
	  --mode git-timeout|cluster            Workload mode. Default: ${MODE}
	  --namespace NAME                      Namespace for generated workload. Default: ${LOAD_NAMESPACE}
	  --total N                             Total PipelineRuns to create. Default: ${TOTAL}
	  --batch-size N                        PipelineRuns per kubectl apply batch. Default: ${BATCH_SIZE}
	  --batch-sleep-seconds N               Sleep between batches. Default: ${BATCH_SLEEP_SECONDS}
	  --git-timeout DURATION                git-resolver fetch-timeout. Default: ${GIT_TIMEOUT}
	  --sample-seconds N                    pprof sampling window. Default: ${SAMPLE_SECONDS}
	  --sample-interval-seconds N           pprof sampling interval. Default: ${SAMPLE_INTERVAL_SECONDS}
	  --out-dir DIR                         Directory for pprof samples. Default: ${OUT_DIR}
	  --install-chaos-mesh                  Force install/upgrade Chaos Mesh with Helm before applying chaos.
	  --chaos-mesh-apiserver-delay DURATION Apply Chaos Mesh egress delay; installs Chaos Mesh if missing.
	  --chaos-mesh-version VERSION          Install a specific Chaos Mesh chart version.
	  --chaos-mesh-runtime RUNTIME          Runtime passed to Helm. Default: ${CHAOS_MESH_RUNTIME}
	  --chaos-mesh-socket-path PATH         Runtime socket passed to Helm. Default: ${CHAOS_MESH_SOCKET_PATH}
	  --cleanup                             Delete the workload namespace before exit.
	  -h, --help                            Show this help.

	Environment overrides:
	  KUBECTL, TEKTON_NAMESPACE, RESOLVERS_NAMESPACE, RESOLVERS_DEPLOYMENT,
	  PPROF_LOCAL_PORT, PPROF_REMOTE_PORT, CHAOS_MESH_APISERVER_TARGET,
	  CHAOS_MESH_NAMESPACE, CHAOS_MESH_RELEASE.

	Examples:
	  $0 --total 3000 --batch-size 200 --sample-seconds 900
	  MODE=cluster $0 --total 2000 --chaos-mesh-apiserver-delay 75s
	EOF
	}

	# Parse command-line options. Each option overrides the variable of the same
	# purpose; unknown options print the usage text and exit with status 2.
	while [[ $# -gt 0 ]]; do
	  # Options that consume a value: fail with a clear message instead of the
	  # bare "unbound variable" abort that "$2" triggers under `set -u` when the
	  # option is the last word on the command line.
	  case "$1" in
	    --mode | --namespace | --total | --batch-size | --batch-sleep-seconds | \
	      --git-timeout | --sample-seconds | --sample-interval-seconds | \
	      --out-dir | --chaos-mesh-apiserver-delay | --chaos-mesh-version | \
	      --chaos-mesh-runtime | --chaos-mesh-socket-path)
	      if [[ $# -lt 2 ]]; then
	        echo "option $1 requires a value" >&2
	        usage >&2
	        exit 2
	      fi
	      ;;
	  esac

	  case "$1" in
	    --mode)
	      MODE="$2"
	      shift 2
	      ;;
	    --namespace)
	      LOAD_NAMESPACE="$2"
	      shift 2
	      ;;
	    --total)
	      TOTAL="$2"
	      shift 2
	      ;;
	    --batch-size)
	      BATCH_SIZE="$2"
	      shift 2
	      ;;
	    --batch-sleep-seconds)
	      BATCH_SLEEP_SECONDS="$2"
	      shift 2
	      ;;
	    --git-timeout)
	      GIT_TIMEOUT="$2"
	      shift 2
	      ;;
	    --sample-seconds)
	      SAMPLE_SECONDS="$2"
	      shift 2
	      ;;
	    --sample-interval-seconds)
	      SAMPLE_INTERVAL_SECONDS="$2"
	      shift 2
	      ;;
	    --out-dir)
	      OUT_DIR="$2"
	      shift 2
	      ;;
	    --install-chaos-mesh)
	      INSTALL_CHAOS_MESH="true"
	      shift
	      ;;
	    --chaos-mesh-apiserver-delay)
	      CHAOS_MESH_APISERVER_DELAY="$2"
	      shift 2
	      ;;
	    --chaos-mesh-version)
	      CHAOS_MESH_VERSION="$2"
	      shift 2
	      ;;
	    --chaos-mesh-runtime)
	      CHAOS_MESH_RUNTIME="$2"
	      shift 2
	      ;;
	    --chaos-mesh-socket-path)
	      CHAOS_MESH_SOCKET_PATH="$2"
	      shift 2
	      ;;
	    --cleanup)
	      CLEANUP="true"
	      shift
	      ;;
	    # Two alternatives separated by an unescaped "|"; an escaped "\|" would
	    # only match the literal text "-h|--help".
	    -h|--help)
	      usage
	      exit 0
	      ;;
	    *)
	      echo "unknown argument: $1" >&2
	      usage >&2
	      exit 2
	      ;;
	  esac
	done

	if [[ "${MODE}" != "git-timeout" && "${MODE}" != "cluster" ]]; then
	  echo "--mode must be git-timeout or cluster" >&2
	  exit 2
	fi

	# check_int FLAG VALUE MINIMUM: exit 2 unless VALUE is a plain decimal integer
	# (no sign, no leading zeros) that is at least MINIMUM.
	check_int() {
	  local int_re='^(0|[1-9][0-9]*)$'
	  if [[ ! "$2" =~ ${int_re} ]] || [[ "$2" -lt "$3" ]]; then
	    echo "$1 must be an integer >= $3 (got '$2')" >&2
	    exit 2
	  fi
	}

	# These values feed integer comparisons and shell arithmetic later on. A zero
	# sampling interval would never advance the sampling loop, so it is rejected
	# here. BATCH_SLEEP_SECONDS is only handed to sleep and is left unchecked.
	check_int --total "${TOTAL}" 0
	check_int --batch-size "${BATCH_SIZE}" 1
	check_int --sample-seconds "${SAMPLE_SECONDS}" 0
	check_int --sample-interval-seconds "${SAMPLE_INTERVAL_SECONDS}" 1

	# need CMD: return 0 when `command -v` can resolve CMD; otherwise report the
	# missing command on stderr and terminate the script with status 1.
	need() {
	  command -v "$1" >/dev/null 2>&1 && return 0
	  echo "required command not found: $1" >&2
	  exit 1
	}

	# log MESSAGE...: print "[<timestamp>] MESSAGE" to stdout, with the arguments
	# joined by spaces and the timestamp formatted as %Y-%m-%dT%H:%M:%S%z.
	log() {
	  local stamp
	  # printf's %(...)T conversion only exists in bash >= 4.2; older shells
	  # (for example the bash 3.2 shipped with macOS) get the timestamp from date.
	  if ((BASH_VERSINFO[0] > 4 || (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] >= 2))); then
	    printf -v stamp '%(%Y-%m-%dT%H:%M:%S%z)T' -1
	  else
	    stamp="$(date '+%Y-%m-%dT%H:%M:%S%z')"
	  fi
	  printf '[%s] %s\n' "${stamp}" "$*"
	}

	# patch_cm_data NAMESPACE NAME PATCH: merge-patch ConfigMap NAMESPACE/NAME
	# with the JSON document PATCH. A ConfigMap that cannot be fetched is not an
	# error: the function logs that the patch was skipped and returns.
	patch_cm_data() {
	  local cm_namespace="$1"
	  local cm_name="$2"
	  local merge_patch="$3"

	  if ! "${KUBECTL}" -n "${cm_namespace}" get configmap "${cm_name}" >/dev/null 2>&1; then
	    log "configmap ${cm_namespace}/${cm_name} not found; skipping patch"
	    return
	  fi

	  "${KUBECTL}" -n "${cm_namespace}" patch configmap "${cm_name}" --type merge -p "${merge_patch}" >/dev/null
	}

	# is_openshift: succeed when the API group security.openshift.io lists a
	# resource whose name starts with "securitycontextconstraints"; fail when it
	# does not or when the kubectl query itself fails.
	is_openshift() {
	  local resources
	  # Capture the listing before matching. An escaped "\|" would hand a literal
	  # "|" to kubectl instead of building a pipeline, and a real pipe into
	  # `grep -q` can fail under `set -o pipefail` when grep exits on the first
	  # match and kubectl is killed by SIGPIPE.
	  resources="$("${KUBECTL}" api-resources --api-group=security.openshift.io 2>/dev/null)" || return 1
	  grep -q '^securitycontextconstraints' <<<"${resources}"
	}

	# chaos_mesh_installed: succeed when the CRD networkchaos.chaos-mesh.org can
	# be fetched from the cluster. All kubectl output is discarded; the return
	# status is kubectl's own exit status.
	chaos_mesh_installed() {
	  local -a crd_query=(get crd networkchaos.chaos-mesh.org)
	  "${KUBECTL}" "${crd_query[@]}" &>/dev/null
	}

	# install_chaos_mesh: install or upgrade the Chaos Mesh Helm release and wait
	# until its controller deployment, daemon set and NetworkChaos CRD are ready.
	# Requires helm. Reads CHAOS_MESH_RELEASE, CHAOS_MESH_NAMESPACE,
	# CHAOS_MESH_RUNTIME, CHAOS_MESH_SOCKET_PATH and, optionally,
	# CHAOS_MESH_VERSION. On OpenShift the chaos-daemon service account is granted
	# the privileged SCC when `oc` is available.
	install_chaos_mesh() {
	  need helm

	  # Build the whole argument list in one array that is never empty. Expanding
	  # a separate, possibly empty "${version_args[@]}" aborts with "unbound
	  # variable" under `set -u` on bash releases older than 4.4.
	  local -a helm_args=(
	    upgrade --install "${CHAOS_MESH_RELEASE}" chaos-mesh/chaos-mesh
	    --namespace "${CHAOS_MESH_NAMESPACE}"
	    --create-namespace
	    --set "chaosDaemon.runtime=${CHAOS_MESH_RUNTIME}"
	    --set "chaosDaemon.socketPath=${CHAOS_MESH_SOCKET_PATH}"
	  )
	  if [[ -n "${CHAOS_MESH_VERSION}" ]]; then
	    helm_args+=(--version "${CHAOS_MESH_VERSION}")
	  fi

	  log "installing/upgrading Chaos Mesh release ${CHAOS_MESH_RELEASE} in namespace ${CHAOS_MESH_NAMESPACE}"
	  helm repo add chaos-mesh https://charts.chaos-mesh.org >/dev/null
	  helm repo update chaos-mesh >/dev/null
	  helm "${helm_args[@]}"

	  if is_openshift; then
	    if command -v oc >/dev/null 2>&1; then
	      log "granting OpenShift privileged SCC to Chaos Mesh chaos-daemon service account"
	      oc adm policy add-scc-to-user privileged -n "${CHAOS_MESH_NAMESPACE}" -z chaos-daemon >/dev/null
	    else
	      log "OpenShift detected, but oc is not available; grant privileged SCC manually if chaos-daemon is blocked"
	    fi
	  fi

	  log "waiting for Chaos Mesh controllers"
	  "${KUBECTL}" -n "${CHAOS_MESH_NAMESPACE}" rollout status deployment/chaos-controller-manager --timeout=180s
	  "${KUBECTL}" -n "${CHAOS_MESH_NAMESPACE}" rollout status daemonset/chaos-daemon --timeout=180s
	  "${KUBECTL}" wait --for=condition=Established crd/networkchaos.chaos-mesh.org --timeout=60s
	}

	# cleanup: exit handler. Stops the background port-forward when its PID was
	# recorded, ignoring a failed kill. When CLEANUP is "true" it also deletes the
	# resolver-api-delay NetworkChaos (best effort) and the workload namespace.
	cleanup() {
	  # "|| true" must be a real shell operator; escaped as "\|\| true" the
	  # words are passed to the command as extra arguments instead.
	  [[ -z "${PORT_FORWARD_PID:-}" ]] || kill "${PORT_FORWARD_PID}" >/dev/null 2>&1 || true

	  if [[ "${CLEANUP}" != "true" ]]; then
	    return 0
	  fi

	  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" delete networkchaos resolver-api-delay --ignore-not-found >/dev/null 2>&1 || true
	  "${KUBECTL}" delete namespace "${LOAD_NAMESPACE}" --ignore-not-found >/dev/null
	}
	# Run cleanup on every exit path of the script.
	trap cleanup EXIT

	# Fail early when a required client tool is missing.
	need curl
	need awk
	need "${KUBECTL}"

	# Directory that receives the pprof samples.
	mkdir -p "${OUT_DIR}"

	log "using context: $("${KUBECTL}" config current-context)"
	log "writing pprof samples to ${OUT_DIR}"

	# Install Chaos Mesh when it was requested explicitly, or when an API delay
	# was requested and the NetworkChaos CRD is not present yet.
	if [[ "${INSTALL_CHAOS_MESH}" == "true" ]]; then
	install_chaos_mesh
	elif [[ -n "${CHAOS_MESH_APISERVER_DELAY}" ]] && ! chaos_mesh_installed; then
	log "Chaos Mesh NetworkChaos CRD is missing; installing Chaos Mesh with sane OpenShift defaults"
	install_chaos_mesh
	fi

	# Merge-patch the ConfigMaps that turn on runtime profiling, the git and
	# cluster resolvers, and beta API fields. patch_cm_data skips any ConfigMap
	# that does not exist.
	log "enabling resolver profiling and resolver feature flags"
	patch_cm_data "${RESOLVERS_NAMESPACE}" config-observability \
	'{"data":{"runtime-profiling":"enabled","metrics-protocol":"prometheus"}}'
	patch_cm_data "${RESOLVERS_NAMESPACE}" resolvers-feature-flags \
	'{"data":{"enable-git-resolver":"true","enable-cluster-resolver":"true"}}'
	patch_cm_data "${TEKTON_NAMESPACE}" feature-flags \
	'{"data":{"enable-api-fields":"beta"}}'

	# git-timeout mode only: shorten the git resolver fetch timeout and set its
	# default cache mode to "never".
	if [[ "${MODE}" == "git-timeout" ]]; then
	log "setting git resolver fetch-timeout=${GIT_TIMEOUT}"
	patch_cm_data "${RESOLVERS_NAMESPACE}" git-resolver-config \
	"{\"data\":{\"fetch-timeout\":\"${GIT_TIMEOUT}\",\"default-cache-mode\":\"never\"}}"
	fi

	# Restart the resolvers BEFORE injecting chaos so the ConfigMap changes are
	# picked up by fresh pods, and so the NetworkChaos below is created while the
	# pods that will serve the workload already exist.
	# NOTE(review): Chaos Mesh is understood to pick its target pods when the
	# experiment starts; applying it ahead of the restart would then delay only
	# pods that are about to be replaced. Confirm for the Chaos Mesh version in
	# use. This ordering is correct either way.
	if "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" get deployment "${RESOLVERS_DEPLOYMENT}" >/dev/null 2>&1; then
	  log "restarting ${RESOLVERS_NAMESPACE}/${RESOLVERS_DEPLOYMENT} so profiling/config changes are definitely active"
	  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" rollout restart deployment "${RESOLVERS_DEPLOYMENT}" >/dev/null
	  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" rollout status deployment "${RESOLVERS_DEPLOYMENT}" --timeout=180s
	else
	  echo "resolver deployment not found: ${RESOLVERS_NAMESPACE}/${RESOLVERS_DEPLOYMENT}" >&2
	  exit 1
	fi

	# Optional: delay all egress traffic from the resolver pods to the API server
	# target by a fixed latency. `<<-` strips the leading tab of every manifest
	# line, so the YAML nesting is carried by the spaces that follow it and the
	# tab-indented terminator still closes the document.
	if [[ -n "${CHAOS_MESH_APISERVER_DELAY}" ]]; then
	  if ! chaos_mesh_installed; then
	    echo "Chaos Mesh NetworkChaos CRD is still missing after install attempt; cannot apply API server delay" >&2
	    exit 1
	  fi
	  log "applying Chaos Mesh API egress delay ${CHAOS_MESH_APISERVER_DELAY} to ${CHAOS_MESH_APISERVER_TARGET}"
	  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" apply -f - <<-YAML >/dev/null
	apiVersion: chaos-mesh.org/v1alpha1
	kind: NetworkChaos
	metadata:
	  name: resolver-api-delay
	spec:
	  action: delay
	  mode: all
	  selector:
	    namespaces:
	    - ${RESOLVERS_NAMESPACE}
	    labelSelectors:
	      app: tekton-pipelines-resolvers
	  direction: to
	  externalTargets:
	  - ${CHAOS_MESH_APISERVER_TARGET}
	  delay:
	    latency: "${CHAOS_MESH_APISERVER_DELAY}"
	    correlation: "100"
	    jitter: "0ms"
	YAML
	fi

	# Create the workload namespace. Rendering it client-side and piping it into
	# apply keeps this step from failing when the namespace already exists.
	log "creating workload namespace ${LOAD_NAMESPACE}"
	"${KUBECTL}" create namespace "${LOAD_NAMESPACE}" --dry-run=client -o yaml | "${KUBECTL}" apply -f - >/dev/null

	# Mode-specific fixtures. The manifests use `<<-` here-documents: the leading
	# tab of each line is stripped, the spaces after it carry the YAML (and
	# embedded Python) nesting, and the quoted delimiter keeps the text literal.
	if [[ "${MODE}" == "git-timeout" ]]; then
	  # Deployment + Service "hang-git": a Python HTTP server on port 8080 whose
	  # GET and POST handlers sleep for an hour, so requests are accepted but not
	  # answered.
	  log "deploying hanging HTTP endpoint used by git resolver"
	  "${KUBECTL}" -n "${LOAD_NAMESPACE}" apply -f - <<-'YAML' >/dev/null
	apiVersion: apps/v1
	kind: Deployment
	metadata:
	  name: hang-git
	spec:
	  replicas: 1
	  selector:
	    matchLabels:
	      app: hang-git
	  template:
	    metadata:
	      labels:
	        app: hang-git
	    spec:
	      containers:
	      - name: server
	        image: registry.access.redhat.com/ubi9/python-311:latest
	        command:
	        - python3
	        - -c
	        - |
	          from http.server import ThreadingHTTPServer, BaseHTTPRequestHandler
	          import time
	          class Handler(BaseHTTPRequestHandler):
	              def do_GET(self):
	                  time.sleep(3600)
	              def do_POST(self):
	                  time.sleep(3600)
	              def log_message(self, fmt, *args):
	                  pass
	          ThreadingHTTPServer(("", 8080), Handler).serve_forever()
	        ports:
	        - containerPort: 8080
	---
	apiVersion: v1
	kind: Service
	metadata:
	  name: hang-git
	spec:
	  selector:
	    app: hang-git
	  ports:
	  - name: http
	    port: 8080
	    targetPort: 8080
	YAML
	  # Block until the hanging endpoint is rolled out (at most 180s).
	  "${KUBECTL}" -n "${LOAD_NAMESPACE}" rollout status deployment hang-git --timeout=180s
	elif [[ "${MODE}" == "cluster" ]]; then
	  # Task "leak-repro-noop": the target that the generated cluster-resolver
	  # PipelineRuns reference by name.
	  log "creating cluster resolver target Task"
	  "${KUBECTL}" -n "${LOAD_NAMESPACE}" apply -f - <<-'YAML' >/dev/null
	apiVersion: tekton.dev/v1
	kind: Task
	metadata:
	  name: leak-repro-noop
	spec:
	  steps:
	  - name: nop
	    image: registry.access.redhat.com/ubi9/ubi-minimal:latest
	    script: |
	      #!/usr/bin/env sh
	      true
	YAML
	  # Only a warning: the run continues without API latency injection.
	  if [[ -z "${CHAOS_MESH_APISERVER_DELAY}" ]]; then
	    log "cluster mode requires external API latency injection; without it, requests usually resolve before timeout"
	  fi
	fi

	start_port_forward() {
	log "starting pprof port-forward localhost:${PPROF_LOCAL_PORT} -> ${RESOLVERS_DEPLOYMENT}:${PPROF_REMOTE_PORT}"
	"${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" port-forward "deployment/${RESOLVERS_DEPLOYMENT}" \
	"${PPROF_LOCAL_PORT}:${PPROF_REMOTE_PORT}" >"${OUT_DIR}/port-forward.log" 2>&1 &
	PORT_FORWARD_PID="$!"

	for _ in $(seq 1 30); do
	if curl -fsS "http://127.0.0.1:${PPROF_LOCAL_PORT}/debug/pprof/goroutine?debug=1" >/dev/null 2>&1; then
	return 0
	fi
	sleep 1
	done

	echo "pprof endpoint did not become reachable; see ${OUT_DIR}/port-forward.log" >&2
	exit 1
	}

	# sample_goroutines: every SAMPLE_INTERVAL_SECONDS, from 0 up to and including
	# SAMPLE_SECONDS, download the text goroutine profile through the local
	# port-forward into ${OUT_DIR}/goroutine-<elapsed>s.txt and append
	# "<elapsed>\t<count>" to ${OUT_DIR}/goroutine-counts.tsv (each row is also
	# echoed to stdout). <count> is the total from the profile's first
	# "goroutine profile: total N" line, or "unknown" when the download fails or
	# no such line is found. Returns 1 without sampling when the interval is not
	# positive.
	sample_goroutines() {
	  local elapsed=0
	  local sample_file="${OUT_DIR}/goroutine-counts.tsv"
	  local pprof_url="http://127.0.0.1:${PPROF_LOCAL_PORT}/debug/pprof/goroutine?debug=1"
	  local profile count

	  # elapsed would never advance with a zero interval.
	  if [[ "${SAMPLE_INTERVAL_SECONDS}" -le 0 ]]; then
	    echo "sample interval must be a positive number of seconds: ${SAMPLE_INTERVAL_SECONDS}" >&2
	    return 1
	  fi

	  printf 'elapsed_seconds\tgoroutines\n' >"${sample_file}"
	  while [[ "${elapsed}" -le "${SAMPLE_SECONDS}" ]]; do
	    profile="${OUT_DIR}/goroutine-${elapsed}s.txt"
	    count=""
	    # One failed download must not abort the whole run under `set -e`; the
	    # sample is recorded as "unknown" and sampling continues.
	    if curl -fsS --max-time 30 "${pprof_url}" -o "${profile}"; then
	      count="$(awk '/^goroutine profile: total / {print $4; exit}' "${profile}")"
	    else
	      echo "goroutine sample at ${elapsed}s failed; recording it as unknown" >&2
	    fi
	    # A real pipe into tee: the row goes to stdout and to the TSV file.
	    printf '%s\t%s\n' "${elapsed}" "${count:-unknown}" | tee -a "${sample_file}"
	    elapsed=$((elapsed + SAMPLE_INTERVAL_SECONDS))
	    # No pointless wait after the final sample.
	    if [[ "${elapsed}" -le "${SAMPLE_SECONDS}" ]]; then
	      sleep "${SAMPLE_INTERVAL_SECONDS}"
	    fi
	  done
	}

	# create_batch START END: generate one PipelineRun manifest per index in
	# START..END (inclusive) and submit them to the cluster in a single kubectl
	# call. In git-timeout mode the embedded task is resolved by the git resolver
	# against the hang-git service; otherwise it is resolved by the cluster
	# resolver against the leak-repro-noop Task. Reads MODE, LOAD_NAMESPACE and
	# KUBECTL.
	create_batch() {
	  local start="$1"
	  local end="$2"
	  local i slug mode_label task resolver
	  local -a params

	  # Everything that differs between the two modes; params holds name/value
	  # pairs for the resolver.
	  if [[ "${MODE}" == "git-timeout" ]]; then
	    slug="git"
	    mode_label="git-timeout"
	    task="resolve-slow-git-task"
	    resolver="git"
	    params=(
	      url "http://hang-git.${LOAD_NAMESPACE}.svc.cluster.local:8080/repo.git"
	      revision "main"
	      pathInRepo "task/noop/0.1/noop.yaml"
	      cache "never"
	    )
	  else
	    slug="cluster"
	    mode_label="cluster"
	    task="resolve-cluster-task"
	    resolver="cluster"
	    params=(
	      kind "task"
	      namespace "${LOAD_NAMESPACE}"
	      name "leak-repro-noop"
	      cache "never"
	    )
	  fi

	  # The manifests are emitted with printf so their YAML nesting does not
	  # depend on how this file is indented. They use metadata.generateName,
	  # which `kubectl apply` rejects ("cannot use generate name with apply"),
	  # so they are piped into `kubectl create`.
	  for ((i = start; i <= end; i++)); do
	    printf '%s\n' \
	      'apiVersion: tekton.dev/v1' \
	      'kind: PipelineRun' \
	      'metadata:' \
	      "  generateName: leak-repro-${slug}-${i}-" \
	      "  namespace: ${LOAD_NAMESPACE}" \
	      '  labels:' \
	      '    leak-repro.tekton.dev/run: "true"' \
	      "    leak-repro.tekton.dev/mode: ${mode_label}" \
	      'spec:' \
	      '  pipelineSpec:' \
	      '    tasks:' \
	      "    - name: ${task}" \
	      '      taskRef:' \
	      "        resolver: ${resolver}" \
	      '        params:'
	    # printf reuses the format for each name/value pair.
	    printf '        - name: %s\n          value: %s\n' "${params[@]}"
	    printf '%s\n' '---'
	  done | "${KUBECTL}" create -f - >/dev/null
	}

	# Open the pprof tunnel before any load is generated.
	start_port_forward

	# Submit the PipelineRuns in consecutive index ranges of at most BATCH_SIZE,
	# pausing BATCH_SLEEP_SECONDS after each batch.
	log "creating ${TOTAL} PipelineRuns in batches of ${BATCH_SIZE}"
	created=0
	until ((created >= TOTAL)); do
	  start=$((created + 1))
	  end=$((created + BATCH_SIZE))
	  # Clamp the last batch to the requested total.
	  ((end <= TOTAL)) || end="${TOTAL}"
	  create_batch "${start}" "${end}"
	  created="${end}"
	  log "created ${created}/${TOTAL}"
	  sleep "${BATCH_SLEEP_SECONDS}"
	done

	# Record goroutine counts for the configured window, then point at the output.
	log "sampling goroutine profile for ${SAMPLE_SECONDS}s"
	sample_goroutines

	log "done. Goroutine counts: ${OUT_DIR}/goroutine-counts.tsv"
	# The hint must show a plain "|" so it can be pasted into a shell; inside
	# double quotes "\|" would print the backslash as well.
	log "inspect blocked senders with: grep -R \"chan send\" ${OUT_DIR}/goroutine-*.txt | head"
No results found