Created
May 26, 2026 13:14
-
-
Save chmouel/ad15c891e011c88feed344b6fa0b7a3f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| # | |
| # Reproduce the resolver goroutine leak fixed by tektoncd/pipeline#10098. | |
| # | |
| # The default mode creates many PipelineRuns whose TaskRef is resolved by the | |
| # git resolver against an in-cluster HTTP server that accepts requests and never | |
| # replies. With a short git resolver fetch-timeout, the resolver reconciler's | |
| # timeout case wins, then the worker goroutine blocks forever while sending on | |
| # an unbuffered result channel in vulnerable builds. | |
| # | |
| # This is intended for disposable Tekton test clusters. | |
| # | |
| # Handoff note: | |
| # This harness reproduces the resolver goroutine leak fixed by upstream | |
| # Tekton PR #10098. It drives many PipelineRuns through remote resolution, | |
| # enables resolver pprof, samples goroutine profiles, and writes goroutine | |
| # counts over time. | |
| # | |
| # Deterministic default repro: | |
| # | |
| # hack/reproduce-resolver-goroutine-leak.sh \ | |
| # --total 3000 \ | |
| # --batch-size 200 \ | |
| # --sample-seconds 900 | |
| # | |
| # The default mode uses the git resolver against an in-cluster HTTP endpoint | |
| # that accepts connections but never replies. The script lowers the git resolver | |
| # timeout, so the resolver reconciler hits the timeout/cancel path reliably. | |
| # | |
| # Cluster resolver / API latency repro: | |
| # | |
| # MODE=cluster hack/reproduce-resolver-goroutine-leak.sh \ | |
| # --total 2000 \ | |
| # --chaos-mesh-apiserver-delay 75s | |
| # | |
| # Cluster mode creates many PipelineRuns with taskRef.resolver: cluster. If | |
| # Chaos Mesh is not installed, the script installs it with Helm using OpenShift | |
| # defaults for CRI-O. It then injects latency from the resolver pod to | |
| # kubernetes.default.svc, simulating a slow Kubernetes API path. | |
| # | |
| # Expected validation output: | |
| # | |
| # resolver-leak-pprof-*/goroutine-counts.tsv | |
| # resolver-leak-pprof-*/goroutine-*.txt | |
| # | |
| # On a vulnerable build, goroutine counts should climb roughly linearly and not | |
| # recover. Profiles should show blocked channel sends around the resolver | |
| # reconciler. On a fixed build, counts should plateau after the workload drains. | |
# Strict mode: abort on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Tunables. A value already present in the environment is kept; the default
# is assigned only when the variable is unset or empty.
: "${KUBECTL:=kubectl}"
: "${MODE:=git-timeout}"
: "${LOAD_NAMESPACE:=resolver-leak-repro}"
: "${TEKTON_NAMESPACE:=tekton-pipelines}"
: "${RESOLVERS_NAMESPACE:=tekton-pipelines-resolvers}"
: "${RESOLVERS_DEPLOYMENT:=tekton-pipelines-remote-resolvers}"

# Workload shape.
: "${TOTAL:=1200}"
: "${BATCH_SIZE:=100}"
: "${BATCH_SLEEP_SECONDS:=1}"
: "${GIT_TIMEOUT:=750ms}"

# pprof sampling.
: "${SAMPLE_SECONDS:=600}"
: "${SAMPLE_INTERVAL_SECONDS:=30}"
: "${PPROF_LOCAL_PORT:=18008}"
: "${PPROF_REMOTE_PORT:=8008}"
: "${OUT_DIR:=resolver-leak-pprof-$(date +%Y%m%d-%H%M%S)}"
: "${CLEANUP:=false}"

# Chaos Mesh (optional API latency injection).
: "${CHAOS_MESH_APISERVER_DELAY:=}"
: "${CHAOS_MESH_APISERVER_TARGET:=kubernetes.default.svc}"
: "${INSTALL_CHAOS_MESH:=false}"
: "${CHAOS_MESH_NAMESPACE:=chaos-mesh}"
: "${CHAOS_MESH_RELEASE:=chaos-mesh}"
: "${CHAOS_MESH_VERSION:=}"
: "${CHAOS_MESH_RUNTIME:=crio}"
: "${CHAOS_MESH_SOCKET_PATH:=/var/run/crio/crio.sock}"
#######################################
# Print the command-line help text.
# Globals:   reads the current option values (MODE, TOTAL, ...) so the help
#            shows the defaults that are actually in effect.
# Outputs:   help text on stdout; callers redirect to stderr for errors.
#######################################
usage() {
  cat <<EOF
Usage: $0 [options]

Options:
  --mode git-timeout|cluster             Workload mode. Default: ${MODE}
  --namespace NAME                       Namespace for generated workload. Default: ${LOAD_NAMESPACE}
  --total N                              Total PipelineRuns to create. Default: ${TOTAL}
  --batch-size N                         PipelineRuns per kubectl batch. Default: ${BATCH_SIZE}
  --batch-sleep-seconds N                Sleep between batches. Default: ${BATCH_SLEEP_SECONDS}
  --git-timeout DURATION                 git-resolver fetch-timeout. Default: ${GIT_TIMEOUT}
  --sample-seconds N                     pprof sampling window. Default: ${SAMPLE_SECONDS}
  --sample-interval-seconds N            pprof sampling interval. Default: ${SAMPLE_INTERVAL_SECONDS}
  --out-dir DIR                          Directory for pprof samples. Default: ${OUT_DIR}
  --install-chaos-mesh                   Force install/upgrade Chaos Mesh with Helm before applying chaos.
  --chaos-mesh-apiserver-delay DURATION  Apply Chaos Mesh egress delay; installs Chaos Mesh if missing.
  --chaos-mesh-version VERSION           Install a specific Chaos Mesh chart version.
  --chaos-mesh-runtime RUNTIME           Runtime passed to Helm. Default: ${CHAOS_MESH_RUNTIME}
  --chaos-mesh-socket-path PATH          Runtime socket passed to Helm. Default: ${CHAOS_MESH_SOCKET_PATH}
  --cleanup                              Delete the workload namespace and the NetworkChaos before exit.
  -h, --help                             Show this help.

Environment overrides:
  Every option above can also be preset through the environment
  (command-line flags take precedence):
    MODE, LOAD_NAMESPACE, TOTAL, BATCH_SIZE, BATCH_SLEEP_SECONDS, GIT_TIMEOUT,
    SAMPLE_SECONDS, SAMPLE_INTERVAL_SECONDS, OUT_DIR, INSTALL_CHAOS_MESH,
    CHAOS_MESH_APISERVER_DELAY, CHAOS_MESH_VERSION, CHAOS_MESH_RUNTIME,
    CHAOS_MESH_SOCKET_PATH, CLEANUP.
  Environment only:
    KUBECTL, TEKTON_NAMESPACE, RESOLVERS_NAMESPACE, RESOLVERS_DEPLOYMENT,
    PPROF_LOCAL_PORT, PPROF_REMOTE_PORT, CHAOS_MESH_APISERVER_TARGET,
    CHAOS_MESH_NAMESPACE, CHAOS_MESH_RELEASE.

Examples:
  $0 --total 3000 --batch-size 200 --sample-seconds 900
  MODE=cluster $0 --total 2000 --chaos-mesh-apiserver-delay 75s
EOF
}
#######################################
# Parse command-line options into the configuration globals.
# Globals:   writes MODE, LOAD_NAMESPACE, TOTAL, BATCH_SIZE,
#            BATCH_SLEEP_SECONDS, GIT_TIMEOUT, SAMPLE_SECONDS,
#            SAMPLE_INTERVAL_SECONDS, OUT_DIR, INSTALL_CHAOS_MESH,
#            CHAOS_MESH_APISERVER_DELAY, CHAOS_MESH_VERSION,
#            CHAOS_MESH_RUNTIME, CHAOS_MESH_SOCKET_PATH, CLEANUP.
# Arguments: the script's command-line arguments.
# Outputs:   help on stdout for -h/--help; errors and help on stderr.
# Exits:     0 after printing help; 2 on an unknown option or on a
#            value-taking option that has no value.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    # A value-taking option given as the last argument would otherwise read
    # an unset "$2" and die under nounset with an opaque "unbound variable".
    case "$1" in
      --mode | --namespace | --total | --batch-size | --batch-sleep-seconds | \
        --git-timeout | --sample-seconds | --sample-interval-seconds | \
        --out-dir | --chaos-mesh-apiserver-delay | --chaos-mesh-version | \
        --chaos-mesh-runtime | --chaos-mesh-socket-path)
        if [[ $# -lt 2 ]]; then
          echo "option $1 requires a value" >&2
          usage >&2
          exit 2
        fi
        ;;
    esac

    case "$1" in
      --mode)
        MODE="$2"
        shift 2
        ;;
      --namespace)
        LOAD_NAMESPACE="$2"
        shift 2
        ;;
      --total)
        TOTAL="$2"
        shift 2
        ;;
      --batch-size)
        BATCH_SIZE="$2"
        shift 2
        ;;
      --batch-sleep-seconds)
        BATCH_SLEEP_SECONDS="$2"
        shift 2
        ;;
      --git-timeout)
        GIT_TIMEOUT="$2"
        shift 2
        ;;
      --sample-seconds)
        SAMPLE_SECONDS="$2"
        shift 2
        ;;
      --sample-interval-seconds)
        SAMPLE_INTERVAL_SECONDS="$2"
        shift 2
        ;;
      --out-dir)
        OUT_DIR="$2"
        shift 2
        ;;
      --install-chaos-mesh)
        INSTALL_CHAOS_MESH="true"
        shift
        ;;
      --chaos-mesh-apiserver-delay)
        CHAOS_MESH_APISERVER_DELAY="$2"
        shift 2
        ;;
      --chaos-mesh-version)
        CHAOS_MESH_VERSION="$2"
        shift 2
        ;;
      --chaos-mesh-runtime)
        CHAOS_MESH_RUNTIME="$2"
        shift 2
        ;;
      --chaos-mesh-socket-path)
        CHAOS_MESH_SOCKET_PATH="$2"
        shift 2
        ;;
      --cleanup)
        CLEANUP="true"
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "unknown argument: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
# Only the two workload modes implemented below are accepted.
case "${MODE}" in
  git-timeout | cluster) ;;
  *)
    echo "--mode must be git-timeout or cluster" >&2
    exit 2
    ;;
esac
#######################################
# Require that a command is available.
# Arguments: $1 - command name to look up.
# Outputs:   an error on stderr when the command is missing.
# Exits:     1 when the command cannot be found.
#######################################
need() {
  local tool="$1"
  command -v "${tool}" >/dev/null 2>&1 && return 0
  echo "required command not found: ${tool}" >&2
  exit 1
}
#######################################
# Print a timestamped progress message.
# Arguments: message words; joined with spaces.
# Outputs:   "[<timestamp with UTC offset>] <message>" on stdout.
#######################################
log() {
  local stamp=""
  # printf's %(fmt)T needs bash >= 4.2; older shells (e.g. the stock macOS
  # bash 3.2) reject the format, so fall back to date(1) there.
  if ! printf -v stamp '%(%Y-%m-%dT%H:%M:%S%z)T' -1 2>/dev/null || [[ -z "${stamp}" ]]; then
    stamp="$(date +%Y-%m-%dT%H:%M:%S%z)"
  fi
  printf '[%s] %s\n' "${stamp}" "$*"
}
#######################################
# Merge-patch a ConfigMap, skipping it (with a log line) when it is absent.
# Arguments: $1 - namespace
#            $2 - ConfigMap name
#            $3 - JSON merge patch
#######################################
patch_cm_data() {
  local cm_namespace="$1" cm_name="$2" merge_patch="$3"

  if ! "${KUBECTL}" -n "${cm_namespace}" get configmap "${cm_name}" >/dev/null 2>&1; then
    log "configmap ${cm_namespace}/${cm_name} not found; skipping patch"
    return
  fi

  "${KUBECTL}" -n "${cm_namespace}" patch configmap "${cm_name}" \
    --type merge -p "${merge_patch}" >/dev/null
}
#######################################
# Succeed when the cluster lists a securitycontextconstraints resource in
# the security.openshift.io API group.
#######################################
is_openshift() {
  local scc_group="security.openshift.io"
  "${KUBECTL}" api-resources --api-group="${scc_group}" 2>/dev/null \
    | grep -q '^securitycontextconstraints'
}
#######################################
# Succeed when the NetworkChaos CRD can be fetched from the cluster.
# Returns: the exit status of the kubectl lookup.
#######################################
chaos_mesh_installed() {
  local network_chaos_crd="networkchaos.chaos-mesh.org"
  "${KUBECTL}" get crd "${network_chaos_crd}" >/dev/null 2>&1
}
#######################################
# Install or upgrade Chaos Mesh with Helm and wait for it to be ready.
# Globals:   reads CHAOS_MESH_RELEASE, CHAOS_MESH_NAMESPACE,
#            CHAOS_MESH_RUNTIME, CHAOS_MESH_SOCKET_PATH, CHAOS_MESH_VERSION,
#            KUBECTL.
# Outputs:   progress via log; Helm and rollout output on stdout.
# Exits:     1 (via need) when helm is not installed.
#######################################
install_chaos_mesh() {
  need helm

  # Build the whole Helm command line in one array. The array is never
  # empty, so expanding it is safe under nounset even on bash < 4.4, where
  # expanding an empty array is treated as an unbound variable.
  local helm_args=(
    upgrade --install "${CHAOS_MESH_RELEASE}" chaos-mesh/chaos-mesh
    --namespace "${CHAOS_MESH_NAMESPACE}"
    --create-namespace
    --set "chaosDaemon.runtime=${CHAOS_MESH_RUNTIME}"
    --set "chaosDaemon.socketPath=${CHAOS_MESH_SOCKET_PATH}"
  )
  if [[ -n "${CHAOS_MESH_VERSION}" ]]; then
    helm_args+=(--version "${CHAOS_MESH_VERSION}")
  fi

  log "installing/upgrading Chaos Mesh release ${CHAOS_MESH_RELEASE} in namespace ${CHAOS_MESH_NAMESPACE}"
  helm repo add chaos-mesh https://charts.chaos-mesh.org >/dev/null
  helm repo update chaos-mesh >/dev/null
  helm "${helm_args[@]}"

  if is_openshift; then
    if command -v oc >/dev/null 2>&1; then
      log "granting OpenShift privileged SCC to Chaos Mesh chaos-daemon service account"
      oc adm policy add-scc-to-user privileged -n "${CHAOS_MESH_NAMESPACE}" -z chaos-daemon >/dev/null
    else
      log "OpenShift detected, but oc is not available; grant privileged SCC manually if chaos-daemon is blocked"
    fi
  fi

  log "waiting for Chaos Mesh controllers"
  "${KUBECTL}" -n "${CHAOS_MESH_NAMESPACE}" rollout status deployment/chaos-controller-manager --timeout=180s
  "${KUBECTL}" -n "${CHAOS_MESH_NAMESPACE}" rollout status daemonset/chaos-daemon --timeout=180s
  "${KUBECTL}" wait --for=condition=Established crd/networkchaos.chaos-mesh.org --timeout=60s
}
#######################################
# EXIT-trap handler: stop the pprof port-forward and, when CLEANUP is
# "true", remove the NetworkChaos and the workload namespace.
# Globals:   reads PORT_FORWARD_PID (may be unset), CLEANUP, KUBECTL,
#            RESOLVERS_NAMESPACE, LOAD_NAMESPACE.
# Returns:   always 0. Every step is best-effort so that a cleanup failure
#            is reported but never becomes the script's exit status.
#######################################
cleanup() {
  if [[ -n "${PORT_FORWARD_PID:-}" ]]; then
    kill "${PORT_FORWARD_PID}" >/dev/null 2>&1 || true
  fi

  if [[ "${CLEANUP}" == "true" ]]; then
    # The NetworkChaos may never have been created (or its CRD may be
    # missing), so errors here are expected and deliberately silenced.
    "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" delete networkchaos resolver-api-delay \
      --ignore-not-found >/dev/null 2>&1 || true
    if ! "${KUBECTL}" delete namespace "${LOAD_NAMESPACE}" --ignore-not-found >/dev/null; then
      echo "warning: failed to delete namespace ${LOAD_NAMESPACE}" >&2
    fi
  fi

  return 0
}
# Run cleanup() whenever the script exits.
trap cleanup EXIT

# Fail early when a required tool is missing.
for required_tool in curl awk "${KUBECTL}"; do
  need "${required_tool}"
done

mkdir -p "${OUT_DIR}"
log "using context: $("${KUBECTL}" config current-context)"
log "writing pprof samples to ${OUT_DIR}"

# Chaos Mesh is installed when explicitly requested, or implicitly when an
# API delay was asked for and the NetworkChaos CRD is not present yet.
if [[ "${INSTALL_CHAOS_MESH}" == "true" ]]; then
  install_chaos_mesh
elif [[ -n "${CHAOS_MESH_APISERVER_DELAY}" ]] && ! chaos_mesh_installed; then
  log "Chaos Mesh NetworkChaos CRD is missing; installing Chaos Mesh with sane OpenShift defaults"
  install_chaos_mesh
fi

log "enabling resolver profiling and resolver feature flags"
patch_cm_data "${RESOLVERS_NAMESPACE}" config-observability \
  '{"data":{"runtime-profiling":"enabled","metrics-protocol":"prometheus"}}'
patch_cm_data "${RESOLVERS_NAMESPACE}" resolvers-feature-flags \
  '{"data":{"enable-git-resolver":"true","enable-cluster-resolver":"true"}}'
patch_cm_data "${TEKTON_NAMESPACE}" feature-flags \
  '{"data":{"enable-api-fields":"beta"}}'

# Only the git-timeout workload needs the shortened fetch timeout.
if [[ "${MODE}" == "git-timeout" ]]; then
  log "setting git resolver fetch-timeout=${GIT_TIMEOUT}"
  printf -v git_resolver_patch \
    '{"data":{"fetch-timeout":"%s","default-cache-mode":"never"}}' "${GIT_TIMEOUT}"
  patch_cm_data "${RESOLVERS_NAMESPACE}" git-resolver-config "${git_resolver_patch}"
fi
# Optionally delay egress from the resolver pods to the API server target.
# The manifest nesting follows the Chaos Mesh NetworkChaos schema: selector,
# direction, externalTargets and delay all live under spec.
# NOTE(review): this is applied before the resolver deployment is restarted
# further down; confirm that Chaos Mesh also injects the delay into the
# replacement pods created by that restart.
if [[ -n "${CHAOS_MESH_APISERVER_DELAY}" ]]; then
  if ! chaos_mesh_installed; then
    echo "Chaos Mesh NetworkChaos CRD is still missing after install attempt; cannot apply API server delay" >&2
    exit 1
  fi
  log "applying Chaos Mesh API egress delay ${CHAOS_MESH_APISERVER_DELAY} to ${CHAOS_MESH_APISERVER_TARGET}"
  "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" apply -f - <<YAML >/dev/null
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: resolver-api-delay
spec:
  action: delay
  mode: all
  selector:
    namespaces:
      - ${RESOLVERS_NAMESPACE}
    labelSelectors:
      app: tekton-pipelines-resolvers
  direction: to
  externalTargets:
    - ${CHAOS_MESH_APISERVER_TARGET}
  delay:
    latency: "${CHAOS_MESH_APISERVER_DELAY}"
    correlation: "100"
    jitter: "0ms"
YAML
fi
# The resolver deployment must exist; without it there is nothing to profile.
if ! "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" get deployment "${RESOLVERS_DEPLOYMENT}" >/dev/null 2>&1; then
  echo "resolver deployment not found: ${RESOLVERS_NAMESPACE}/${RESOLVERS_DEPLOYMENT}" >&2
  exit 1
fi

log "restarting ${RESOLVERS_NAMESPACE}/${RESOLVERS_DEPLOYMENT} so profiling/config changes are definitely active"
"${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" rollout restart deployment "${RESOLVERS_DEPLOYMENT}" >/dev/null
"${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" rollout status deployment "${RESOLVERS_DEPLOYMENT}" --timeout=180s

# Render the Namespace client-side and apply it, so an already existing
# namespace is not an error.
log "creating workload namespace ${LOAD_NAMESPACE}"
"${KUBECTL}" create namespace "${LOAD_NAMESPACE}" --dry-run=client -o yaml \
  | "${KUBECTL}" apply -f - >/dev/null
# Create the per-mode fixture the PipelineRuns will resolve against.
if [[ "${MODE}" == "git-timeout" ]]; then
  # An HTTP server whose GET/POST handlers sleep for an hour without
  # replying, fronted by a Service named hang-git on port 8080.
  log "deploying hanging HTTP endpoint used by git resolver"
  "${KUBECTL}" -n "${LOAD_NAMESPACE}" apply -f - <<'YAML' >/dev/null
apiVersion: apps/v1
kind: Deployment
metadata:
  name: hang-git
spec:
  replicas: 1
  selector:
    matchLabels:
      app: hang-git
  template:
    metadata:
      labels:
        app: hang-git
    spec:
      containers:
        - name: server
          image: registry.access.redhat.com/ubi9/python-311:latest
          command:
            - python3
            - -c
            - |
              from http.server import ThreadingHTTPServer, BaseHTTPRequestHandler
              import time
              class Handler(BaseHTTPRequestHandler):
                  def do_GET(self):
                      time.sleep(3600)
                  def do_POST(self):
                      time.sleep(3600)
                  def log_message(self, fmt, *args):
                      pass
              ThreadingHTTPServer(("", 8080), Handler).serve_forever()
          ports:
            - containerPort: 8080
---
apiVersion: v1
kind: Service
metadata:
  name: hang-git
spec:
  selector:
    app: hang-git
  ports:
    - name: http
      port: 8080
      targetPort: 8080
YAML
  "${KUBECTL}" -n "${LOAD_NAMESPACE}" rollout status deployment hang-git --timeout=180s
elif [[ "${MODE}" == "cluster" ]]; then
  # The Task that the cluster-resolver PipelineRuns reference by name.
  log "creating cluster resolver target Task"
  "${KUBECTL}" -n "${LOAD_NAMESPACE}" apply -f - <<'YAML' >/dev/null
apiVersion: tekton.dev/v1
kind: Task
metadata:
  name: leak-repro-noop
spec:
  steps:
    - name: nop
      image: registry.access.redhat.com/ubi9/ubi-minimal:latest
      script: |
        #!/usr/bin/env sh
        true
YAML
  if [[ -z "${CHAOS_MESH_APISERVER_DELAY}" ]]; then
    log "cluster mode requires external API latency injection; without it, requests usually resolve before timeout"
  fi
fi
| start_port_forward() { | |
| log "starting pprof port-forward localhost:${PPROF_LOCAL_PORT} -> ${RESOLVERS_DEPLOYMENT}:${PPROF_REMOTE_PORT}" | |
| "${KUBECTL}" -n "${RESOLVERS_NAMESPACE}" port-forward "deployment/${RESOLVERS_DEPLOYMENT}" \ | |
| "${PPROF_LOCAL_PORT}:${PPROF_REMOTE_PORT}" >"${OUT_DIR}/port-forward.log" 2>&1 & | |
| PORT_FORWARD_PID="$!" | |
| for _ in $(seq 1 30); do | |
| if curl -fsS "http://127.0.0.1:${PPROF_LOCAL_PORT}/debug/pprof/goroutine?debug=1" >/dev/null 2>&1; then | |
| return 0 | |
| fi | |
| sleep 1 | |
| done | |
| echo "pprof endpoint did not become reachable; see ${OUT_DIR}/port-forward.log" >&2 | |
| exit 1 | |
| } | |
#######################################
# Periodically download the goroutine profile and record its total count.
# Globals:   reads OUT_DIR, PPROF_LOCAL_PORT, SAMPLE_SECONDS,
#            SAMPLE_INTERVAL_SECONDS.
# Outputs:   writes ${OUT_DIR}/goroutine-<elapsed>s.txt per sample and
#            appends "<elapsed>\t<count>" rows to goroutine-counts.tsv;
#            each row is also echoed to stdout.
# Returns:   2 when SAMPLE_INTERVAL_SECONDS is not a positive integer.
#######################################
sample_goroutines() {
  # A zero interval would never advance `elapsed` (endless loop), and a
  # fractional or zero-padded one breaks or skews the integer arithmetic.
  if ! [[ "${SAMPLE_INTERVAL_SECONDS}" =~ ^[1-9][0-9]*$ ]]; then
    echo "--sample-interval-seconds must be a positive integer: ${SAMPLE_INTERVAL_SECONDS}" >&2
    return 2
  fi

  local elapsed=0
  local sample_file="${OUT_DIR}/goroutine-counts.tsv"
  local profile count
  printf 'elapsed_seconds\tgoroutines\n' >"${sample_file}"

  while [[ "${elapsed}" -le "${SAMPLE_SECONDS}" ]]; do
    profile="${OUT_DIR}/goroutine-${elapsed}s.txt"
    curl -fsS "http://127.0.0.1:${PPROF_LOCAL_PORT}/debug/pprof/goroutine?debug=1" -o "${profile}"
    # The debug=1 profile starts with "goroutine profile: total <N>".
    count="$(awk '/^goroutine profile: total / {print $4; exit}' "${profile}")"
    printf '%s\t%s\n' "${elapsed}" "${count:-unknown}" | tee -a "${sample_file}"

    elapsed=$((elapsed + SAMPLE_INTERVAL_SECONDS))
    # Sleep only when another sample follows; there is nothing to wait for
    # after the last one.
    if [[ "${elapsed}" -le "${SAMPLE_SECONDS}" ]]; then
      sleep "${SAMPLE_INTERVAL_SECONDS}"
    fi
  done
}
#######################################
# Create one batch of PipelineRuns for the selected workload mode.
# Globals:   reads MODE, LOAD_NAMESPACE, KUBECTL.
# Arguments: $1 - first run index of the batch (inclusive)
#            $2 - last run index of the batch (inclusive)
# Outputs:   nothing on stdout; kubectl errors go to stderr.
# Returns:   the status of the kubectl pipeline.
#######################################
create_batch() {
  local start="$1"
  local end="$2"
  local i
  {
    for ((i = start; i <= end; i++)); do
      if [[ "${MODE}" == "git-timeout" ]]; then
        cat <<YAML
apiVersion: tekton.dev/v1
kind: PipelineRun
metadata:
  generateName: leak-repro-git-${i}-
  namespace: ${LOAD_NAMESPACE}
  labels:
    leak-repro.tekton.dev/run: "true"
    leak-repro.tekton.dev/mode: git-timeout
spec:
  pipelineSpec:
    tasks:
      - name: resolve-slow-git-task
        taskRef:
          resolver: git
          params:
            - name: url
              value: http://hang-git.${LOAD_NAMESPACE}.svc.cluster.local:8080/repo.git
            - name: revision
              value: main
            - name: pathInRepo
              value: task/noop/0.1/noop.yaml
            - name: cache
              value: never
---
YAML
      else
        cat <<YAML
apiVersion: tekton.dev/v1
kind: PipelineRun
metadata:
  generateName: leak-repro-cluster-${i}-
  namespace: ${LOAD_NAMESPACE}
  labels:
    leak-repro.tekton.dev/run: "true"
    leak-repro.tekton.dev/mode: cluster
spec:
  pipelineSpec:
    tasks:
      - name: resolve-cluster-task
        taskRef:
          resolver: cluster
          params:
            - name: kind
              value: task
            - name: namespace
              value: ${LOAD_NAMESPACE}
            - name: name
              value: leak-repro-noop
            - name: cache
              value: never
---
YAML
      fi
    done
    # These manifests carry metadata.generateName and no metadata.name, which
    # "kubectl apply" rejects ("cannot use generate name with apply");
    # "kubectl create" lets the API server generate the final names.
  } | "${KUBECTL}" create -f - >/dev/null
}
# Open the pprof tunnel before generating load.
start_port_forward

# Submit the PipelineRuns in batches, clamping the last batch to TOTAL and
# pausing after each batch.
log "creating ${TOTAL} PipelineRuns in batches of ${BATCH_SIZE}"
created=0
while ((created < TOTAL)); do
  batch_first=$((created + 1))
  batch_last=$((created + BATCH_SIZE))
  if ((batch_last > TOTAL)); then
    batch_last="${TOTAL}"
  fi
  create_batch "${batch_first}" "${batch_last}"
  created="${batch_last}"
  log "created ${created}/${TOTAL}"
  sleep "${BATCH_SLEEP_SECONDS}"
done

# Watch the resolver's goroutine count for the configured window.
log "sampling goroutine profile for ${SAMPLE_SECONDS}s"
sample_goroutines

log "done. Goroutine counts: ${OUT_DIR}/goroutine-counts.tsv"
log "inspect blocked senders with: grep -R \"chan send\" ${OUT_DIR}/goroutine-*.txt | head"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment