Skip to content

Instantly share code, notes, and snippets.

@DiTo97
Last active December 14, 2025 04:09
Show Gist options
  • Select an option

  • Save DiTo97/22bd1c1341a827d9360a3644a266d798 to your computer and use it in GitHub Desktop.

Select an option

Save DiTo97/22bd1c1341a827d9360a3644a266d798 to your computer and use it in GitHub Desktop.
efficient and reliable sampling methodologies for pass@k and pass^k evaluation
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "scipy>1,<2",
# ]
# ///
import math
import random
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Callable
import numpy as np
import scipy.optimize as optimize
import scipy.special as special
# ==============================================================================
# 1. Data Structures
# ==============================================================================
@dataclass
class Problem:
"""Represents a single problem to be evaluated."""
problem_id: str
input_state: dict[str, Any]
prompt_template: str
output_state: dict[str, Any]
tests: list[Callable[[Any], bool]]
def get_prompt(self) -> str:
"""Generate the full prompt from template and input state."""
return self.prompt_template.format(**self.input_state)
@dataclass
class ProblemStats:
"""Tracks sampling statistics for a single problem."""
problem_id: str
successes: int = 0
attempts: int = 0
@dataclass
class ProblemLevelMetrics:
"""Metrics computed for a single problem."""
problem_id: str
attempts: int
successes: int
pass_at_k: dict[int, float] = field(default_factory=dict)
pass_power_k: dict[int, float] = field(default_factory=dict)
@dataclass
class DatasetLevelMetrics:
"""Aggregated metrics across the entire dataset."""
num_problems: int
total_attempts: int
pass_at_k: dict[int, float] = field(default_factory=dict)
pass_power_k: dict[int, float] = field(default_factory=dict)
@dataclass
class EvaluationResult:
"""Complete evaluation results."""
problem_metrics: list[ProblemLevelMetrics]
dataset_metrics: DatasetLevelMetrics
beta_binomial_params: dict[str, float] | None = None
class SamplingStrategy(Enum):
"""Sampling strategy for selecting problems to evaluate."""
uniform = "uniform"
dynamic = "dynamic"
# ==============================================================================
# 2. Evaluation Function
# ==============================================================================
def evaluate_solution(solution: Any, problem: Problem) -> bool:
"""
Evaluates a generated solution against problem tests.
Args:
solution: The generated solution (could be code, text, etc.)
problem: The Problem object containing test cases
Returns:
bool: True if solution passes all tests, False otherwise
"""
try:
for test in problem.tests:
if not test(solution):
return False
return True
except Exception:
return False
# ==============================================================================
# 3. Sampling Strategies
# ==============================================================================
def uniform_sampling(
problems: list[Problem],
generator_func: Callable[[Problem], Any],
budget: int
) -> dict[str, ProblemStats]:
"""
Uniform sampling: randomly select problems with equal probability.
Args:
problems: List of Problem objects
generator_func: Function that generates a solution for a problem
budget: Total number of samples to generate
Returns:
Dictionary mapping problem_id to ProblemStats
"""
stats: dict[str, ProblemStats] = {
p.problem_id: ProblemStats(problem_id=p.problem_id)
for p in problems
}
for _ in range(budget):
# Randomly select a problem
problem = random.choice(problems)
# Generate and evaluate solution
solution = generator_func(problem)
is_correct = evaluate_solution(solution, problem)
# Update stats
stats[problem.problem_id].attempts += 1
if is_correct:
stats[problem.problem_id].successes += 1
return stats
def dynamic_sampling(
problems: list[Problem],
generator_func: Callable[[Problem], Any],
budget: int
) -> dict[str, ProblemStats]:
"""
Dynamic sampling: select hardest problems (min successes, then min attempts).
see Algorithm 2 from https://arxiv.org/abs/2510.05197
Args:
problems: List of Problem objects
generator_func: Function that generates a solution for a problem
budget: Total number of samples to generate
Returns:
Dictionary mapping problem_id to ProblemStats
"""
stats: dict[str, ProblemStats] = {
p.problem_id: ProblemStats(problem_id=p.problem_id)
for p in problems
}
problem_map = {p.problem_id: p for p in problems}
for _ in range(budget):
# Find minimum number of successes
min_successes = min(s.successes for s in stats.values())
# Get all problems with minimum successes
candidates = [
pid for pid, s in stats.items()
if s.successes == min_successes
]
# Among candidates, find minimum attempts
min_attempts = min(stats[pid].attempts for pid in candidates)
# Get final candidates with min attempts
final_candidates = [
pid for pid in candidates
if stats[pid].attempts == min_attempts
]
# Randomly select one if multiple ties
selected_id = random.choice(final_candidates)
problem = problem_map[selected_id]
# Generate and evaluate solution
solution = generator_func(problem)
is_correct = evaluate_solution(solution, problem)
# Update stats
stats[selected_id].attempts += 1
if is_correct:
stats[selected_id].successes += 1
return stats
# ==============================================================================
# 4. Beta-Binomial Parameter Estimation
# ==============================================================================
def fit_beta_binomial(
successes: np.ndarray,
attempts: np.ndarray
) -> dict[str, float]:
"""
Fit Beta-Binomial distribution parameters using MLE.
see https://arxiv.org/abs/2510.05197
Args:
successes: Array of success counts for each problem
attempts: Array of attempt counts for each problem
Returns:
Dictionary with 'alpha' and 'beta' parameters
"""
def neg_log_likelihood(params: tuple[float, float]) -> float:
alpha, beta = params
if alpha <= 0 or beta <= 0:
return 1e9
# Compute log-likelihood using log-gamma functions
term1 = (
special.gammaln(successes + alpha) +
special.gammaln(attempts - successes + beta) -
special.gammaln(attempts + alpha + beta)
)
term2 = (
special.gammaln(alpha) +
special.gammaln(beta) -
special.gammaln(alpha + beta)
)
log_L = np.sum(term1 - term2)
return -log_L
# Initial guess
initial_guess = [1.0, 1.0]
# Optimize
result = optimize.minimize(
neg_log_likelihood,
initial_guess,
bounds=((1e-5, None), (1e-5, None)),
method='L-BFGS-B'
)
return {'alpha': result.x[0], 'beta': result.x[1]}
# ==============================================================================
# 5. Pass@k Estimation
# ==============================================================================
def pass_at_k_unbiased(n: int, c: int, k: int) -> float:
"""
Unbiased estimator for pass@k when k <= n.
see https://www.philschmid.de/agents-pass-at-k-pass-power-k
Args:
n: Total number of attempts
c: Number of successes
k: The k in pass@k
Returns:
Estimated pass@k value
"""
if n < k:
raise ValueError(f"n ({n}) must be >= k ({k})")
if n - c < k:
return 1.0
log_comb_n_k = math.log(math.comb(n, k))
log_comb_n_minus_c_k = math.log(math.comb(n - c, k))
return 1.0 - math.exp(log_comb_n_minus_c_k - log_comb_n_k)
def pass_at_k_beta_binomial(
successes: int,
attempts: int,
k: int,
alpha: float,
beta: float
) -> float:
"""
Beta-Binomial estimator for pass@k when k > n.
Uses the posterior predictive distribution.
see https://arxiv.org/abs/2510.05197
Args:
successes: Number of successes for this problem
attempts: Number of attempts for this problem
k: The k in pass@k
alpha: Beta distribution alpha parameter (from MLE)
beta: Beta distribution beta parameter (from MLE)
Returns:
Estimated pass@k value
"""
# Posterior parameters
alpha_post = alpha + successes
beta_post = beta + attempts - successes
# Compute E[(1-p)^k] using product formula in log-space
log_prob_fail = 0.0
for j in range(k):
log_prob_fail += (
np.log(beta_post + j) -
np.log(alpha_post + beta_post + j)
)
prob_fail = np.exp(log_prob_fail)
return 1.0 - prob_fail
def pass_power_k(n: int, c: int, k: int) -> float:
"""
Compute pass^k metric.
see https://www.philschmid.de/agents-pass-at-k-pass-power-k
Args:
n: Total number of attempts
c: Number of successes
k: The k in pass^k
Returns:
Estimated pass^k value
"""
if n == 0:
return 0.0
p = c / n
return p ** k
# ==============================================================================
# 6. Main Evaluation Pipeline
# ==============================================================================
def evaluate_dataset(
problems: list[Problem],
generator_func: Callable[[Problem], Any],
k_values: list[int],
budget: int,
sampling_strategy: SamplingStrategy = SamplingStrategy.uniform
) -> EvaluationResult:
"""
Evaluate a dataset of problems with a given budget.
Args:
problems: List of Problem objects
generator_func: Function to generate solutions
k_values: List of k values to compute metrics for
budget: Total number of samples to generate
sampling_strategy: Which sampling strategy to use
Returns:
EvaluationResult with problem-level and dataset-level metrics
"""
# Step 1: Collect samples using chosen strategy
if sampling_strategy == SamplingStrategy.uniform:
stats = uniform_sampling(problems, generator_func, budget)
else: # DYNAMIC
stats = dynamic_sampling(problems, generator_func, budget)
# Step 2: Fit Beta-Binomial parameters on whole dataset
successes_array = np.array([stats[p.problem_id].successes for p in problems])
attempts_array = np.array([stats[p.problem_id].attempts for p in problems])
beta_binomial_params = fit_beta_binomial(successes_array, attempts_array)
alpha, beta = beta_binomial_params['alpha'], beta_binomial_params['beta']
# Step 3: Compute metrics for each problem
problem_metrics: list[ProblemLevelMetrics] = []
for problem in problems:
stat = stats[problem.problem_id]
metrics = ProblemLevelMetrics(
problem_id=problem.problem_id,
attempts=stat.attempts,
successes=stat.successes
)
for k in k_values:
# pass@k: use unbiased if k <= attempts, else Beta-Binomial
if k <= stat.attempts and stat.attempts > 0:
metrics.pass_at_k[k] = pass_at_k_unbiased(
stat.attempts, stat.successes, k
)
else:
metrics.pass_at_k[k] = pass_at_k_beta_binomial(
stat.successes, stat.attempts, k, alpha, beta
)
# pass^k: always use the same formula
metrics.pass_power_k[k] = pass_power_k(
stat.attempts, stat.successes, k
)
problem_metrics.append(metrics)
# Step 4: Aggregate to dataset level
dataset_metrics = DatasetLevelMetrics(
num_problems=len(problems),
total_attempts=sum(s.attempts for s in stats.values())
)
for k in k_values:
# Average over all problems
dataset_metrics.pass_at_k[k] = np.mean([
m.pass_at_k[k] for m in problem_metrics
])
dataset_metrics.pass_power_k[k] = np.mean([
m.pass_power_k[k] for m in problem_metrics
])
return EvaluationResult(
problem_metrics=problem_metrics,
dataset_metrics=dataset_metrics,
beta_binomial_params=beta_binomial_params
)
# ==============================================================================
# 7. Example Usage
# ==============================================================================
if __name__ == '__main__':
# Define mock problems with the new structure
def test_length(solution: str) -> bool:
return len(solution) > 50
def test_contains_return(solution: str) -> bool:
return 'return' in solution
mock_problems = [
Problem(
problem_id='prob_001',
input_state={'operation': 'add'},
prompt_template='Write a function that {operation}s two numbers.',
output_state={},
tests=[test_length, test_contains_return]
),
Problem(
problem_id='prob_002',
input_state={'operation': 'reverse'},
prompt_template='Write a function that {operation}s a string.',
output_state={},
tests=[test_length, test_contains_return]
),
Problem(
problem_id='prob_003',
input_state={'operation': 'sort'},
prompt_template='Write a function that {operation}s a list.',
output_state={},
tests=[test_length, test_contains_return]
),
]
# Mock generator: 60% success rate
def mock_generator(problem: Problem) -> str:
if np.random.rand() < 0.6:
return f"def solution():\n return 'correct implementation'" + "A" * 50
else:
return "def solution():\n pass"
# Evaluation parameters
k_values_to_test = [1, 5, 10, 20]
budget = 100
# Run with uniform sampling
print("=" * 70)
print("UNIFORM SAMPLING EVALUATION")
print("=" * 70)
uniform_results = evaluate_dataset(
problems=mock_problems,
generator_func=mock_generator,
k_values=k_values_to_test,
budget=budget,
sampling_strategy=SamplingStrategy.uniform
)
print(f"\n--- Dataset-Level Metrics ---")
print(f"Total problems: {uniform_results.dataset_metrics.num_problems}")
print(f"Total attempts: {uniform_results.dataset_metrics.total_attempts}")
print(f"Beta-Binomial params: α={uniform_results.beta_binomial_params['alpha']:.3f}, "
f"β={uniform_results.beta_binomial_params['beta']:.3f}")
print(f"\npass@k metrics:")
for k, val in uniform_results.dataset_metrics.pass_at_k.items():
print(f" pass@{k}: {val:.4f}")
print(f"\npass^k metrics:")
for k, val in uniform_results.dataset_metrics.pass_power_k.items():
print(f" pass^{k}: {val:.4f}")
print(f"\n--- Problem-Level Metrics ---")
for pm in uniform_results.problem_metrics:
print(f"\nProblem {pm.problem_id}:")
print(f" Attempts: {pm.attempts}, Successes: {pm.successes}")
print(f" pass@k: {pm.pass_at_k}")
print(f" pass^k: {pm.pass_power_k}")
# Run with dynamic sampling
print("\n" + "=" * 70)
print("DYNAMIC SAMPLING EVALUATION")
print("=" * 70)
dynamic_results = evaluate_dataset(
problems=mock_problems,
generator_func=mock_generator,
k_values=k_values_to_test,
budget=budget,
sampling_strategy=SamplingStrategy.dynamic
)
print(f"\n--- Dataset-Level Metrics ---")
print(f"Total problems: {dynamic_results.dataset_metrics.num_problems}")
print(f"Total attempts: {dynamic_results.dataset_metrics.total_attempts}")
print(f"Beta-Binomial params: α={dynamic_results.beta_binomial_params['alpha']:.3f}, "
f"β={dynamic_results.beta_binomial_params['beta']:.3f}")
print(f"\npass@k metrics:")
for k, val in dynamic_results.dataset_metrics.pass_at_k.items():
print(f" pass@{k}: {val:.4f}")
print(f"\npass^k metrics:")
for k, val in dynamic_results.dataset_metrics.pass_power_k.items():
print(f" pass^{k}: {val:.4f}")
print(f"\n--- Problem-Level Metrics ---")
for pm in dynamic_results.problem_metrics:
print(f"\nProblem {pm.problem_id}:")
print(f" Attempts: {pm.attempts}, Successes: {pm.successes}")
print(f" pass@k: {pm.pass_at_k}")
print(f" pass^k: {pm.pass_power_k}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment