Last active
December 14, 2025 04:09
-
-
Save DiTo97/22bd1c1341a827d9360a3644a266d798 to your computer and use it in GitHub Desktop.
efficient and reliable sampling methodologies for pass@k and pass^k evaluation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # requires-python = ">=3.12" | |
| # dependencies = [ | |
| # "scipy>1,<2", | |
| # ] | |
| # /// | |
| import math | |
| import random | |
| from dataclasses import dataclass, field | |
| from enum import Enum | |
| from typing import Any, Callable | |
| import numpy as np | |
| import scipy.optimize as optimize | |
| import scipy.special as special | |
| # ============================================================================== | |
| # 1. Data Structures | |
| # ============================================================================== | |
| @dataclass | |
| class Problem: | |
| """Represents a single problem to be evaluated.""" | |
| problem_id: str | |
| input_state: dict[str, Any] | |
| prompt_template: str | |
| output_state: dict[str, Any] | |
| tests: list[Callable[[Any], bool]] | |
| def get_prompt(self) -> str: | |
| """Generate the full prompt from template and input state.""" | |
| return self.prompt_template.format(**self.input_state) | |
| @dataclass | |
| class ProblemStats: | |
| """Tracks sampling statistics for a single problem.""" | |
| problem_id: str | |
| successes: int = 0 | |
| attempts: int = 0 | |
| @dataclass | |
| class ProblemLevelMetrics: | |
| """Metrics computed for a single problem.""" | |
| problem_id: str | |
| attempts: int | |
| successes: int | |
| pass_at_k: dict[int, float] = field(default_factory=dict) | |
| pass_power_k: dict[int, float] = field(default_factory=dict) | |
| @dataclass | |
| class DatasetLevelMetrics: | |
| """Aggregated metrics across the entire dataset.""" | |
| num_problems: int | |
| total_attempts: int | |
| pass_at_k: dict[int, float] = field(default_factory=dict) | |
| pass_power_k: dict[int, float] = field(default_factory=dict) | |
| @dataclass | |
| class EvaluationResult: | |
| """Complete evaluation results.""" | |
| problem_metrics: list[ProblemLevelMetrics] | |
| dataset_metrics: DatasetLevelMetrics | |
| beta_binomial_params: dict[str, float] | None = None | |
| class SamplingStrategy(Enum): | |
| """Sampling strategy for selecting problems to evaluate.""" | |
| uniform = "uniform" | |
| dynamic = "dynamic" | |
| # ============================================================================== | |
| # 2. Evaluation Function | |
| # ============================================================================== | |
| def evaluate_solution(solution: Any, problem: Problem) -> bool: | |
| """ | |
| Evaluates a generated solution against problem tests. | |
| Args: | |
| solution: The generated solution (could be code, text, etc.) | |
| problem: The Problem object containing test cases | |
| Returns: | |
| bool: True if solution passes all tests, False otherwise | |
| """ | |
| try: | |
| for test in problem.tests: | |
| if not test(solution): | |
| return False | |
| return True | |
| except Exception: | |
| return False | |
| # ============================================================================== | |
| # 3. Sampling Strategies | |
| # ============================================================================== | |
| def uniform_sampling( | |
| problems: list[Problem], | |
| generator_func: Callable[[Problem], Any], | |
| budget: int | |
| ) -> dict[str, ProblemStats]: | |
| """ | |
| Uniform sampling: randomly select problems with equal probability. | |
| Args: | |
| problems: List of Problem objects | |
| generator_func: Function that generates a solution for a problem | |
| budget: Total number of samples to generate | |
| Returns: | |
| Dictionary mapping problem_id to ProblemStats | |
| """ | |
| stats: dict[str, ProblemStats] = { | |
| p.problem_id: ProblemStats(problem_id=p.problem_id) | |
| for p in problems | |
| } | |
| for _ in range(budget): | |
| # Randomly select a problem | |
| problem = random.choice(problems) | |
| # Generate and evaluate solution | |
| solution = generator_func(problem) | |
| is_correct = evaluate_solution(solution, problem) | |
| # Update stats | |
| stats[problem.problem_id].attempts += 1 | |
| if is_correct: | |
| stats[problem.problem_id].successes += 1 | |
| return stats | |
| def dynamic_sampling( | |
| problems: list[Problem], | |
| generator_func: Callable[[Problem], Any], | |
| budget: int | |
| ) -> dict[str, ProblemStats]: | |
| """ | |
| Dynamic sampling: select hardest problems (min successes, then min attempts). | |
| see Algorithm 2 from https://arxiv.org/abs/2510.05197 | |
| Args: | |
| problems: List of Problem objects | |
| generator_func: Function that generates a solution for a problem | |
| budget: Total number of samples to generate | |
| Returns: | |
| Dictionary mapping problem_id to ProblemStats | |
| """ | |
| stats: dict[str, ProblemStats] = { | |
| p.problem_id: ProblemStats(problem_id=p.problem_id) | |
| for p in problems | |
| } | |
| problem_map = {p.problem_id: p for p in problems} | |
| for _ in range(budget): | |
| # Find minimum number of successes | |
| min_successes = min(s.successes for s in stats.values()) | |
| # Get all problems with minimum successes | |
| candidates = [ | |
| pid for pid, s in stats.items() | |
| if s.successes == min_successes | |
| ] | |
| # Among candidates, find minimum attempts | |
| min_attempts = min(stats[pid].attempts for pid in candidates) | |
| # Get final candidates with min attempts | |
| final_candidates = [ | |
| pid for pid in candidates | |
| if stats[pid].attempts == min_attempts | |
| ] | |
| # Randomly select one if multiple ties | |
| selected_id = random.choice(final_candidates) | |
| problem = problem_map[selected_id] | |
| # Generate and evaluate solution | |
| solution = generator_func(problem) | |
| is_correct = evaluate_solution(solution, problem) | |
| # Update stats | |
| stats[selected_id].attempts += 1 | |
| if is_correct: | |
| stats[selected_id].successes += 1 | |
| return stats | |
| # ============================================================================== | |
| # 4. Beta-Binomial Parameter Estimation | |
| # ============================================================================== | |
| def fit_beta_binomial( | |
| successes: np.ndarray, | |
| attempts: np.ndarray | |
| ) -> dict[str, float]: | |
| """ | |
| Fit Beta-Binomial distribution parameters using MLE. | |
| see https://arxiv.org/abs/2510.05197 | |
| Args: | |
| successes: Array of success counts for each problem | |
| attempts: Array of attempt counts for each problem | |
| Returns: | |
| Dictionary with 'alpha' and 'beta' parameters | |
| """ | |
| def neg_log_likelihood(params: tuple[float, float]) -> float: | |
| alpha, beta = params | |
| if alpha <= 0 or beta <= 0: | |
| return 1e9 | |
| # Compute log-likelihood using log-gamma functions | |
| term1 = ( | |
| special.gammaln(successes + alpha) + | |
| special.gammaln(attempts - successes + beta) - | |
| special.gammaln(attempts + alpha + beta) | |
| ) | |
| term2 = ( | |
| special.gammaln(alpha) + | |
| special.gammaln(beta) - | |
| special.gammaln(alpha + beta) | |
| ) | |
| log_L = np.sum(term1 - term2) | |
| return -log_L | |
| # Initial guess | |
| initial_guess = [1.0, 1.0] | |
| # Optimize | |
| result = optimize.minimize( | |
| neg_log_likelihood, | |
| initial_guess, | |
| bounds=((1e-5, None), (1e-5, None)), | |
| method='L-BFGS-B' | |
| ) | |
| return {'alpha': result.x[0], 'beta': result.x[1]} | |
| # ============================================================================== | |
| # 5. Pass@k Estimation | |
| # ============================================================================== | |
| def pass_at_k_unbiased(n: int, c: int, k: int) -> float: | |
| """ | |
| Unbiased estimator for pass@k when k <= n. | |
| see https://www.philschmid.de/agents-pass-at-k-pass-power-k | |
| Args: | |
| n: Total number of attempts | |
| c: Number of successes | |
| k: The k in pass@k | |
| Returns: | |
| Estimated pass@k value | |
| """ | |
| if n < k: | |
| raise ValueError(f"n ({n}) must be >= k ({k})") | |
| if n - c < k: | |
| return 1.0 | |
| log_comb_n_k = math.log(math.comb(n, k)) | |
| log_comb_n_minus_c_k = math.log(math.comb(n - c, k)) | |
| return 1.0 - math.exp(log_comb_n_minus_c_k - log_comb_n_k) | |
| def pass_at_k_beta_binomial( | |
| successes: int, | |
| attempts: int, | |
| k: int, | |
| alpha: float, | |
| beta: float | |
| ) -> float: | |
| """ | |
| Beta-Binomial estimator for pass@k when k > n. | |
| Uses the posterior predictive distribution. | |
| see https://arxiv.org/abs/2510.05197 | |
| Args: | |
| successes: Number of successes for this problem | |
| attempts: Number of attempts for this problem | |
| k: The k in pass@k | |
| alpha: Beta distribution alpha parameter (from MLE) | |
| beta: Beta distribution beta parameter (from MLE) | |
| Returns: | |
| Estimated pass@k value | |
| """ | |
| # Posterior parameters | |
| alpha_post = alpha + successes | |
| beta_post = beta + attempts - successes | |
| # Compute E[(1-p)^k] using product formula in log-space | |
| log_prob_fail = 0.0 | |
| for j in range(k): | |
| log_prob_fail += ( | |
| np.log(beta_post + j) - | |
| np.log(alpha_post + beta_post + j) | |
| ) | |
| prob_fail = np.exp(log_prob_fail) | |
| return 1.0 - prob_fail | |
| def pass_power_k(n: int, c: int, k: int) -> float: | |
| """ | |
| Compute pass^k metric. | |
| see https://www.philschmid.de/agents-pass-at-k-pass-power-k | |
| Args: | |
| n: Total number of attempts | |
| c: Number of successes | |
| k: The k in pass^k | |
| Returns: | |
| Estimated pass^k value | |
| """ | |
| if n == 0: | |
| return 0.0 | |
| p = c / n | |
| return p ** k | |
| # ============================================================================== | |
| # 6. Main Evaluation Pipeline | |
| # ============================================================================== | |
| def evaluate_dataset( | |
| problems: list[Problem], | |
| generator_func: Callable[[Problem], Any], | |
| k_values: list[int], | |
| budget: int, | |
| sampling_strategy: SamplingStrategy = SamplingStrategy.uniform | |
| ) -> EvaluationResult: | |
| """ | |
| Evaluate a dataset of problems with a given budget. | |
| Args: | |
| problems: List of Problem objects | |
| generator_func: Function to generate solutions | |
| k_values: List of k values to compute metrics for | |
| budget: Total number of samples to generate | |
| sampling_strategy: Which sampling strategy to use | |
| Returns: | |
| EvaluationResult with problem-level and dataset-level metrics | |
| """ | |
| # Step 1: Collect samples using chosen strategy | |
| if sampling_strategy == SamplingStrategy.uniform: | |
| stats = uniform_sampling(problems, generator_func, budget) | |
| else: # DYNAMIC | |
| stats = dynamic_sampling(problems, generator_func, budget) | |
| # Step 2: Fit Beta-Binomial parameters on whole dataset | |
| successes_array = np.array([stats[p.problem_id].successes for p in problems]) | |
| attempts_array = np.array([stats[p.problem_id].attempts for p in problems]) | |
| beta_binomial_params = fit_beta_binomial(successes_array, attempts_array) | |
| alpha, beta = beta_binomial_params['alpha'], beta_binomial_params['beta'] | |
| # Step 3: Compute metrics for each problem | |
| problem_metrics: list[ProblemLevelMetrics] = [] | |
| for problem in problems: | |
| stat = stats[problem.problem_id] | |
| metrics = ProblemLevelMetrics( | |
| problem_id=problem.problem_id, | |
| attempts=stat.attempts, | |
| successes=stat.successes | |
| ) | |
| for k in k_values: | |
| # pass@k: use unbiased if k <= attempts, else Beta-Binomial | |
| if k <= stat.attempts and stat.attempts > 0: | |
| metrics.pass_at_k[k] = pass_at_k_unbiased( | |
| stat.attempts, stat.successes, k | |
| ) | |
| else: | |
| metrics.pass_at_k[k] = pass_at_k_beta_binomial( | |
| stat.successes, stat.attempts, k, alpha, beta | |
| ) | |
| # pass^k: always use the same formula | |
| metrics.pass_power_k[k] = pass_power_k( | |
| stat.attempts, stat.successes, k | |
| ) | |
| problem_metrics.append(metrics) | |
| # Step 4: Aggregate to dataset level | |
| dataset_metrics = DatasetLevelMetrics( | |
| num_problems=len(problems), | |
| total_attempts=sum(s.attempts for s in stats.values()) | |
| ) | |
| for k in k_values: | |
| # Average over all problems | |
| dataset_metrics.pass_at_k[k] = np.mean([ | |
| m.pass_at_k[k] for m in problem_metrics | |
| ]) | |
| dataset_metrics.pass_power_k[k] = np.mean([ | |
| m.pass_power_k[k] for m in problem_metrics | |
| ]) | |
| return EvaluationResult( | |
| problem_metrics=problem_metrics, | |
| dataset_metrics=dataset_metrics, | |
| beta_binomial_params=beta_binomial_params | |
| ) | |
| # ============================================================================== | |
| # 7. Example Usage | |
| # ============================================================================== | |
| if __name__ == '__main__': | |
| # Define mock problems with the new structure | |
| def test_length(solution: str) -> bool: | |
| return len(solution) > 50 | |
| def test_contains_return(solution: str) -> bool: | |
| return 'return' in solution | |
| mock_problems = [ | |
| Problem( | |
| problem_id='prob_001', | |
| input_state={'operation': 'add'}, | |
| prompt_template='Write a function that {operation}s two numbers.', | |
| output_state={}, | |
| tests=[test_length, test_contains_return] | |
| ), | |
| Problem( | |
| problem_id='prob_002', | |
| input_state={'operation': 'reverse'}, | |
| prompt_template='Write a function that {operation}s a string.', | |
| output_state={}, | |
| tests=[test_length, test_contains_return] | |
| ), | |
| Problem( | |
| problem_id='prob_003', | |
| input_state={'operation': 'sort'}, | |
| prompt_template='Write a function that {operation}s a list.', | |
| output_state={}, | |
| tests=[test_length, test_contains_return] | |
| ), | |
| ] | |
| # Mock generator: 60% success rate | |
| def mock_generator(problem: Problem) -> str: | |
| if np.random.rand() < 0.6: | |
| return f"def solution():\n return 'correct implementation'" + "A" * 50 | |
| else: | |
| return "def solution():\n pass" | |
| # Evaluation parameters | |
| k_values_to_test = [1, 5, 10, 20] | |
| budget = 100 | |
| # Run with uniform sampling | |
| print("=" * 70) | |
| print("UNIFORM SAMPLING EVALUATION") | |
| print("=" * 70) | |
| uniform_results = evaluate_dataset( | |
| problems=mock_problems, | |
| generator_func=mock_generator, | |
| k_values=k_values_to_test, | |
| budget=budget, | |
| sampling_strategy=SamplingStrategy.uniform | |
| ) | |
| print(f"\n--- Dataset-Level Metrics ---") | |
| print(f"Total problems: {uniform_results.dataset_metrics.num_problems}") | |
| print(f"Total attempts: {uniform_results.dataset_metrics.total_attempts}") | |
| print(f"Beta-Binomial params: α={uniform_results.beta_binomial_params['alpha']:.3f}, " | |
| f"β={uniform_results.beta_binomial_params['beta']:.3f}") | |
| print(f"\npass@k metrics:") | |
| for k, val in uniform_results.dataset_metrics.pass_at_k.items(): | |
| print(f" pass@{k}: {val:.4f}") | |
| print(f"\npass^k metrics:") | |
| for k, val in uniform_results.dataset_metrics.pass_power_k.items(): | |
| print(f" pass^{k}: {val:.4f}") | |
| print(f"\n--- Problem-Level Metrics ---") | |
| for pm in uniform_results.problem_metrics: | |
| print(f"\nProblem {pm.problem_id}:") | |
| print(f" Attempts: {pm.attempts}, Successes: {pm.successes}") | |
| print(f" pass@k: {pm.pass_at_k}") | |
| print(f" pass^k: {pm.pass_power_k}") | |
| # Run with dynamic sampling | |
| print("\n" + "=" * 70) | |
| print("DYNAMIC SAMPLING EVALUATION") | |
| print("=" * 70) | |
| dynamic_results = evaluate_dataset( | |
| problems=mock_problems, | |
| generator_func=mock_generator, | |
| k_values=k_values_to_test, | |
| budget=budget, | |
| sampling_strategy=SamplingStrategy.dynamic | |
| ) | |
| print(f"\n--- Dataset-Level Metrics ---") | |
| print(f"Total problems: {dynamic_results.dataset_metrics.num_problems}") | |
| print(f"Total attempts: {dynamic_results.dataset_metrics.total_attempts}") | |
| print(f"Beta-Binomial params: α={dynamic_results.beta_binomial_params['alpha']:.3f}, " | |
| f"β={dynamic_results.beta_binomial_params['beta']:.3f}") | |
| print(f"\npass@k metrics:") | |
| for k, val in dynamic_results.dataset_metrics.pass_at_k.items(): | |
| print(f" pass@{k}: {val:.4f}") | |
| print(f"\npass^k metrics:") | |
| for k, val in dynamic_results.dataset_metrics.pass_power_k.items(): | |
| print(f" pass^{k}: {val:.4f}") | |
| print(f"\n--- Problem-Level Metrics ---") | |
| for pm in dynamic_results.problem_metrics: | |
| print(f"\nProblem {pm.problem_id}:") | |
| print(f" Attempts: {pm.attempts}, Successes: {pm.successes}") | |
| print(f" pass@k: {pm.pass_at_k}") | |
| print(f" pass^k: {pm.pass_power_k}") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment