Script for comparing two JMH benchmarks
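To compare two runs, export each run's results as a JSON report (the standard JMH options for this are -rf json -rff <file>; this is an assumption about your benchmark setup rather than anything the gist configures), then invoke the script with the baseline report first, mirroring its own usage message: python script.py baseline.json comparison.json. Benchmarks are matched on the combination of benchmark name and the sourceType parameter, so both reports should come from the same benchmark suite.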
import json
import statistics
from dataclasses import dataclass
from typing import Tuple


@dataclass
class BenchmarkResult:
    """Represents a single benchmark result."""
    benchmark: str
    source_type: str
    score: float
    score_error: float
    confidence_lower: float
    confidence_upper: float
    raw_data: list

    @classmethod
    def from_jmh_result(cls, result: dict):
        """Parse a JMH result dictionary."""
        # rawData might be in different locations depending on JMH version
        raw_data = result['primaryMetric'].get('rawData')
        if raw_data is None:
            raw_data = result['primaryMetric'].get('rawDataHistogram', [])
        return cls(
            benchmark=result['benchmark'],
            source_type=result['params']['sourceType'],
            score=result['primaryMetric']['score'],
            score_error=result['primaryMetric']['scoreError'],
            confidence_lower=result['primaryMetric']['scoreConfidence'][0],
            confidence_upper=result['primaryMetric']['scoreConfidence'][1],
            raw_data=raw_data
        )
def welch_t_test(result1: BenchmarkResult,
                 result2: BenchmarkResult) -> Tuple[float, bool]:
    """
    Perform Welch's t-test for unequal variances.

    Returns (t_statistic, is_significant_at_p05).
    """
    # Flatten raw data - handle both rawData and rawDataHistogram formats
    def flatten_data(raw_data):
        values = []
        for fork in raw_data:
            for iteration in fork:
                if isinstance(iteration, list):
                    # rawDataHistogram format: list of [value, count] pairs
                    for item in iteration:
                        if isinstance(item, list) and len(item) >= 2:
                            value, count = item[0], item[1]
                            values.extend([value] * int(count))
                        else:
                            values.append(item)
                else:
                    values.append(iteration)
        return values

    data1 = flatten_data(result1.raw_data)
    data2 = flatten_data(result2.raw_data)

    if len(data1) < 2 or len(data2) < 2:
        # Not enough data for a statistical test
        return 0.0, False

    n1, n2 = len(data1), len(data2)
    mean1, mean2 = statistics.mean(data1), statistics.mean(data2)
    var1 = statistics.variance(data1) if n1 > 1 else 0
    var2 = statistics.variance(data2) if n2 > 1 else 0

    # Welch's t-statistic: t = (mean1 - mean2) / sqrt(var1/n1 + var2/n2)
    if var1 + var2 == 0:
        return 0.0, False
    t_stat = (mean1 - mean2) / ((var1 / n1 + var2 / n2) ** 0.5)

    # Degrees of freedom (Welch-Satterthwaite equation)
    if var1 == 0 or var2 == 0:
        df = n1 + n2 - 2
    else:
        numerator = (var1 / n1 + var2 / n2) ** 2
        denominator = (var1 / n1) ** 2 / (n1 - 1) + (var2 / n2) ** 2 / (n2 - 1)
        df = numerator / denominator

    # Critical value for a two-tailed test at p = 0.05: use the normal
    # approximation (1.96) for df > 30, and a rough linear correction towards
    # the heavier-tailed t distribution for smaller df.
    if df > 30:
        critical_value = 1.96
    else:
        critical_value = 2.0 + (30 - df) * 0.01

    is_significant = abs(t_stat) > critical_value
    return t_stat, is_significant
def percentage_change(result1: BenchmarkResult,
                      result2: BenchmarkResult) -> float:
    """Calculate percentage change (negative = improvement for time-based metrics)."""
    return ((result2.score - result1.score) / result1.score) * 100
def compare_benchmarks(baseline_file: str,
                       comparison_file: str) -> dict:
    """
    Compare two JMH benchmark reports.

    Args:
        baseline_file: Path to baseline JSON report
        comparison_file: Path to comparison JSON report

    Returns:
        Dictionary with comparison results
    """
    with open(baseline_file) as f:
        baseline_data = json.load(f)
    with open(comparison_file) as f:
        comparison_data = json.load(f)

    # Create lookup dictionaries keyed by (benchmark, sourceType)
    baseline_results = {}
    for result in baseline_data:
        r = BenchmarkResult.from_jmh_result(result)
        key = (r.benchmark, r.source_type)
        baseline_results[key] = r

    comparison_results = {}
    for result in comparison_data:
        r = BenchmarkResult.from_jmh_result(result)
        key = (r.benchmark, r.source_type)
        comparison_results[key] = r

    # Compare matching benchmarks
    comparisons = []
    for key in baseline_results:
        if key not in comparison_results:
            continue
        baseline = baseline_results[key]
        comparison = comparison_results[key]

        t_stat, is_significant = welch_t_test(baseline, comparison)
        pct_change = percentage_change(baseline, comparison)

        # Determine verdict
        if is_significant:
            if pct_change < 0:
                verdict = "IMPROVEMENT"
            else:
                verdict = "REGRESSION"
        else:
            verdict = "NO CHANGE"

        comparisons.append({
            'benchmark': key[0],
            'source_type': key[1],
            'baseline_score': baseline.score,
            'comparison_score': comparison.score,
            'percentage_change': pct_change,
            't_statistic': t_stat,
            'statistically_significant': is_significant,
            'verdict': verdict
        })

    return {
        'comparisons': comparisons,
        'summary': summarize_comparisons(comparisons)
    }
def summarize_comparisons(comparisons: list) -> dict:
    """Generate summary statistics."""
    improvements = sum(1 for c in comparisons if c['verdict'] == 'IMPROVEMENT')
    regressions = sum(1 for c in comparisons if c['verdict'] == 'REGRESSION')
    no_change = sum(1 for c in comparisons if c['verdict'] == 'NO CHANGE')
    return {
        'total_benchmarks': len(comparisons),
        'improvements': improvements,
        'regressions': regressions,
        'no_change': no_change
    }
def print_comparison_report(results: dict):
    """Print a formatted comparison report."""
    print("JMH Benchmark Comparison Report")
    print("=" * 80)
    print()
    for comp in results['comparisons']:
        print(f"Benchmark: {comp['benchmark']}")
        print(f"Source Type: {comp['source_type']}")
        # Units are hard-coded as ms/op; adjust if your benchmarks report a
        # different score unit.
        print(f"Baseline: {comp['baseline_score']:.2f} ms/op")
        print(f"Comparison: {comp['comparison_score']:.2f} ms/op")
        print(f"Change: {comp['percentage_change']:+.2f}%")
        print(f"Statistically Significant: {comp['statistically_significant']}")
        print(f"Verdict: {comp['verdict']}")
        print("-" * 80)
    print()

    summary = results['summary']
    print("Summary:")
    print(f"  Total Benchmarks: {summary['total_benchmarks']}")
    print(f"  Improvements: {summary['improvements']}")
    print(f"  Regressions: {summary['regressions']}")
    print(f"  No Change: {summary['no_change']}")
# Example usage
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 3:
        print("Usage: python script.py <baseline.json> <comparison.json>")
        print()
        print("Compare two JMH benchmark reports and determine statistical significance.")
        print()
        print("Arguments:")
        print("  baseline.json   - Path to the baseline benchmark report")
        print("  comparison.json - Path to the comparison benchmark report")
        sys.exit(1)

    baseline_file = sys.argv[1]
    comparison_file = sys.argv[2]

    try:
        results = compare_benchmarks(baseline_file, comparison_file)
        print_comparison_report(results)
    except FileNotFoundError as e:
        print(f"Error: Could not find file - {e}")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format - {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)
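For reference, the comparison only reads a handful of fields from each JMH result entry. The sketch below is not part of the script; it shows the minimal input shape that BenchmarkResult.from_jmh_result expects, with the benchmark name, the sourceType value, and all numbers as illustrative placeholders.

# Minimal sketch of the expected input shape (placeholders only; field names
# are the ones accessed in BenchmarkResult.from_jmh_result).
example_report = [
    {
        "benchmark": "com.example.MyBenchmark.run",   # hypothetical benchmark name
        "params": {"sourceType": "exampleSource"},    # hypothetical parameter value
        "primaryMetric": {
            "score": 0.0,
            "scoreError": 0.0,
            "scoreConfidence": [0.0, 0.0],
            "rawData": [[0.0, 0.0], [0.0, 0.0]]       # one inner list of iteration values per fork
        }
    }
]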