Script for comparing two JMH benchmarks
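# Usage sketch, assuming the two JMH runs were exported as JSON reports (for
# example via JMH's `-rf json -rff baseline.json` options):
#
#   python script.py baseline.json comparison.json
#
# Results are matched across the two reports on the (benchmark, sourceType)
# pair, so each result entry is expected to carry a `sourceType` parameter.
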
import json
import statistics
from dataclasses import dataclass
from typing import Tuple


@dataclass
class BenchmarkResult:
    """Represents a single benchmark result."""
    benchmark: str
    source_type: str
    score: float
    score_error: float
    confidence_lower: float
    confidence_upper: float
    raw_data: list

    @classmethod
    def from_jmh_result(cls, result: dict):
        """Parse a JMH result dictionary."""
        # rawData might be in different locations depending on JMH version
        raw_data = result['primaryMetric'].get('rawData')
        if raw_data is None:
            raw_data = result['primaryMetric'].get('rawDataHistogram', [])
        return cls(
            benchmark=result['benchmark'],
            source_type=result['params']['sourceType'],
            score=result['primaryMetric']['score'],
            score_error=result['primaryMetric']['scoreError'],
            confidence_lower=result['primaryMetric']['scoreConfidence'][0],
            confidence_upper=result['primaryMetric']['scoreConfidence'][1],
            raw_data=raw_data
        )
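

# Illustrative shape of one JMH result entry as consumed above (field names
# follow the accesses in from_jmh_result; the benchmark name, parameter value
# and numbers are made up):
#
#   {
#     "benchmark": "com.example.MyBenchmark.run",
#     "params": {"sourceType": "parquet"},
#     "primaryMetric": {
#       "score": 12.34,
#       "scoreError": 0.56,
#       "scoreConfidence": [11.78, 12.90],
#       "rawData": [[12.1, 12.5, 12.4], [12.3, 12.6, 12.2]]
#     }
#   }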


def welch_t_test(result1: BenchmarkResult,
                 result2: BenchmarkResult) -> Tuple[float, bool]:
    """
    Perform Welch's t-test for unequal variances.

    Returns (t_statistic, is_significant_at_p05).
    """
    # Flatten raw data - handle both rawData and rawDataHistogram formats
    def flatten_data(raw_data):
        values = []
        for fork in raw_data:
            for iteration in fork:
                if isinstance(iteration, list):
                    # rawDataHistogram format: list of [value, count] pairs
                    for item in iteration:
                        if isinstance(item, list) and len(item) >= 2:
                            value, count = item[0], item[1]
                            values.extend([value] * int(count))
                        else:
                            values.append(item)
                else:
                    values.append(iteration)
        return values

    data1 = flatten_data(result1.raw_data)
    data2 = flatten_data(result2.raw_data)

    if len(data1) < 2 or len(data2) < 2:
        # Not enough data for a statistical test
        return 0.0, False

    n1, n2 = len(data1), len(data2)
    mean1, mean2 = statistics.mean(data1), statistics.mean(data2)
    var1 = statistics.variance(data1) if n1 > 1 else 0
    var2 = statistics.variance(data2) if n2 > 1 else 0

    # Welch's t-statistic
    if var1 + var2 == 0:
        return 0.0, False
    t_stat = (mean1 - mean2) / ((var1 / n1 + var2 / n2) ** 0.5)

    # Degrees of freedom (Welch-Satterthwaite)
    if var1 == 0 or var2 == 0:
        df = n1 + n2 - 2
    else:
        numerator = (var1 / n1 + var2 / n2) ** 2
        denominator = (var1 / n1) ** 2 / (n1 - 1) + (var2 / n2) ** 2 / (n2 - 1)
        df = numerator / denominator
    # Critical value for a two-tailed test at p = 0.05.
    # Approximation: for df > 30 the t-distribution is close to normal, so use
    # 1.96; for smaller df, nudge the threshold upwards linearly.
    if df > 30:
        critical_value = 1.96
    else:
        critical_value = 2.0 + (30 - df) * 0.01
    is_significant = abs(t_stat) > critical_value
    return t_stat, is_significant
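

# If SciPy is available, the hand-rolled test above could be replaced with an
# exact Welch's t-test (a sketch, assuming `scipy` is installed; data1 and
# data2 stand for the flattened samples inside welch_t_test):
#
#   from scipy import stats
#   t_stat, p_value = stats.ttest_ind(data1, data2, equal_var=False)
#   is_significant = p_value < 0.05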


def percentage_change(result1: BenchmarkResult,
                      result2: BenchmarkResult) -> float:
    """Calculate percentage change (negative = improvement for time-based metrics)."""
    return ((result2.score - result1.score) / result1.score) * 100


def compare_benchmarks(baseline_file: str,
                       comparison_file: str) -> dict:
    """
    Compare two JMH benchmark reports.

    Args:
        baseline_file: Path to baseline JSON report
        comparison_file: Path to comparison JSON report

    Returns:
        Dictionary with comparison results
    """
    with open(baseline_file) as f:
        baseline_data = json.load(f)
    with open(comparison_file) as f:
        comparison_data = json.load(f)

    # Create lookup dictionaries keyed by (benchmark, sourceType)
    baseline_results = {}
    for result in baseline_data:
        r = BenchmarkResult.from_jmh_result(result)
        key = (r.benchmark, r.source_type)
        baseline_results[key] = r

    comparison_results = {}
    for result in comparison_data:
        r = BenchmarkResult.from_jmh_result(result)
        key = (r.benchmark, r.source_type)
        comparison_results[key] = r

    # Compare matching benchmarks
    comparisons = []
    for key in baseline_results:
        if key not in comparison_results:
            continue
        baseline = baseline_results[key]
        comparison = comparison_results[key]

        t_stat, is_significant = welch_t_test(baseline, comparison)
        pct_change = percentage_change(baseline, comparison)

        # Determine verdict
        if is_significant:
            if pct_change < 0:
                verdict = "IMPROVEMENT"
            else:
                verdict = "REGRESSION"
        else:
            verdict = "NO CHANGE"

        comparisons.append({
            'benchmark': key[0],
            'source_type': key[1],
            'baseline_score': baseline.score,
            'comparison_score': comparison.score,
            'percentage_change': pct_change,
            't_statistic': t_stat,
            'statistically_significant': is_significant,
            'verdict': verdict
        })

    return {
        'comparisons': comparisons,
        'summary': summarize_comparisons(comparisons)
    }


def summarize_comparisons(comparisons: list) -> dict:
    """Generate summary statistics."""
    improvements = sum(1 for c in comparisons if c['verdict'] == 'IMPROVEMENT')
    regressions = sum(1 for c in comparisons if c['verdict'] == 'REGRESSION')
    no_change = sum(1 for c in comparisons if c['verdict'] == 'NO CHANGE')
    return {
        'total_benchmarks': len(comparisons),
        'improvements': improvements,
        'regressions': regressions,
        'no_change': no_change
    }


def print_comparison_report(results: dict):
    """Print a formatted comparison report."""
    print("JMH Benchmark Comparison Report")
    print("=" * 80)
    print()
    for comp in results['comparisons']:
        print(f"Benchmark: {comp['benchmark']}")
        print(f"Source Type: {comp['source_type']}")
        print(f"Baseline: {comp['baseline_score']:.2f} ms/op")
        print(f"Comparison: {comp['comparison_score']:.2f} ms/op")
        print(f"Change: {comp['percentage_change']:+.2f}%")
        print(f"Statistically Significant: {comp['statistically_significant']}")
        print(f"Verdict: {comp['verdict']}")
        print("-" * 80)
        print()
    summary = results['summary']
    print("Summary:")
    print(f"  Total Benchmarks: {summary['total_benchmarks']}")
    print(f"  Improvements: {summary['improvements']}")
    print(f"  Regressions: {summary['regressions']}")
    print(f"  No Change: {summary['no_change']}")


# Example usage
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 3:
        print("Usage: python script.py <baseline.json> <comparison.json>")
        print()
        print("Compare two JMH benchmark reports and determine statistical significance.")
        print()
        print("Arguments:")
        print("  baseline.json   - Path to the baseline benchmark report")
        print("  comparison.json - Path to the comparison benchmark report")
        sys.exit(1)

    baseline_file = sys.argv[1]
    comparison_file = sys.argv[2]

    try:
        results = compare_benchmarks(baseline_file, comparison_file)
        print_comparison_report(results)
    except FileNotFoundError as e:
        print(f"Error: Could not find file - {e}")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format - {e}")
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)