Last active
October 31, 2024 10:38
-
-
Save tsvikas/1b914863c4589b5529368666c8b0e782 to your computer and use it in GitHub Desktop.
download and analyze data from llm-arena
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# this file uses a leading space in its name, to give the gist a nice name
This gist is some Python code to reproduce the LLM arena leaderboard.
It is based on https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH
but converts the huge JSON it uses to a smaller parquet file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
import pandas as pd | |
import pooch | |
def convert_to_ndjson(json_fn: Path, ndjson_fn: Path | None = None) -> Path: | |
# set filenames | |
json_fn = Path(json_fn) | |
ndjson_fn = json_fn.with_suffix(".ndjson") if ndjson_fn is None else Path(ndjson_fn) | |
# convert | |
with ndjson_fn.open("w") as f_out, json_fn.open() as f: | |
lines = [] | |
if not f.readline() == "[\n": | |
raise RuntimeError("expected a different line") | |
n_records = 0 | |
for line in f.readlines(): | |
if line in {" },\n", " }\n"}: | |
record = f"{{ {''.join(lines[1:])} }}\n" | |
f_out.write(record) | |
n_records += 1 | |
if n_records % 10000 == 0: | |
print(n_records) | |
lines = [] | |
else: | |
lines.append(line.strip()) | |
return ndjson_fn | |
def convert_to_parquet(
    ndjson_fn: Path, parquet_fn: Path | None = None, chunksize: int = 1024
) -> Path:
    """Load an ndjson file into a DataFrame, flatten it, and save it as parquet.

    The ``tstamp`` column is converted from seconds to datetimes, and the
    ``conv_metadata``, ``dedup_tag`` and ``category_tag`` columns are each
    replaced by their normalized sub-columns, named ``<column>.<key>``.

    Args:
        ndjson_fn: Path of the newline-delimited JSON input.
        parquet_fn: Path of the output file. Defaults to ``ndjson_fn`` with
            its suffix replaced by ``.parquet``.
        chunksize: Number of lines pandas reads per chunk.

    Returns:
        The path of the parquet file that was written.
    """
    # resolve input and output locations
    source = Path(ndjson_fn)
    if parquet_fn is None:
        target = source.with_suffix(".parquet")
    else:
        target = Path(parquet_fn)

    # read the file chunk by chunk and stitch the chunks together
    chunks = pd.read_json(source, lines=True, orient="records", chunksize=chunksize)
    table = pd.concat(chunks)

    # timestamps are stored as seconds
    table["tstamp"] = pd.to_datetime(table["tstamp"], unit="s")

    # replace each nested column by one column per nested key
    for nested in ["conv_metadata", "dedup_tag", "category_tag"]:
        flattened = pd.json_normalize(table.pop(nested)).add_prefix(f"{nested}.")
        table = table.join(flattened)

    table.to_parquet(target)
    return target
def main() -> None:
    """Fetch the arena battle JSON and convert it to ndjson, then to parquet.

    All files live under ``data/``. The ndjson step is skipped when the
    ndjson file already exists; the parquet step always runs.
    """
    dataset = "clean_battle_20240814_public"
    out_dir = Path("data")

    print("retrieving data:")
    raw_json = pooch.retrieve(
        f"https://storage.googleapis.com/arena_external_data/public/{dataset}.json",
        "747c1c937dfa941d5a455a1fd70e2879d29642058da39b97996b977233b9bd1b",
        fname=f"{dataset}.json",
        path=out_dir,
    )
    print("retrieved data")

    print("converting to ndjson:")
    ndjson_path = out_dir / f"{dataset}.ndjson"
    if ndjson_path.exists():
        print("using existing ndjson file")
    else:
        ndjson_path = convert_to_ndjson(raw_json, ndjson_path)
        print("converted to ndjson")

    print("converting to parquet:")
    convert_to_parquet(ndjson_fn=ndjson_path, chunksize=200000)
    print("converted to parquet")
    print("done")


if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
from pathlib import Path | |
from typing import Callable | |
import numpy as np | |
import pandas as pd | |
from sklearn.linear_model import LogisticRegression | |
from tqdm import tqdm | |
# When True, main() reports the bootstrap median rating together with its
# distance to the 2.5% and 97.5% quantiles; when False it reports a single
# compute_mle_elo fit.
ADD_CONFIDENCE = True
def compute_mle_elo(
    df: pd.DataFrame, SCALE: int = 400, BASE: int = 10, INIT_RATING: int = 1000
) -> pd.Series:
    """Estimate one Elo-style rating per model with a logistic regression.

    Battles are counted per (model_a, model_b) pair. A win counts double and
    a tie ("tie" or "tie (bothbad)") counts once for each side. Every ordered
    pair of distinct models with known counts contributes two regression
    rows with the same features (+log(BASE) for the first model, -log(BASE)
    for the second): one labelled 1 and weighted by the first model's count,
    one labelled 0 and weighted by the second model's count. The fitted
    coefficients are scaled by ``SCALE`` and offset by ``INIT_RATING``; if
    "mixtral-8x7b-instruct-v0.1" is present, all ratings are shifted so that
    model sits at 1114.

    Args:
        df: Battles with ``model_a``, ``model_b`` and ``winner`` columns.
        SCALE: Multiplier applied to the fitted coefficients.
        BASE: Base whose natural log is used as the feature magnitude.
        INIT_RATING: Offset added to the scaled coefficients.

    Returns:
        Ratings sorted from highest to lowest, named ``elo_rating`` and
        indexed by ``model``.
    """
    # source: https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH

    def count_pairs(subset: pd.DataFrame) -> pd.DataFrame:
        # number of battles for every (model_a, model_b) combination
        return pd.pivot_table(
            subset,
            index="model_a",
            columns="model_b",
            aggfunc="size",
            fill_value=0,
        )

    is_tie = df["winner"].isin(["tie", "tie (bothbad)"])

    a_wins = count_pairs(df[df["winner"] == "model_a"])
    if is_tie.any():
        ties = count_pairs(df[is_tie])
        # a tie counts for both orderings of the pair
        ties = ties + ties.T
    else:
        # no ties at all: use an all-zero table shaped like the a-wins table
        ties = pd.DataFrame(0, index=a_wins.index, columns=a_wins.columns)
    b_wins = count_pairs(df[df["winner"] == "model_b"])

    # wins[x, y]: weight of "x beat y" (wins doubled, ties counted once)
    wins = a_wins * 2 + b_wins.T * 2 + ties

    models = pd.Series(np.arange(len(wins.index)), index=wins.index)
    n_models = len(models)
    max_rows = n_models * (n_models - 1) * 2
    X = np.zeros([max_rows, n_models])
    Y = np.zeros(max_rows)
    log_base = math.log(BASE)

    weights = []
    row = 0
    for m_a in wins.index:
        for m_b in wins.columns:
            if m_a == m_b:
                continue
            # skip pairs without a count in either direction
            if math.isnan(wins.loc[m_a, m_b]) or math.isnan(wins.loc[m_b, m_a]):
                continue
            col_a = models[m_a]
            col_b = models[m_b]

            # positive example, weighted by the m_a-over-m_b count
            X[row, col_a] = +log_base
            X[row, col_b] = -log_base
            Y[row] = 1.0
            weights.append(wins.loc[m_a, m_b])

            # negative example with the same features, weighted by the
            # m_b-over-m_a count
            X[row + 1, col_a] = log_base
            X[row + 1, col_b] = -log_base
            Y[row + 1] = 0.0
            weights.append(wins.loc[m_b, m_a])

            row += 2

    # drop the preallocated rows that were never filled
    X = X[:row]
    Y = Y[:row]

    regression = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
    regression.fit(X, Y, sample_weight=weights)
    ratings = SCALE * regression.coef_[0] + INIT_RATING

    anchor = "mixtral-8x7b-instruct-v0.1"
    if anchor in models.index:
        # shift every rating so the anchor model lands on 1114
        ratings += 1114 - ratings[models[anchor]]

    ranked = pd.Series(ratings, index=models.index).sort_values(ascending=False)
    return ranked.rename_axis(index="model").rename("elo_rating")
def get_bootstrap_result(
    battles: pd.DataFrame, func_compute_elo: Callable, num_round: int, seed: int
) -> pd.DataFrame:
    """Run a rating function on repeated resamples of the battles.

    Args:
        battles: The battles to resample.
        func_compute_elo: Called with each resampled frame; its results are
            stacked as the rows of the returned frame.
        num_round: Number of resampling rounds.
        seed: Seed for the random generator shared by all rounds.

    Returns:
        One row per round, with columns ordered by descending median.
    """
    rng = np.random.default_rng(seed)

    results = []
    for _ in tqdm(range(num_round), desc="bootstrap"):
        # same size as the input, drawn with replacement
        resampled = battles.sample(frac=1.0, replace=True, random_state=rng)
        results.append(func_compute_elo(resampled))
    ratings = pd.DataFrame(results)

    # sort models from best to worse
    by_median = ratings.median().sort_values(ascending=False).index
    return ratings[by_median]
def main() -> None:
    """Compute per-model scores from the parquet battle file and save them as CSV.

    Reads ``data/clean_battle_20240814_public.parquet`` and writes
    ``data/scores.csv`` with, per model: a base name and a parameter count
    (in "b" units) parsed from the model name, the earliest timestamp at
    which the model appears, and its Elo rating. Ratings are computed only
    from rows where both ``dedup_tag.sampled`` and ``anony`` are true. When
    ``ADD_CONFIDENCE`` is set, the rating is the bootstrap median and two
    extra columns give its distance to the 2.5% and 97.5% quantiles.
    """
    data_dir = Path("data")
    input_fn = data_dir / "clean_battle_20240814_public.parquet"
    output_fn = data_dir / "scores.csv"

    df = pd.read_parquet(input_fn)
    battles = df[df["dedup_tag.sampled"] & df["anony"]]

    # Earliest timestamp per model, looking at both sides of every battle.
    model_a_min = df.groupby("model_a").tstamp.min().rename_axis(index="model")
    model_b_min = df.groupby("model_b").tstamp.min().rename_axis(index="model")
    model_min_tstamp = (
        model_a_min.to_frame("min_tstamp")
        # Outer join: a model that only ever appears as model_b must not be
        # dropped (a left join would lose it here and leave its name-derived
        # columns empty in the final table).
        .join(model_b_min, how="outer")
        .min(axis=1)
        .rename("min_tstamp")
    )

    # Attributes parsed from the model names.
    models = model_min_tstamp.index.to_series()
    # leading run of characters up to the first whitespace, dash, digit or dot
    model_basename = models.str.extract(r"([^\s\-\d\.]*)")[0].rename("model_basename")
    # number directly in front of a "b"/"B" that follows a dash, e.g. "-7b"
    model_params = (
        models.str.extract(r".*-([\d\.]+)[bB](-.*)?")[0]
        .astype(float)
        .rename("b_parameters")
    )

    if ADD_CONFIDENCE:
        bootstrap_elo_lu = get_bootstrap_result(
            battles, compute_mle_elo, num_round=100, seed=42
        )
        # rows: models; columns: the three quantiles
        elo_mle_ratings_confidence = bootstrap_elo_lu.quantile([0.025, 0.5, 0.975]).T
        elo_mle_ratings = pd.DataFrame({"elo_rating": elo_mle_ratings_confidence[0.5]})
        elo_mle_ratings["confidence_down"] = (
            elo_mle_ratings_confidence[0.5] - elo_mle_ratings_confidence[0.025]
        )
        elo_mle_ratings["confidence_up"] = (
            elo_mle_ratings_confidence[0.975] - elo_mle_ratings_confidence[0.5]
        )
    else:
        elo_mle_ratings = compute_mle_elo(battles)

    scores = pd.concat(
        [model_basename, model_params, model_min_tstamp, elo_mle_ratings], axis=1
    )
    scores.to_csv(output_fn)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment