@tsvikas
Last active October 31, 2024 10:38
download and analyze data from llm-arena
# this file uses a leading space in its name, to give the gist a nice name
This gist contains Python code to reproduce the LLM Arena leaderboard.
It is based on https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH,
but converts the huge JSON file it uses into a smaller Parquet file.
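As a quick orientation (this sketch is not part of the original gist): the first script below downloads the raw JSON and converts it to data/clean_battle_20240814_public.parquet, and the second script computes Elo ratings and writes data/scores.csv. Assuming both scripts have been run from the same directory, the resulting leaderboard can be viewed with pandas, for example:

import pandas as pd

# read the leaderboard produced by the second script; the index column is "model"
scores = pd.read_csv("data/scores.csv", index_col="model")
# highest-rated models first, as on the public leaderboard
print(scores.sort_values("elo_rating", ascending=False).head(10))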
from pathlib import Path

import pandas as pd
import pooch


def convert_to_ndjson(json_fn: Path, ndjson_fn: Path | None = None) -> Path:
    # set filenames
    json_fn = Path(json_fn)
    ndjson_fn = json_fn.with_suffix(".ndjson") if ndjson_fn is None else Path(ndjson_fn)
    # convert
    with ndjson_fn.open("w") as f_out, json_fn.open() as f:
        lines = []
        if not f.readline() == "[\n":
            raise RuntimeError("expected a different line")
        n_records = 0
        for line in f.readlines():
            if line in {" },\n", " }\n"}:
                record = f"{{ {''.join(lines[1:])} }}\n"
                f_out.write(record)
                n_records += 1
                if n_records % 10000 == 0:
                    print(n_records)
                lines = []
            else:
                lines.append(line.strip())
    return ndjson_fn


def convert_to_parquet(
    ndjson_fn: Path, parquet_fn: Path | None = None, chunksize: int = 1024
) -> Path:
    # set filenames
    ndjson_fn = Path(ndjson_fn)
    parquet_fn = (
        ndjson_fn.with_suffix(".parquet") if parquet_fn is None else Path(parquet_fn)
    )
    # load df
    json_reader = pd.read_json(
        ndjson_fn, lines=True, orient="records", chunksize=chunksize
    )
    df = pd.concat(json_reader)
    # fix df
    df["tstamp"] = pd.to_datetime(df.tstamp, unit="s")
    for col in ["conv_metadata", "dedup_tag", "category_tag"]:
        ser = df.pop(col)
        exploded = pd.json_normalize(ser).add_prefix(f"{col}.")
        df = df.join(exploded)
    # save df
    df.to_parquet(parquet_fn)
    return parquet_fn
def main() -> None:
fn = "clean_battle_20240814_public"
data_dir = Path("data")
print("retrieving data:")
json_fn = pooch.retrieve(
f"https://storage.googleapis.com/arena_external_data/public/{fn}.json",
"747c1c937dfa941d5a455a1fd70e2879d29642058da39b97996b977233b9bd1b",
fname=f"{fn}.json",
path=data_dir,
)
print("retrieved data")
print("converting to ndjson:")
ndjson_fn = data_dir / f"{fn}.ndjson"
if not ndjson_fn.exists():
ndjson_fn = convert_to_ndjson(json_fn, ndjson_fn)
print("converted to ndjson")
else:
print("using existing ndjson file")
print("converting to parquet:")
parquet_fn = convert_to_parquet(ndjson_fn=ndjson_fn, chunksize=200000)
print("converted to parquet")
print("done")
if __name__ == "__main__":
main()
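The Parquet file written above is what the second script (starting at the imports just below) consumes. As a quick sketch, not part of the original gist and assuming the default paths from main() above, the converted table can be inspected like this:

import pandas as pd

# load the converted battles table written by convert_to_parquet()
battles = pd.read_parquet("data/clean_battle_20240814_public.parquet")
# columns used by the scoring script include tstamp, model_a, model_b,
# winner, anony, and the flattened dedup_tag.sampled flag
print(battles[["tstamp", "model_a", "model_b", "winner", "anony"]].head())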
import math
from pathlib import Path
from typing import Callable

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

ADD_CONFIDENCE = True


def compute_mle_elo(
    df: pd.DataFrame, SCALE: int = 400, BASE: int = 10, INIT_RATING: int = 1000
) -> pd.Series:
    # source: https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH
    ptbl_a_win = pd.pivot_table(
        df[df["winner"] == "model_a"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    # if no tie, create a zero matrix
    if sum(df["winner"].isin(["tie", "tie (bothbad)"])) == 0:
        ptbl_tie = pd.DataFrame(0, index=ptbl_a_win.index, columns=ptbl_a_win.columns)
    else:
        ptbl_tie = pd.pivot_table(
            df[df["winner"].isin(["tie", "tie (bothbad)"])],
            index="model_a",
            columns="model_b",
            aggfunc="size",
            fill_value=0,
        )
        ptbl_tie = ptbl_tie + ptbl_tie.T
    ptbl_b_win = pd.pivot_table(
        df[df["winner"] == "model_b"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie

    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)

    p = len(models)
    X = np.zeros([p * (p - 1) * 2, p])
    Y = np.zeros(p * (p - 1) * 2)

    cur_row = 0
    sample_weights = []
    for m_a in ptbl_win.index:
        for m_b in ptbl_win.columns:
            if m_a == m_b:
                continue
            # if nan skip
            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
                continue
            X[cur_row, models[m_a]] = +math.log(BASE)
            X[cur_row, models[m_b]] = -math.log(BASE)
            Y[cur_row] = 1.0
            sample_weights.append(ptbl_win.loc[m_a, m_b])
            X[cur_row + 1, models[m_a]] = math.log(BASE)
            X[cur_row + 1, models[m_b]] = -math.log(BASE)
            Y[cur_row + 1] = 0.0
            sample_weights.append(ptbl_win.loc[m_b, m_a])
            cur_row += 2
    X = X[:cur_row]
    Y = Y[:cur_row]

    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
    lr.fit(X, Y, sample_weight=sample_weights)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    # anchor the scale: fix mixtral-8x7b-instruct-v0.1 at a rating of 1114
    if "mixtral-8x7b-instruct-v0.1" in models.index:
        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
    return (
        pd.Series(elo_scores, index=models.index)
        .sort_values(ascending=False)
        .rename_axis(index="model")
        .rename("elo_rating")
    )


def get_bootstrap_result(
    battles: pd.DataFrame, func_compute_elo: Callable, num_round: int, seed: int
) -> pd.DataFrame:
    rng = np.random.default_rng(seed)
    rows = [
        func_compute_elo(battles.sample(frac=1.0, replace=True, random_state=rng))
        for _ in tqdm(range(num_round), desc="bootstrap")
    ]
    df = pd.DataFrame(rows)
    # sort models from best to worst
    return df[df.median().sort_values(ascending=False).index]


def main() -> None:
    data_dir = Path("data")
    input_fn = data_dir / "clean_battle_20240814_public.parquet"
    output_fn = data_dir / "scores.csv"
    df = pd.read_parquet(input_fn)
    battles = df[df["dedup_tag.sampled"] & df["anony"]]
    model_a_min = df.groupby("model_a").tstamp.min().rename_axis(index="model")
    model_b_min = df.groupby("model_b").tstamp.min().rename_axis(index="model")
    model_min_tstamp = (
        model_a_min.to_frame("min_tstamp")
        .join(model_b_min)
        .min(axis=1)
        .rename("min_tstamp")
    )
    models = model_min_tstamp.index.to_series()
    model_basename = models.str.extract(r"([^\s\-\d\.]*)")[0].rename("model_basename")
    model_params = (
        models.str.extract(r".*-([\d\.]+)[bB](-.*)?")[0]
        .astype(float)
        .rename("b_parameters")
    )
    if ADD_CONFIDENCE:
        bootstrap_elo_lu = get_bootstrap_result(
            battles, compute_mle_elo, num_round=100, seed=42
        )
        elo_mle_ratings_confidence = bootstrap_elo_lu.quantile([0.025, 0.5, 0.975]).T
        elo_mle_ratings = pd.DataFrame({"elo_rating": elo_mle_ratings_confidence[0.5]})
        elo_mle_ratings["confidence_down"] = (
            elo_mle_ratings_confidence[0.5] - elo_mle_ratings_confidence[0.025]
        )
        elo_mle_ratings["confidence_up"] = (
            elo_mle_ratings_confidence[0.975] - elo_mle_ratings_confidence[0.5]
        )
    else:
        elo_mle_ratings = compute_mle_elo(battles)
    scores = pd.concat(
        [model_basename, model_params, model_min_tstamp, elo_mle_ratings], axis=1
    )
    scores.to_csv(output_fn)


if __name__ == "__main__":
    main()