@tsvikas
Last active October 31, 2024 10:38
download and analyze data from llm-arena
# this file uses a leading space in its name, to give the gist a nice name
This gist contains Python code to reproduce the LLM Arena leaderboard.
It is based on https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH,
but converts the huge JSON file it uses into a smaller Parquet file.
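As a quick orientation (this sketch is not part of the original gist): the first script below downloads the raw JSON and converts it to data/clean_battle_20240814_public.parquet, and the second script computes Elo ratings and writes data/scores.csv. Assuming both scripts have been run from the same directory, the resulting leaderboard can be viewed with pandas, for example:

import pandas as pd

# read the leaderboard produced by the second script; the index column is "model"
scores = pd.read_csv("data/scores.csv", index_col="model")
# highest-rated models first, as on the public leaderboard
print(scores.sort_values("elo_rating", ascending=False).head(10))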
from pathlib import Path

import pandas as pd
import pooch


def convert_to_ndjson(json_fn: Path, ndjson_fn: Path | None = None) -> Path:
    # set filenames
    json_fn = Path(json_fn)
    ndjson_fn = json_fn.with_suffix(".ndjson") if ndjson_fn is None else Path(ndjson_fn)
    # convert
    with ndjson_fn.open("w") as f_out, json_fn.open() as f:
        lines = []
        if not f.readline() == "[\n":
            raise RuntimeError("expected a different line")
        n_records = 0
        for line in f.readlines():
            if line in {" },\n", " }\n"}:
                record = f"{{ {''.join(lines[1:])} }}\n"
                f_out.write(record)
                n_records += 1
                if n_records % 10000 == 0:
                    print(n_records)
                lines = []
            else:
                lines.append(line.strip())
    return ndjson_fn


def convert_to_parquet(
    ndjson_fn: Path, parquet_fn: Path | None = None, chunksize: int = 1024
) -> Path:
    # set filenames
    ndjson_fn = Path(ndjson_fn)
    parquet_fn = (
        ndjson_fn.with_suffix(".parquet") if parquet_fn is None else Path(parquet_fn)
    )
    # load df
    json_reader = pd.read_json(
        ndjson_fn, lines=True, orient="records", chunksize=chunksize
    )
    df = pd.concat(json_reader)
    # fix df
    df["tstamp"] = pd.to_datetime(df.tstamp, unit="s")
    for col in ["conv_metadata", "dedup_tag", "category_tag"]:
        ser = df.pop(col)
        exploded = pd.json_normalize(ser).add_prefix(f"{col}.")
        df = df.join(exploded)
    # save df
    df.to_parquet(parquet_fn)
    return parquet_fn
def main() -> None:
fn = "clean_battle_20240814_public"
data_dir = Path("data")
print("retrieving data:")
json_fn = pooch.retrieve(
f"https://storage.googleapis.com/arena_external_data/public/{fn}.json",
"747c1c937dfa941d5a455a1fd70e2879d29642058da39b97996b977233b9bd1b",
fname=f"{fn}.json",
path=data_dir,
)
print("retrieved data")
print("converting to ndjson:")
ndjson_fn = data_dir / f"{fn}.ndjson"
if not ndjson_fn.exists():
ndjson_fn = convert_to_ndjson(json_fn, ndjson_fn)
print("converted to ndjson")
else:
print("using existing ndjson file")
print("converting to parquet:")
parquet_fn = convert_to_parquet(ndjson_fn=ndjson_fn, chunksize=200000)
print("converted to parquet")
print("done")
if __name__ == "__main__":
main()
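The Parquet file written above is what the second script (starting at the imports just below) consumes. As a quick sketch, not part of the original gist and assuming the default paths from main() above, the converted table can be inspected like this:

import pandas as pd

# load the converted battles table written by convert_to_parquet()
battles = pd.read_parquet("data/clean_battle_20240814_public.parquet")
# columns used by the scoring script include tstamp, model_a, model_b,
# winner, anony, and the flattened dedup_tag.sampled flag
print(battles[["tstamp", "model_a", "model_b", "winner", "anony"]].head())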
import math
from pathlib import Path
from typing import Callable

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

ADD_CONFIDENCE = True


def compute_mle_elo(
    df: pd.DataFrame, SCALE: int = 400, BASE: int = 10, INIT_RATING: int = 1000
) -> pd.Series:
    # source: https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH
    ptbl_a_win = pd.pivot_table(
        df[df["winner"] == "model_a"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    # if no tie, create a zero matrix
    if sum(df["winner"].isin(["tie", "tie (bothbad)"])) == 0:
        ptbl_tie = pd.DataFrame(0, index=ptbl_a_win.index, columns=ptbl_a_win.columns)
    else:
        ptbl_tie = pd.pivot_table(
            df[df["winner"].isin(["tie", "tie (bothbad)"])],
            index="model_a",
            columns="model_b",
            aggfunc="size",
            fill_value=0,
        )
        ptbl_tie = ptbl_tie + ptbl_tie.T
    ptbl_b_win = pd.pivot_table(
        df[df["winner"] == "model_b"],
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    )
    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie

    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)

    p = len(models)
    X = np.zeros([p * (p - 1) * 2, p])
    Y = np.zeros(p * (p - 1) * 2)

    cur_row = 0
    sample_weights = []
    for m_a in ptbl_win.index:
        for m_b in ptbl_win.columns:
            if m_a == m_b:
                continue
            # if nan skip
            if math.isnan(ptbl_win.loc[m_a, m_b]) or math.isnan(ptbl_win.loc[m_b, m_a]):
                continue
            X[cur_row, models[m_a]] = +math.log(BASE)
            X[cur_row, models[m_b]] = -math.log(BASE)
            Y[cur_row] = 1.0
            sample_weights.append(ptbl_win.loc[m_a, m_b])
            X[cur_row + 1, models[m_a]] = math.log(BASE)
            X[cur_row + 1, models[m_b]] = -math.log(BASE)
            Y[cur_row + 1] = 0.0
            sample_weights.append(ptbl_win.loc[m_b, m_a])
            cur_row += 2
    X = X[:cur_row]
    Y = Y[:cur_row]

    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-6)
    lr.fit(X, Y, sample_weight=sample_weights)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
    # anchor the scale: fix mixtral-8x7b-instruct-v0.1 at a rating of 1114
    if "mixtral-8x7b-instruct-v0.1" in models.index:
        elo_scores += 1114 - elo_scores[models["mixtral-8x7b-instruct-v0.1"]]
    return (
        pd.Series(elo_scores, index=models.index)
        .sort_values(ascending=False)
        .rename_axis(index="model")
        .rename("elo_rating")
    )


def get_bootstrap_result(
    battles: pd.DataFrame, func_compute_elo: Callable, num_round: int, seed: int
) -> pd.DataFrame:
    rng = np.random.default_rng(seed)
    rows = [
        func_compute_elo(battles.sample(frac=1.0, replace=True, random_state=rng))
        for _ in tqdm(range(num_round), desc="bootstrap")
    ]
    df = pd.DataFrame(rows)
    # sort models from best to worst
    return df[df.median().sort_values(ascending=False).index]


def main() -> None:
    data_dir = Path("data")
    input_fn = data_dir / "clean_battle_20240814_public.parquet"
    output_fn = data_dir / "scores.csv"
    df = pd.read_parquet(input_fn)
    battles = df[df["dedup_tag.sampled"] & df["anony"]]
    model_a_min = df.groupby("model_a").tstamp.min().rename_axis(index="model")
    model_b_min = df.groupby("model_b").tstamp.min().rename_axis(index="model")
    model_min_tstamp = (
        model_a_min.to_frame("min_tstamp")
        .join(model_b_min)
        .min(axis=1)
        .rename("min_tstamp")
    )
    models = model_min_tstamp.index.to_series()
    model_basename = models.str.extract(r"([^\s\-\d\.]*)")[0].rename("model_basename")
    model_params = (
        models.str.extract(r".*-([\d\.]+)[bB](-.*)?")[0]
        .astype(float)
        .rename("b_parameters")
    )
    if ADD_CONFIDENCE:
        bootstrap_elo_lu = get_bootstrap_result(
            battles, compute_mle_elo, num_round=100, seed=42
        )
        elo_mle_ratings_confidence = bootstrap_elo_lu.quantile([0.025, 0.5, 0.975]).T
        elo_mle_ratings = pd.DataFrame({"elo_rating": elo_mle_ratings_confidence[0.5]})
        elo_mle_ratings["confidence_down"] = (
            elo_mle_ratings_confidence[0.5] - elo_mle_ratings_confidence[0.025]
        )
        elo_mle_ratings["confidence_up"] = (
            elo_mle_ratings_confidence[0.975] - elo_mle_ratings_confidence[0.5]
        )
    else:
        elo_mle_ratings = compute_mle_elo(battles)
    scores = pd.concat(
        [model_basename, model_params, model_min_tstamp, elo_mle_ratings], axis=1
    )
    scores.to_csv(output_fn)


if __name__ == "__main__":
    main()