Skip to content

Instantly share code, notes, and snippets.

@manics
Last active February 23, 2025 19:48
Show Gist options
  • Save manics/dc8b7547e0d60e6d215bb41ad60f9988 to your computer and use it in GitHub Desktop.
Save manics/dc8b7547e0d60e6d215bb41ad60f9988 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "4e18df6b-ce7e-4e86-93c7-459509739883",
"metadata": {},
"source": [
"# Analyse overlap in repositories across the mybinder.org federation"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e5eb32d8-ccb6-4d33-b9b8-6fa3587afdc9",
"metadata": {},
"outputs": [],
"source": [
"from datetime import date, timedelta\n",
"from glob import glob\n",
"import os\n",
"import pandas as pd\n",
"import requests"
]
},
{
"cell_type": "markdown",
"id": "4955bae4-bb06-4d52-a1a2-3ae57b96aa31",
"metadata": {},
"source": [
"Download the daily event archives for a date range from https://archive.analytics.mybinder.org, then concatenate them into a single DataFrame."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "d2a50e79-337e-4beb-ab50-9b5b9a666a92",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"events-2025-01-01.jsonl exists, delete to force a refresh\n",
"events-2025-01-02.jsonl exists, delete to force a refresh\n",
"events-2025-01-03.jsonl exists, delete to force a refresh\n",
"events-2025-01-04.jsonl exists, delete to force a refresh\n",
"events-2025-01-05.jsonl exists, delete to force a refresh\n",
"events-2025-01-06.jsonl exists, delete to force a refresh\n",
"events-2025-01-07.jsonl exists, delete to force a refresh\n",
"events-2025-01-08.jsonl exists, delete to force a refresh\n",
"events-2025-01-09.jsonl exists, delete to force a refresh\n",
"events-2025-01-10.jsonl exists, delete to force a refresh\n",
"events-2025-01-11.jsonl exists, delete to force a refresh\n",
"events-2025-01-12.jsonl exists, delete to force a refresh\n",
"events-2025-01-13.jsonl exists, delete to force a refresh\n",
"events-2025-01-14.jsonl exists, delete to force a refresh\n",
"events-2025-01-15.jsonl exists, delete to force a refresh\n",
"events-2025-01-16.jsonl exists, delete to force a refresh\n",
"events-2025-01-17.jsonl exists, delete to force a refresh\n",
"events-2025-01-18.jsonl exists, delete to force a refresh\n",
"events-2025-01-19.jsonl exists, delete to force a refresh\n",
"events-2025-01-20.jsonl exists, delete to force a refresh\n",
"events-2025-01-21.jsonl exists, delete to force a refresh\n",
"events-2025-01-22.jsonl exists, delete to force a refresh\n",
"events-2025-01-23.jsonl exists, delete to force a refresh\n",
"events-2025-01-24.jsonl exists, delete to force a refresh\n",
"events-2025-01-25.jsonl exists, delete to force a refresh\n",
"events-2025-01-26.jsonl exists, delete to force a refresh\n",
"events-2025-01-27.jsonl exists, delete to force a refresh\n",
"events-2025-01-28.jsonl exists, delete to force a refresh\n",
"events-2025-01-29.jsonl exists, delete to force a refresh\n",
"events-2025-01-30.jsonl exists, delete to force a refresh\n",
"events-2025-01-31.jsonl exists, delete to force a refresh\n",
"events-2025-02-01.jsonl exists, delete to force a refresh\n",
"events-2025-02-02.jsonl exists, delete to force a refresh\n",
"events-2025-02-03.jsonl exists, delete to force a refresh\n",
"events-2025-02-04.jsonl exists, delete to force a refresh\n",
"events-2025-02-05.jsonl exists, delete to force a refresh\n",
"events-2025-02-06.jsonl exists, delete to force a refresh\n",
"events-2025-02-07.jsonl exists, delete to force a refresh\n",
"events-2025-02-08.jsonl exists, delete to force a refresh\n",
"events-2025-02-09.jsonl exists, delete to force a refresh\n",
"events-2025-02-10.jsonl exists, delete to force a refresh\n",
"events-2025-02-11.jsonl exists, delete to force a refresh\n",
"events-2025-02-12.jsonl exists, delete to force a refresh\n",
"events-2025-02-13.jsonl exists, delete to force a refresh\n",
"events-2025-02-14.jsonl exists, delete to force a refresh\n",
"events-2025-02-15.jsonl exists, delete to force a refresh\n",
"events-2025-02-16.jsonl exists, delete to force a refresh\n",
"events-2025-02-17.jsonl exists, delete to force a refresh\n",
"events-2025-02-18.jsonl exists, delete to force a refresh\n",
"events-2025-02-19.jsonl exists, delete to force a refresh\n",
"events-2025-02-20.jsonl exists, delete to force a refresh\n"
]
}
],
"source": [
"# Inclusive range of days whose event archives are fetched\n",
"start = date(2025, 1, 1)\n",
"end = date(2025, 2, 20)\n",
"\n",
"for offset in range((end - start).days + 1):\n",
"    d = start + timedelta(days=offset)\n",
"    jsonl = f\"https://archive.analytics.mybinder.org/events-{d.isoformat()}.jsonl\"\n",
"    outfile = os.path.basename(jsonl)\n",
"    # A file already on disk acts as a cache, so re-running the cell is cheap\n",
"    if os.path.exists(outfile):\n",
"        print(f\"{outfile} exists, delete to force a refresh\")\n",
"        continue\n",
"    print(f\"Downloading {jsonl}\")\n",
"    # Without a timeout a stalled connection would hang the notebook indefinitely\n",
"    r = requests.get(jsonl, timeout=60)\n",
"    r.raise_for_status()\n",
"    # Write under a temporary name, then rename: an interrupted write must not\n",
"    # leave a truncated file that later runs would treat as already downloaded\n",
"    partial = f\"{outfile}.part\"\n",
"    with open(partial, \"wb\") as f:\n",
"        f.write(r.content)\n",
"    os.replace(partial, outfile)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1285a570-390f-4fb8-8b00-4a4d9410b483",
"metadata": {},
"outputs": [],
"source": [
"# Read each downloaded day in filename order (ISO dates, so chronological) and\n",
"# concatenate once; concatenating inside the loop would re-copy the accumulated\n",
"# rows on every iteration. Raises ValueError if no events-*.jsonl files exist.\n",
"frames = [pd.read_json(path, lines=True) for path in sorted(glob(\"events-*.jsonl\"))]\n",
"df = pd.concat(frames, axis=0, ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e2cf8088-9f8e-4c1f-811a-e04110836a54",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>build_token</th>\n",
" <th>origin</th>\n",
" <th>provider</th>\n",
" <th>ref</th>\n",
" <th>schema</th>\n",
" <th>spec</th>\n",
" <th>status</th>\n",
" <th>timestamp</th>\n",
" <th>version</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>198910</th>\n",
" <td>False</td>\n",
" <td>2i2c.mybinder.org</td>\n",
" <td>GitHub</td>\n",
" <td>f37d4c63fb04b4217b33ae473843b9fdc388a2a7</td>\n",
" <td>binderhub.jupyter.org/launch</td>\n",
" <td>hanlpbot/hanlp-binder/main</td>\n",
" <td>success</td>\n",
" <td>2025-02-20 23:59:00+00:00</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198911</th>\n",
" <td>False</td>\n",
" <td>2i2c.mybinder.org</td>\n",
" <td>GitHub</td>\n",
" <td>f37d4c63fb04b4217b33ae473843b9fdc388a2a7</td>\n",
" <td>binderhub.jupyter.org/launch</td>\n",
" <td>hanlpbot/hanlp-binder/main</td>\n",
" <td>success</td>\n",
" <td>2025-02-20 23:59:00+00:00</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198912</th>\n",
" <td>True</td>\n",
" <td>2i2c.mybinder.org</td>\n",
" <td>GitHub</td>\n",
" <td>fc6d94b4372c23338db80ebd347b1339db1e7024</td>\n",
" <td>binderhub.jupyter.org/launch</td>\n",
" <td>janisbent/sc-lab/latest</td>\n",
" <td>success</td>\n",
" <td>2025-02-20 23:59:00+00:00</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" build_token origin provider \\\n",
"198910 False 2i2c.mybinder.org GitHub \n",
"198911 False 2i2c.mybinder.org GitHub \n",
"198912 True 2i2c.mybinder.org GitHub \n",
"\n",
" ref \\\n",
"198910 f37d4c63fb04b4217b33ae473843b9fdc388a2a7 \n",
"198911 f37d4c63fb04b4217b33ae473843b9fdc388a2a7 \n",
"198912 fc6d94b4372c23338db80ebd347b1339db1e7024 \n",
"\n",
" schema spec status \\\n",
"198910 binderhub.jupyter.org/launch hanlpbot/hanlp-binder/main success \n",
"198911 binderhub.jupyter.org/launch hanlpbot/hanlp-binder/main success \n",
"198912 binderhub.jupyter.org/launch janisbent/sc-lab/latest success \n",
"\n",
" timestamp version \n",
"198910 2025-02-20 23:59:00+00:00 5 \n",
"198911 2025-02-20 23:59:00+00:00 5 \n",
"198912 2025-02-20 23:59:00+00:00 5 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Peek at the last three rows to check the most recent events loaded\n",
"df.iloc[-3:]"
]
},
{
"cell_type": "markdown",
"id": "74810819-6fb8-4a32-beb7-59ca8056fb6f",
"metadata": {},
"source": [
"Use the `spec` column to identify each repository, and build a table showing how many repositories are shared between every pair of federation members. Each diagonal entry is the number of distinct repositories seen by that member."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a744f4ef-5fcb-48af-868c-bb41995e44db",
"metadata": {},
"outputs": [],
"source": [
"# Map each federation member (origin) to the Series of specs launched on it\n",
"spec_by_origin = {\n",
"    origin: df.loc[df[\"origin\"] == origin, \"spec\"]\n",
"    for origin in df[\"origin\"].unique()\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d9f3c05a-e2e5-4520-b3f7-e7886b3e4a4d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>2i2c-bare.mybinder.org</th>\n",
" <th>2i2c.mybinder.org</th>\n",
" <th>notebooks.gesis.org</th>\n",
" <th>ovh.mybinder.org</th>\n",
" <th>ovh2.mybinder.org</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2i2c-bare.mybinder.org</th>\n",
" <td>263</td>\n",
" <td>154</td>\n",
" <td>91</td>\n",
" <td>1</td>\n",
" <td>65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2i2c.mybinder.org</th>\n",
" <td>154</td>\n",
" <td>5622</td>\n",
" <td>1051</td>\n",
" <td>5</td>\n",
" <td>714</td>\n",
" </tr>\n",
" <tr>\n",
" <th>notebooks.gesis.org</th>\n",
" <td>91</td>\n",
" <td>1051</td>\n",
" <td>2322</td>\n",
" <td>6</td>\n",
" <td>556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ovh.mybinder.org</th>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>7</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ovh2.mybinder.org</th>\n",
" <td>65</td>\n",
" <td>714</td>\n",
" <td>556</td>\n",
" <td>4</td>\n",
" <td>1333</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 2i2c-bare.mybinder.org 2i2c.mybinder.org \\\n",
"2i2c-bare.mybinder.org 263 154 \n",
"2i2c.mybinder.org 154 5622 \n",
"notebooks.gesis.org 91 1051 \n",
"ovh.mybinder.org 1 5 \n",
"ovh2.mybinder.org 65 714 \n",
"\n",
" notebooks.gesis.org ovh.mybinder.org ovh2.mybinder.org \n",
"2i2c-bare.mybinder.org 91 1 65 \n",
"2i2c.mybinder.org 1051 5 714 \n",
"notebooks.gesis.org 2322 6 556 \n",
"ovh.mybinder.org 6 7 4 \n",
"ovh2.mybinder.org 556 4 1333 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"origins = sorted(spec_by_origin.keys())\n",
"# Build each origin's set of distinct specs once, not once per table cell\n",
"unique_specs = {origin: set(spec_by_origin[origin]) for origin in origins}\n",
"# Entry (r, c) counts the specs seen on both r and c; the diagonal is therefore\n",
"# the number of distinct specs on that origin\n",
"overlap = pd.DataFrame(\n",
"    [[len(unique_specs[r] & unique_specs[c]) for c in origins] for r in origins],\n",
"    index=origins,\n",
"    columns=origins,\n",
")\n",
"display(overlap)"
]
},
{
"cell_type": "markdown",
"id": "a010726f-e323-4385-b22a-fc21d2c68898",
"metadata": {},
"source": [
"For more accurate statistics we should probably combine columns (for example `spec` together with `ref`) to count the number of distinct tagged images instead of the number of repositories."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce11ab68-f0ed-40fe-ab29-55b3b3984b65",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment