Last active
February 23, 2025 19:48
-
-
Save manics/dc8b7547e0d60e6d215bb41ad60f9988 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "4e18df6b-ce7e-4e86-93c7-459509739883", | |
"metadata": {}, | |
"source": [ | |
"# Analyse overlap in repositories across the mybinder.org federation" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "e5eb32d8-ccb6-4d33-b9b8-6fa3587afdc9", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from datetime import date, timedelta\n", | |
"from glob import glob\n", | |
"import os\n", | |
"import pandas as pd\n", | |
"import requests" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "4955bae4-bb06-4d52-a1a2-3ae57b96aa31", | |
"metadata": {}, | |
"source": [ | |
"Download the daily launch event files from https://archive.analytics.mybinder.org and concatenate them into a single DataFrame." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "d2a50e79-337e-4beb-ab50-9b5b9a666a92", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"events-2025-01-01.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-02.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-03.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-04.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-05.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-06.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-07.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-08.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-09.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-10.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-11.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-12.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-13.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-14.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-15.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-16.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-17.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-18.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-19.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-20.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-21.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-22.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-23.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-24.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-25.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-26.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-27.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-28.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-29.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-30.jsonl exists, delete to force a refresh\n", | |
"events-2025-01-31.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-01.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-02.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-03.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-04.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-05.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-06.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-07.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-08.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-09.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-10.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-11.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-12.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-13.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-14.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-15.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-16.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-17.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-18.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-19.jsonl exists, delete to force a refresh\n", | |
"events-2025-02-20.jsonl exists, delete to force a refresh\n" | |
] | |
} | |
], | |
"source": [ | |
"start = date(2025,1,1)\n", | |
"end = date(2025,2,20)\n", | |
"\n", | |
"d = start\n", | |
"while d <= end:\n", | |
" jsonl = f\"https://archive.analytics.mybinder.org/events-{d.isoformat()}.jsonl\"\n", | |
" outfile = os.path.basename(jsonl)\n", | |
" d += timedelta(days=1)\n", | |
" if os.path.exists(outfile):\n", | |
" print(f\"{outfile} exists, delete to force a refresh\")\n", | |
" continue\n", | |
" else:\n", | |
" print(f\"Downloading {jsonl}\")\n", | |
" r = requests.get(jsonl)\n", | |
" r.raise_for_status()\n", | |
" with open(outfile, \"wb\") as f:\n", | |
" f.write(r.content)\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "1285a570-390f-4fb8-8b00-4a4d9410b483", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = None\n", | |
"for f in sorted(glob(\"events-*.jsonl\")):\n", | |
" f = pd.read_json(f, lines=True)\n", | |
" if df is None:\n", | |
" df = f\n", | |
" else:\n", | |
" df = pd.concat([df, f], axis=0)\n", | |
"\n", | |
"df = df.reset_index(drop=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "e2cf8088-9f8e-4c1f-811a-e04110836a54", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>build_token</th>\n", | |
" <th>origin</th>\n", | |
" <th>provider</th>\n", | |
" <th>ref</th>\n", | |
" <th>schema</th>\n", | |
" <th>spec</th>\n", | |
" <th>status</th>\n", | |
" <th>timestamp</th>\n", | |
" <th>version</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>198910</th>\n", | |
" <td>False</td>\n", | |
" <td>2i2c.mybinder.org</td>\n", | |
" <td>GitHub</td>\n", | |
" <td>f37d4c63fb04b4217b33ae473843b9fdc388a2a7</td>\n", | |
" <td>binderhub.jupyter.org/launch</td>\n", | |
" <td>hanlpbot/hanlp-binder/main</td>\n", | |
" <td>success</td>\n", | |
" <td>2025-02-20 23:59:00+00:00</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>198911</th>\n", | |
" <td>False</td>\n", | |
" <td>2i2c.mybinder.org</td>\n", | |
" <td>GitHub</td>\n", | |
" <td>f37d4c63fb04b4217b33ae473843b9fdc388a2a7</td>\n", | |
" <td>binderhub.jupyter.org/launch</td>\n", | |
" <td>hanlpbot/hanlp-binder/main</td>\n", | |
" <td>success</td>\n", | |
" <td>2025-02-20 23:59:00+00:00</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>198912</th>\n", | |
" <td>True</td>\n", | |
" <td>2i2c.mybinder.org</td>\n", | |
" <td>GitHub</td>\n", | |
" <td>fc6d94b4372c23338db80ebd347b1339db1e7024</td>\n", | |
" <td>binderhub.jupyter.org/launch</td>\n", | |
" <td>janisbent/sc-lab/latest</td>\n", | |
" <td>success</td>\n", | |
" <td>2025-02-20 23:59:00+00:00</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" build_token origin provider \\\n", | |
"198910 False 2i2c.mybinder.org GitHub \n", | |
"198911 False 2i2c.mybinder.org GitHub \n", | |
"198912 True 2i2c.mybinder.org GitHub \n", | |
"\n", | |
" ref \\\n", | |
"198910 f37d4c63fb04b4217b33ae473843b9fdc388a2a7 \n", | |
"198911 f37d4c63fb04b4217b33ae473843b9fdc388a2a7 \n", | |
"198912 fc6d94b4372c23338db80ebd347b1339db1e7024 \n", | |
"\n", | |
" schema spec status \\\n", | |
"198910 binderhub.jupyter.org/launch hanlpbot/hanlp-binder/main success \n", | |
"198911 binderhub.jupyter.org/launch hanlpbot/hanlp-binder/main success \n", | |
"198912 binderhub.jupyter.org/launch janisbent/sc-lab/latest success \n", | |
"\n", | |
" timestamp version \n", | |
"198910 2025-02-20 23:59:00+00:00 5 \n", | |
"198911 2025-02-20 23:59:00+00:00 5 \n", | |
"198912 2025-02-20 23:59:00+00:00 5 " | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df.tail(3)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "74810819-6fb8-4a32-beb7-59ca8056fb6f", | |
"metadata": {}, | |
"source": [ | |
"Use the `spec` column as a way to identify each repository, and create a table showing the number of repositories that overlap between all pairs of federation members. The diagonal holds the number of distinct repositories seen on each member." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "a744f4ef-5fcb-48af-868c-bb41995e44db", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"spec_by_origin = {}\n", | |
"for origin in df.origin.unique():\n", | |
" spec_by_origin[origin] = df[df[\"origin\"]==origin][\"spec\"]\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "d9f3c05a-e2e5-4520-b3f7-e7886b3e4a4d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>2i2c-bare.mybinder.org</th>\n", | |
" <th>2i2c.mybinder.org</th>\n", | |
" <th>notebooks.gesis.org</th>\n", | |
" <th>ovh.mybinder.org</th>\n", | |
" <th>ovh2.mybinder.org</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2i2c-bare.mybinder.org</th>\n", | |
" <td>263</td>\n", | |
" <td>154</td>\n", | |
" <td>91</td>\n", | |
" <td>1</td>\n", | |
" <td>65</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2i2c.mybinder.org</th>\n", | |
" <td>154</td>\n", | |
" <td>5622</td>\n", | |
" <td>1051</td>\n", | |
" <td>5</td>\n", | |
" <td>714</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>notebooks.gesis.org</th>\n", | |
" <td>91</td>\n", | |
" <td>1051</td>\n", | |
" <td>2322</td>\n", | |
" <td>6</td>\n", | |
" <td>556</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>ovh.mybinder.org</th>\n", | |
" <td>1</td>\n", | |
" <td>5</td>\n", | |
" <td>6</td>\n", | |
" <td>7</td>\n", | |
" <td>4</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>ovh2.mybinder.org</th>\n", | |
" <td>65</td>\n", | |
" <td>714</td>\n", | |
" <td>556</td>\n", | |
" <td>4</td>\n", | |
" <td>1333</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 2i2c-bare.mybinder.org 2i2c.mybinder.org \\\n", | |
"2i2c-bare.mybinder.org 263 154 \n", | |
"2i2c.mybinder.org 154 5622 \n", | |
"notebooks.gesis.org 91 1051 \n", | |
"ovh.mybinder.org 1 5 \n", | |
"ovh2.mybinder.org 65 714 \n", | |
"\n", | |
" notebooks.gesis.org ovh.mybinder.org ovh2.mybinder.org \n", | |
"2i2c-bare.mybinder.org 91 1 65 \n", | |
"2i2c.mybinder.org 1051 5 714 \n", | |
"notebooks.gesis.org 2322 6 556 \n", | |
"ovh.mybinder.org 6 7 4 \n", | |
"ovh2.mybinder.org 556 4 1333 " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"origins = sorted(spec_by_origin.keys())\n", | |
"overlap = pd.DataFrame(index=origins, columns=origins)\n", | |
"for r in origins:\n", | |
" for c in origins:\n", | |
" overlap.loc[r, c] = len(set(spec_by_origin[r]).intersection(spec_by_origin[c]))\n", | |
"display(overlap)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "a010726f-e323-4385-b22a-fc21d2c68898", | |
"metadata": {}, | |
"source": [ | |
"For more accurate statistics, we should probably combine columns (e.g. `spec` together with `ref`) to find out the number of tagged images instead of the number of repositories." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "ce11ab68-f0ed-40fe-ab29-55b3b3984b65", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.12.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
pandas | |
requests |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment