A script to analyze the number of PRs created by `neubig` per month and estimate how many of them OpenHands contributed to.
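A minimal usage sketch, assuming the script is saved as `analyze_prs.py` (hypothetical filename) and that `GITHUB_TOKEN` is exported so authenticated rate limits apply:

    python analyze_prs.py crawl --user neubig --months-back 24 --sample-per-month 10
    python analyze_prs.py read --csv /workspace/project/neubig_prs_monthly.csv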
import os, csv, requests, datetime as dt, calendar, time, random
import argparse
import matplotlib.pyplot as plt

BASE_SEARCH_URL = 'https://api.github.com/search/issues'
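
# Parses the HTTP `Link` response header that GitHub uses for pagination,
# returning a dict that maps rel names ('next', 'prev', 'last', ...) to URLs.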
def parse_link_header(value: str):
    links = {}
    if not value:
        return links
    for part in value.split(','):
        section = part.split(';')
        if len(section) < 2:
            continue
        url = section[0].strip()[1:-1]
        rel = None
        for sec in section[1:]:
            sec = sec.strip()
            if sec.startswith('rel='):
                rel = sec.split('=')[1].strip('"')
        if rel:
            links[rel] = url
    return links
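
# GET wrapper around the GitHub REST API: sends a JSON Accept header, attaches
# a Bearer token when GITHUB_TOKEN is set, sleeps until the advertised
# X-RateLimit-Reset time on rate-limit 403s, and retries transient 502/503/504
# responses with a linear backoff.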
def github_get(url, params=None, max_retries=5):
    headers = {
        'Accept': 'application/vnd.github+json',
        'User-Agent': 'openhands-analysis-script'
    }
    token = os.getenv('GITHUB_TOKEN')
    if token:
        headers['Authorization'] = f'Bearer {token}'
    for attempt in range(max_retries):
        resp = requests.get(url, headers=headers, params=params, timeout=30)
        if resp.status_code == 403 and 'rate limit' in resp.text.lower():
            reset = resp.headers.get('X-RateLimit-Reset')
            if reset:
                wait_s = max(int(reset) - int(time.time()) + 1, 5)
            else:
                wait_s = 10 * (attempt + 1)
            time.sleep(wait_s)
            continue
        if resp.status_code in (502, 503, 504):
            time.sleep(2 * (attempt + 1))
            continue
        resp.raise_for_status()
        return resp
    resp.raise_for_status()
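
# Month helpers: month_start_list returns the first day of each of the last
# `months_back` calendar months (oldest first, ending with the current month);
# month_range turns a month-start date into an inclusive (start, end) pair,
# truncating the current month at today's date.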
def month_start_list(months_back: int):
    today = dt.date.today()
    first_of_this_month = dt.date(today.year, today.month, 1)
    out = []
    y, m = first_of_this_month.year, first_of_this_month.month
    for i in range(months_back):
        mm = m - (months_back - 1 - i)
        yy = y
        while mm <= 0:
            mm += 12
            yy -= 1
        out.append(dt.date(yy, mm, 1))
    return out

def month_range(d: dt.date):
    today = dt.date.today()
    start = d
    if d.year == today.year and d.month == today.month:
        end = today
    else:
        end_day = calendar.monthrange(d.year, d.month)[1]
        end = dt.date(d.year, d.month, end_day)
    return start, end
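
# Counts the user's PRs via the Search API (`author:<user> type:pr
# created:<start>..<end>`) and follows `Link: rel="next"` pagination. Searching
# one calendar month at a time also keeps each query under the Search API's
# 1000-result cap per query.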
def search_prs_for_month(user: str, start_date: dt.date, end_date: dt.date):
    q = f"author:{user} type:pr created:{start_date.isoformat()}..{end_date.isoformat()}"
    params = {'q': q, 'per_page': 100, 'page': 1, 'sort': 'created', 'order': 'asc'}
    items = []
    url = BASE_SEARCH_URL
    while True:
        resp = github_get(url, params=params)
        data = resp.json()
        items.extend(data.get('items', []))
        links = parse_link_header(resp.headers.get('Link', ''))
        if 'next' in links:
            url = links['next']
            params = None
        else:
            break
    return items
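
# Heuristic for agent involvement: a PR counts as "with agent" if any commit's
# author or committer email contains the substring 'openhands'. Note that the
# PR commits endpoint lists at most 250 commits per pull request, and GitHub
# generally caps per_page at 100, so per_page=250 below effectively yields 100
# per page with pagination covering the rest.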
def pr_has_openhands_commit_email(pr_api_url: str) -> bool:
    url = pr_api_url.rstrip('/') + '/commits'
    params = {'per_page': 250, 'page': 1}
    while True:
        resp = github_get(url, params=params)
        commits = resp.json()
        for c in commits:
            cb = c.get('commit', {})
            for who in ('author', 'committer'):
                email = (cb.get(who) or {}).get('email')
                if email and 'openhands' in email.lower():
                    return True
        links = parse_link_header(resp.headers.get('Link', ''))
        if 'next' in links:
            url = links['next']
            params = None
        else:
            break
    return False
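
# For each month: count the user's PRs via search, randomly sample up to
# `sample_per_month` of them (seeded for reproducibility), and classify each
# sampled PR by inspecting its commit emails. The sampled without-agent ratio
# is later used to estimate the split of the month's total.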
def crawl_and_sample(user: str, months_back: int, sample_per_month: int, seed: int = 42):
    random.seed(seed)
    months = month_start_list(months_back)
    records = []
    for ms in months:
        start, end = month_range(ms)
        items = search_prs_for_month(user, start, end)
        month_key = ms.strftime('%Y-%m')
        total_prs = len(items)
        k = min(sample_per_month, total_prs)
        if k == 0:
            records.append({
                'month': month_key,
                'total_prs': total_prs,
                'sample_n': 0,
                'sample_with_agent': 0,
                'sample_without_agent': 0,
                'ratio_without_agent': ''
            })
            continue
        sample = random.sample(items, k)
        without = 0
        for it in sample:
            pr_url = (it.get('pull_request') or {}).get('url')
            if not pr_url:
                repo_api = it.get('repository_url')
                number = it.get('number')
                if repo_api and number:
                    pr_url = f"{repo_api}/pulls/{number}"
            if not pr_url:
                without += 1
                continue
            has_openhands = pr_has_openhands_commit_email(pr_url)
            if not has_openhands:
                without += 1
        with_agent = k - without
        ratio_wo = without / k if k else ''
        records.append({
            'month': month_key,
            'total_prs': total_prs,
            'sample_n': k,
            'sample_with_agent': with_agent,
            'sample_without_agent': without,
            'ratio_without_agent': f"{ratio_wo:.6f}" if k else ''
        })
    return records
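
# CSV round-trip: write_csv stores one row per month with the sampled counts;
# read_csv loads it back, coercing counts to int and the ratio to float (or
# None for months with an empty sample).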
def write_csv(records, path: str):
    fieldnames = ['month', 'total_prs', 'sample_n', 'sample_with_agent', 'sample_without_agent', 'ratio_without_agent']
    with open(path, 'w', newline='') as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        for r in records:
            w.writerow(r)

def read_csv(path: str):
    out = []
    with open(path, 'r', newline='') as f:
        r = csv.DictReader(f)
        for row in r:
            row['total_prs'] = int(row['total_prs'])
            row['sample_n'] = int(row['sample_n'])
            row['sample_with_agent'] = int(row['sample_with_agent'])
            row['sample_without_agent'] = int(row['sample_without_agent'])
            ratio_str = row.get('ratio_without_agent', '')
            row['ratio_without_agent'] = float(ratio_str) if ratio_str not in (None, '') else None
            out.append(row)
    return out
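
# Plotting: plot_stacked_sample scales each month's total PR count by the
# sampled without-agent ratio to estimate a with/without-agent split and draws
# a stacked bar chart; plot_total_prs draws the raw monthly totals.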
def plot_stacked_sample(records, out_path: str):
    labels = [r['month'] for r in records]
    totals = [r['total_prs'] for r in records]
    ratios_wo = [r.get('ratio_without_agent') for r in records]
    # Estimate counts by scaling total by sampled ratio
    without_est = []
    with_est = []
    for tot, ratio_wo in zip(totals, ratios_wo):
        if ratio_wo is None:
            # No sampled ratio for this month; fall back to 0.0
            ratio_wo = 0.0
        wout = tot * ratio_wo
        win = tot - wout
        without_est.append(wout)
        with_est.append(win)
    x = list(range(len(labels)))
    plt.figure(figsize=(14, 4))
    plt.bar(x, without_est, color='#4C78A8', label='w/o agent (est)')
    plt.bar(x, with_est, bottom=without_est, color='#E45756', label='w/ agent (est)')
    plt.xticks(x, labels, rotation=45, ha='right', fontsize=8)
    plt.ylabel('Total PRs (estimated split)')
    plt.title('Estimated split of total PRs per month: w/ agent vs w/o agent')
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_path, dpi=200)

def plot_total_prs(records, out_path: str):
    labels = [r['month'] for r in records]
    totals = [r['total_prs'] for r in records]
    x = list(range(len(labels)))
    plt.figure(figsize=(14, 4))
    plt.bar(x, totals, color='#4C78A8')
    plt.xticks(x, labels, rotation=45, ha='right', fontsize=8)
    plt.ylabel('PRs created')
    plt.title('PRs created per month')
    plt.tight_layout()
    plt.savefig(out_path, dpi=200)
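
# CLI: `crawl` queries GitHub, writes the monthly CSV, and renders both charts;
# `read` rebuilds the stacked chart from an existing CSV without hitting the API.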
def main():
    parser = argparse.ArgumentParser(description='Analyze PRs by month and agent involvement.')
    sub = parser.add_subparsers(dest='cmd', required=True)
    p_crawl = sub.add_parser('crawl', help='Crawl GitHub and write CSV and charts')
    p_crawl.add_argument('--user', default='neubig')
    p_crawl.add_argument('--months-back', type=int, default=24)
    p_crawl.add_argument('--sample-per-month', type=int, default=10)
    p_crawl.add_argument('--seed', type=int, default=42)
    p_crawl.add_argument('--csv', default='/workspace/project/neubig_prs_monthly.csv')
    p_crawl.add_argument('--out-total', default='/workspace/project/neubig_prs_per_month.png')
    p_crawl.add_argument('--out-stacked', default='/workspace/project/neubig_prs_stacked.png')
    p_read = sub.add_parser('read', help='Read CSV and make stacked bar chart')
    p_read.add_argument('--csv', default='/workspace/project/neubig_prs_monthly.csv')
    p_read.add_argument('--out-stacked', default='/workspace/project/neubig_prs_stacked.png')
    args = parser.parse_args()
    if args.cmd == 'crawl':
        recs = crawl_and_sample(args.user, args.months_back, args.sample_per_month, args.seed)
        write_csv(recs, args.csv)
        plot_total_prs(recs, args.out_total)
        plot_stacked_sample(recs, args.out_stacked)
        print(f'Wrote CSV to {args.csv}')
        print(f'Wrote total chart to {args.out_total}')
        print(f'Wrote stacked chart to {args.out_stacked}')
    elif args.cmd == 'read':
        recs = read_csv(args.csv)
        plot_stacked_sample(recs, args.out_stacked)
        print(f'Wrote stacked chart to {args.out_stacked}')


if __name__ == '__main__':
    main()