Skip to content

Instantly share code, notes, and snippets.

@neubig
Created August 15, 2025 13:58
Show Gist options
  • Select an option

  • Save neubig/804fcd7b4201f4fb32a357cc95cdac79 to your computer and use it in GitHub Desktop.

Select an option

Save neubig/804fcd7b4201f4fb32a357cc95cdac79 to your computer and use it in GitHub Desktop.
A script to analyze the number of PRs created by `neubig` per month and how many of those PRs OpenHands contributed to.
import os, csv, requests, datetime as dt, calendar, time, random
from collections import OrderedDict
import argparse
import matplotlib.pyplot as plt
# GitHub REST search endpoint; queried with `type:pr` to list pull requests.
BASE_SEARCH_URL = 'https://api.github.com/search/issues'
def parse_link_header(value: str):
    """Parse an HTTP ``Link`` header into a ``{rel: url}`` mapping.

    Args:
        value: Raw header text, e.g.
            ``'<https://host/x?page=2>; rel="next", <...>; rel="last"'``.
            ``None`` or an empty string is accepted.

    Returns:
        A dict mapping each relation name (``'next'``, ``'last'``, ...) to its
        target URL. Entries without a ``rel`` parameter are skipped.
    """
    links = {}
    if not value:
        return links
    for part in value.split(','):
        section = part.split(';')
        if len(section) < 2:
            # No parameters after the target, so there is no rel to key on.
            continue
        # The target is wrapped in angle brackets: drop the first and last char.
        url = section[0].strip()[1:-1]
        rel = None
        for sec in section[1:]:
            # Split on the first '=' only and compare the parameter name
            # case-insensitively.
            name, sep, raw = sec.strip().partition('=')
            if sep and name.strip().lower() == 'rel':
                rel = raw.strip().strip('"')
        if rel:
            links[rel] = url
    return links
def github_get(url, params=None, max_retries=5):
    """GET a GitHub API URL, retrying on rate limits and transient 5xx errors.

    A ``GITHUB_TOKEN`` environment variable, when set, is sent as a bearer
    token.

    Args:
        url: Absolute API URL to request.
        params: Optional query parameters passed to ``requests.get``.
        max_retries: Maximum number of attempts. Values below 1 are treated
            as 1 so that a request is always made.

    Returns:
        The successful ``requests.Response``.

    Raises:
        requests.HTTPError: For a non-retryable error status, or when every
            attempt ended in a retryable error.
    """
    headers = {
        'Accept': 'application/vnd.github+json',
        'User-Agent': 'openhands-analysis-script',
    }
    token = os.getenv('GITHUB_TOKEN')
    if token:
        headers['Authorization'] = f'Bearer {token}'

    # Guarantee at least one attempt so `resp` is always bound after the loop.
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        resp = requests.get(url, headers=headers, params=params, timeout=30)
        is_last = attempt + 1 >= attempts

        # Rate limiting is reported as 429, or as 403 with an explanatory body.
        rate_limited = resp.status_code == 429 or (
            resp.status_code == 403 and 'rate limit' in resp.text.lower()
        )
        if rate_limited:
            if is_last:
                break  # No further attempt follows, so waiting is pointless.
            retry_after = resp.headers.get('Retry-After')
            reset = resp.headers.get('X-RateLimit-Reset')
            if retry_after and retry_after.isdigit():
                # An explicit server-provided delay takes precedence.
                wait_s = max(int(retry_after), 1)
            elif reset:
                # Reset is an epoch timestamp; wait until then, at least 5s.
                wait_s = max(int(reset) - int(time.time()) + 1, 5)
            else:
                wait_s = 10 * (attempt + 1)
            time.sleep(wait_s)
            continue

        if resp.status_code in (502, 503, 504):
            if is_last:
                break
            # Linear backoff for transient gateway errors.
            time.sleep(2 * (attempt + 1))
            continue

        resp.raise_for_status()
        return resp

    # Retries exhausted: surface the last error response to the caller.
    resp.raise_for_status()
    return resp
def month_start_list(months_back: int):
    """Return the first day of each of the last ``months_back`` months.

    The list is in chronological order and ends with the first day of the
    current month. A non-positive ``months_back`` yields an empty list.
    """
    today = dt.date.today()
    # Place months on one linear axis (year * 12 + zero-based month) so that
    # stepping back across a year boundary is plain integer arithmetic.
    current = today.year * 12 + (today.month - 1)
    out = []
    for offset in range(months_back - 1, -1, -1):
        year, month0 = divmod(current - offset, 12)
        out.append(dt.date(year, month0 + 1, 1))
    return out
def month_range(d: dt.date):
    """Return the ``(start, end)`` dates to query for the month containing ``d``.

    ``start`` is ``d`` itself. ``end`` is today's date when ``d`` falls in the
    current month, otherwise the last calendar day of ``d``'s month.
    """
    today = dt.date.today()
    if (d.year, d.month) == (today.year, today.month):
        # The current month is still in progress: stop at today.
        return d, today
    last_day = calendar.monthrange(d.year, d.month)[1]
    return d, dt.date(d.year, d.month, last_day)
def search_prs_for_month(user: str, start_date: dt.date, end_date: dt.date):
    """Return all search hits for PRs authored by ``user`` in a date range.

    Queries ``BASE_SEARCH_URL`` for ``author:<user> type:pr`` items created
    between ``start_date`` and ``end_date`` (both passed as ISO dates), and
    follows ``Link: rel="next"`` pagination until no further page is offered.

    Returns:
        A list of the raw ``items`` dicts from every page, in ascending
        creation order as requested from the API.

    NOTE(review): the search API is believed to cap the results reachable for
    a single query, so a very busy month could be undercounted — confirm
    against the API documentation if that matters for your user.
    """
    q = f"author:{user} type:pr created:{start_date.isoformat()}..{end_date.isoformat()}"
    params = {'q': q, 'per_page': 100, 'page': 1, 'sort': 'created', 'order': 'asc'}
    items = []
    url = BASE_SEARCH_URL
    while url:
        resp = github_get(url, params=params)
        # `or []` tolerates a payload where "items" is present but null.
        items.extend(resp.json().get('items') or [])
        # The "next" URL already embeds the full query string, so the explicit
        # params are sent with the first request only.
        url = parse_link_header(resp.headers.get('Link', '')).get('next')
        params = None
    return items
def pr_has_openhands_commit_email(pr_api_url: str) -> bool:
    """Report whether any commit on a PR has an author/committer email containing "openhands".

    Args:
        pr_api_url: API URL of the pull request; ``/commits`` is appended.

    Returns:
        ``True`` as soon as a matching commit is found (remaining pages are
        not fetched), ``False`` if no commit on any page matches.
    """
    url = pr_api_url.rstrip('/') + '/commits'
    # 100 is the largest page size the API accepts; pagination via the Link
    # header covers the rest.
    params = {'per_page': 100, 'page': 1}
    while url:
        resp = github_get(url, params=params)
        for c in resp.json():
            cb = c.get('commit') or {}
            for who in ('author', 'committer'):
                email = (cb.get(who) or {}).get('email')
                if email and 'openhands' in email.lower():
                    return True
        # The "next" URL carries its own query string; drop explicit params.
        url = parse_link_header(resp.headers.get('Link', '')).get('next')
        params = None
    return False
def _pr_api_url(item):
    """Return the pull-request API URL for a search hit, or ``None`` if it cannot be derived."""
    pr_url = (item.get('pull_request') or {}).get('url')
    if pr_url:
        return pr_url
    # Fall back to building the URL from the repository URL and PR number.
    repo_api = item.get('repository_url')
    number = item.get('number')
    if repo_api and number:
        return f"{repo_api}/pulls/{number}"
    return None


def crawl_and_sample(user: str, months_back: int, sample_per_month: int, seed: int = 42):
    """Count a user's PRs per month and sample them for OpenHands involvement.

    For each of the last ``months_back`` months, all PRs authored by ``user``
    are listed, up to ``sample_per_month`` of them are drawn at random, and
    each sampled PR's commits are checked for an "openhands" email.

    Args:
        user: GitHub login whose PRs are analyzed.
        months_back: Number of months to cover, ending with the current month.
        sample_per_month: Maximum number of PRs to inspect per month.
        seed: Seed for the sampling RNG, so runs are reproducible.

    Returns:
        One dict per month with keys ``month`` (``YYYY-MM``), ``total_prs``,
        ``sample_n``, ``sample_with_agent``, ``sample_without_agent`` and
        ``ratio_without_agent`` (a 6-decimal string, or ``''`` when nothing
        was sampled).

    Raises:
        ValueError: If ``sample_per_month`` is negative.
    """
    if sample_per_month < 0:
        # Fail before any network traffic rather than inside random.sample.
        raise ValueError('sample_per_month must be non-negative')

    # A private generator gives the same draws as seeding the module-level
    # RNG, without disturbing global random state for other code.
    rng = random.Random(seed)
    records = []
    for ms in month_start_list(months_back):
        start, end = month_range(ms)
        items = search_prs_for_month(user, start, end)
        total_prs = len(items)
        k = min(sample_per_month, total_prs)

        without = 0
        if k:
            for it in rng.sample(items, k):
                pr_url = _pr_api_url(it)
                # A PR whose URL cannot be resolved is counted as "without agent".
                if not pr_url or not pr_has_openhands_commit_email(pr_url):
                    without += 1

        records.append({
            'month': ms.strftime('%Y-%m'),
            'total_prs': total_prs,
            'sample_n': k,
            'sample_with_agent': k - without,
            'sample_without_agent': without,
            'ratio_without_agent': f"{without / k:.6f}" if k else '',
        })
    return records
def write_csv(records, path: str):
    """Write per-month records to ``path`` as CSV with a header row.

    Args:
        records: Iterable of dicts with the keys ``month``, ``total_prs``,
            ``sample_n``, ``sample_with_agent``, ``sample_without_agent`` and
            ``ratio_without_agent``.
        path: Destination file; overwritten if it exists.
    """
    fieldnames = [
        'month',
        'total_prs',
        'sample_n',
        'sample_with_agent',
        'sample_without_agent',
        'ratio_without_agent',
    ]
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='') as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        w.writerows(records)
def read_csv(path: str):
    """Load per-month records from a CSV file, converting numeric columns.

    The four count columns are converted to ``int``. ``ratio_without_agent``
    becomes a ``float``, or ``None`` when the cell is empty or the column is
    missing. Other columns are left as strings.

    Returns:
        A list of row dicts in file order.
    """
    int_fields = ('total_prs', 'sample_n', 'sample_with_agent', 'sample_without_agent')
    out = []
    with open(path, 'r', newline='') as f:
        for row in csv.DictReader(f):
            for name in int_fields:
                row[name] = int(row[name])
            ratio_str = row.get('ratio_without_agent', '')
            # An empty cell means nothing was sampled for that month.
            row['ratio_without_agent'] = float(ratio_str) if ratio_str not in (None, '') else None
            out.append(row)
    return out
def plot_stacked_sample(records, out_path: str):
    """Save a stacked bar chart splitting each month's PRs into w/o vs w/ agent.

    Each month's total is scaled by the sampled ``ratio_without_agent`` to
    estimate the "without agent" count; the remainder is "with agent".

    Args:
        records: Per-month dicts with ``month``, ``total_prs`` and
            ``ratio_without_agent``. The ratio may be a float, a numeric
            string, ``''`` or ``None``; missing ratios are treated as 0.0.
        out_path: Image file to write.
    """
    labels = [r['month'] for r in records]
    without_est = []
    with_est = []
    for r in records:
        tot = r['total_prs']
        ratio_wo = r.get('ratio_without_agent')
        # crawl_and_sample stores the ratio as a formatted string ('' when
        # nothing was sampled) while read_csv yields float/None. Coerce here:
        # multiplying an int by a str would repeat the string and then fail
        # on the subtraction below.
        ratio_wo = float(ratio_wo) if ratio_wo not in (None, '') else 0.0
        wout = tot * ratio_wo
        without_est.append(wout)
        with_est.append(tot - wout)

    x = list(range(len(labels)))
    fig = plt.figure(figsize=(14, 4))
    plt.bar(x, without_est, color='#4C78A8', label='w/o agent (est)')
    plt.bar(x, with_est, bottom=without_est, color='#E45756', label='w/ agent (est)')
    plt.xticks(x, labels, rotation=45, ha='right', fontsize=8)
    plt.ylabel('Total PRs (estimated split)')
    plt.title('Estimated split of total PRs per month: w/ agent vs w/o agent')
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_path, dpi=200)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def plot_total_prs(records, out_path: str):
    """Save a bar chart of the total number of PRs created in each month.

    Args:
        records: Per-month dicts providing ``month`` and ``total_prs``.
        out_path: Image file to write.
    """
    labels = [r['month'] for r in records]
    totals = [r['total_prs'] for r in records]
    x = list(range(len(labels)))
    fig = plt.figure(figsize=(14, 4))
    plt.bar(x, totals, color='#4C78A8')
    plt.xticks(x, labels, rotation=45, ha='right', fontsize=8)
    plt.ylabel('PRs created')
    plt.title('PRs created per month')
    plt.tight_layout()
    plt.savefig(out_path, dpi=200)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def main():
    """Command-line entry point with two subcommands.

    ``crawl`` queries GitHub, writes the CSV, and renders both charts.
    ``read`` loads an existing CSV and renders only the stacked chart.
    """
    parser = argparse.ArgumentParser(description='Analyze PRs by month and agent involvement.')
    sub = parser.add_subparsers(dest='cmd', required=True)

    p_crawl = sub.add_parser('crawl', help='Crawl GitHub and write CSV and charts')
    p_crawl.add_argument('--user', default='neubig')
    p_crawl.add_argument('--months-back', type=int, default=24)
    p_crawl.add_argument('--sample-per-month', type=int, default=10)
    p_crawl.add_argument('--seed', type=int, default=42)
    p_crawl.add_argument('--csv', default='/workspace/project/neubig_prs_monthly.csv')
    p_crawl.add_argument('--out-total', default='/workspace/project/neubig_prs_per_month.png')
    p_crawl.add_argument('--out-stacked', default='/workspace/project/neubig_prs_stacked.png')

    p_read = sub.add_parser('read', help='Read CSV and make stacked bar chart')
    p_read.add_argument('--csv', default='/workspace/project/neubig_prs_monthly.csv')
    p_read.add_argument('--out-stacked', default='/workspace/project/neubig_prs_stacked.png')

    args = parser.parse_args()
    if args.cmd == 'crawl':
        recs = crawl_and_sample(args.user, args.months_back, args.sample_per_month, args.seed)
        # Persist the CSV first so crawl results survive a plotting failure.
        write_csv(recs, args.csv)
        plot_total_prs(recs, args.out_total)
        plot_stacked_sample(recs, args.out_stacked)
        print(f'Wrote CSV to {args.csv}')
        print(f'Wrote total chart to {args.out_total}')
        print(f'Wrote stacked chart to {args.out_stacked}')
    elif args.cmd == 'read':
        recs = read_csv(args.csv)
        plot_stacked_sample(recs, args.out_stacked)
        print(f'Wrote stacked chart to {args.out_stacked}')
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment