A script to analyze the number of PRs created by `neubig` per month and estimate how many of them OpenHands contributed to.
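A minimal usage sketch, assuming the script is saved as `analyze_prs.py` (hypothetical filename) and that `GITHUB_TOKEN` is exported so authenticated rate limits apply:

    python analyze_prs.py crawl --user neubig --months-back 24 --sample-per-month 10
    python analyze_prs.py read --csv /workspace/project/neubig_prs_monthly.csv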
import os, csv, requests, datetime as dt, calendar, time, random
import argparse
import matplotlib.pyplot as plt

BASE_SEARCH_URL = 'https://api.github.com/search/issues'
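
# Parses the HTTP `Link` response header that GitHub uses for pagination,
# returning a dict that maps rel names ('next', 'prev', 'last', ...) to URLs.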
def parse_link_header(value: str):
    links = {}
    if not value:
        return links
    for part in value.split(','):
        section = part.split(';')
        if len(section) < 2:
            continue
        url = section[0].strip()[1:-1]
        rel = None
        for sec in section[1:]:
            sec = sec.strip()
            if sec.startswith('rel='):
                rel = sec.split('=')[1].strip('"')
        if rel:
            links[rel] = url
    return links
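
# GET wrapper around the GitHub REST API: sends a JSON Accept header, attaches
# a Bearer token when GITHUB_TOKEN is set, sleeps until the advertised
# X-RateLimit-Reset time on rate-limit 403s, and retries transient 502/503/504
# responses with a linear backoff.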
def github_get(url, params=None, max_retries=5):
    headers = {
        'Accept': 'application/vnd.github+json',
        'User-Agent': 'openhands-analysis-script'
    }
    token = os.getenv('GITHUB_TOKEN')
    if token:
        headers['Authorization'] = f'Bearer {token}'
    for attempt in range(max_retries):
        resp = requests.get(url, headers=headers, params=params, timeout=30)
        if resp.status_code == 403 and 'rate limit' in resp.text.lower():
            reset = resp.headers.get('X-RateLimit-Reset')
            if reset:
                wait_s = max(int(reset) - int(time.time()) + 1, 5)
            else:
                wait_s = 10 * (attempt + 1)
            time.sleep(wait_s)
            continue
        if resp.status_code in (502, 503, 504):
            time.sleep(2 * (attempt + 1))
            continue
        resp.raise_for_status()
        return resp
    resp.raise_for_status()
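
# Month helpers: month_start_list returns the first day of each of the last
# `months_back` calendar months (oldest first, ending with the current month);
# month_range turns a month-start date into an inclusive (start, end) pair,
# truncating the current month at today's date.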
def month_start_list(months_back: int):
    today = dt.date.today()
    first_of_this_month = dt.date(today.year, today.month, 1)
    out = []
    y, m = first_of_this_month.year, first_of_this_month.month
    for i in range(months_back):
        mm = m - (months_back - 1 - i)
        yy = y
        while mm <= 0:
            mm += 12
            yy -= 1
        out.append(dt.date(yy, mm, 1))
    return out

def month_range(d: dt.date):
    today = dt.date.today()
    start = d
    if d.year == today.year and d.month == today.month:
        end = today
    else:
        end_day = calendar.monthrange(d.year, d.month)[1]
        end = dt.date(d.year, d.month, end_day)
    return start, end
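
# Counts the user's PRs via the Search API (`author:<user> type:pr
# created:<start>..<end>`) and follows `Link: rel="next"` pagination. Searching
# one calendar month at a time also keeps each query under the Search API's
# 1000-result cap per query.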
def search_prs_for_month(user: str, start_date: dt.date, end_date: dt.date):
    q = f"author:{user} type:pr created:{start_date.isoformat()}..{end_date.isoformat()}"
    params = {'q': q, 'per_page': 100, 'page': 1, 'sort': 'created', 'order': 'asc'}
    items = []
    url = BASE_SEARCH_URL
    while True:
        resp = github_get(url, params=params)
        data = resp.json()
        items.extend(data.get('items', []))
        links = parse_link_header(resp.headers.get('Link', ''))
        if 'next' in links:
            url = links['next']
            params = None
        else:
            break
    return items
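
# Heuristic for agent involvement: a PR counts as "with agent" if any commit's
# author or committer email contains the substring 'openhands'. Note that the
# PR commits endpoint lists at most 250 commits per pull request, and GitHub
# generally caps per_page at 100, so per_page=250 below effectively yields 100
# per page with pagination covering the rest.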
def pr_has_openhands_commit_email(pr_api_url: str) -> bool:
    url = pr_api_url.rstrip('/') + '/commits'
    params = {'per_page': 250, 'page': 1}
    while True:
        resp = github_get(url, params=params)
        commits = resp.json()
        for c in commits:
            cb = c.get('commit', {})
            for who in ('author', 'committer'):
                email = (cb.get(who) or {}).get('email')
                if email and 'openhands' in email.lower():
                    return True
        links = parse_link_header(resp.headers.get('Link', ''))
        if 'next' in links:
            url = links['next']
            params = None
        else:
            break
    return False
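
# For each month: count the user's PRs via search, randomly sample up to
# `sample_per_month` of them (seeded for reproducibility), and classify each
# sampled PR by inspecting its commit emails. The sampled without-agent ratio
# is later used to estimate the split of the month's total.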
def crawl_and_sample(user: str, months_back: int, sample_per_month: int, seed: int = 42):
    random.seed(seed)
    months = month_start_list(months_back)
    records = []
    for ms in months:
        start, end = month_range(ms)
        items = search_prs_for_month(user, start, end)
        month_key = ms.strftime('%Y-%m')
        total_prs = len(items)
        k = min(sample_per_month, total_prs)
        if k == 0:
            records.append({
                'month': month_key,
                'total_prs': total_prs,
                'sample_n': 0,
                'sample_with_agent': 0,
                'sample_without_agent': 0,
                'ratio_without_agent': ''
            })
            continue
        sample = random.sample(items, k)
        without = 0
        for it in sample:
            pr_url = (it.get('pull_request') or {}).get('url')
            if not pr_url:
                repo_api = it.get('repository_url')
                number = it.get('number')
                if repo_api and number:
                    pr_url = f"{repo_api}/pulls/{number}"
            if not pr_url:
                without += 1
                continue
            has_openhands = pr_has_openhands_commit_email(pr_url)
            if not has_openhands:
                without += 1
        with_agent = k - without
        ratio_wo = without / k if k else ''
        records.append({
            'month': month_key,
            'total_prs': total_prs,
            'sample_n': k,
            'sample_with_agent': with_agent,
            'sample_without_agent': without,
            'ratio_without_agent': f"{ratio_wo:.6f}" if k else ''
        })
    return records
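
# CSV round-trip: write_csv stores one row per month with the sampled counts;
# read_csv loads it back, coercing counts to int and the ratio to float (or
# None for months with an empty sample).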
def write_csv(records, path: str):
    fieldnames = ['month', 'total_prs', 'sample_n', 'sample_with_agent', 'sample_without_agent', 'ratio_without_agent']
    with open(path, 'w', newline='') as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        for r in records:
            w.writerow(r)

def read_csv(path: str):
    out = []
    with open(path, 'r', newline='') as f:
        r = csv.DictReader(f)
        for row in r:
            row['total_prs'] = int(row['total_prs'])
            row['sample_n'] = int(row['sample_n'])
            row['sample_with_agent'] = int(row['sample_with_agent'])
            row['sample_without_agent'] = int(row['sample_without_agent'])
            ratio_str = row.get('ratio_without_agent', '')
            row['ratio_without_agent'] = float(ratio_str) if ratio_str not in (None, '') else None
            out.append(row)
    return out
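
# Plotting: plot_stacked_sample scales each month's total PR count by the
# sampled without-agent ratio to estimate a with/without-agent split and draws
# a stacked bar chart; plot_total_prs draws the raw monthly totals.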
def plot_stacked_sample(records, out_path: str):
    labels = [r['month'] for r in records]
    totals = [r['total_prs'] for r in records]
    ratios_wo = [r.get('ratio_without_agent') for r in records]
    # Estimate counts by scaling total by sampled ratio
    without_est = []
    with_est = []
    for tot, ratio_wo in zip(totals, ratios_wo):
        if ratio_wo is None:
            # No sampled ratio for this month; fall back to 0.0
            ratio_wo = 0.0
        wout = tot * ratio_wo
        win = tot - wout
        without_est.append(wout)
        with_est.append(win)
    x = list(range(len(labels)))
    plt.figure(figsize=(14, 4))
    plt.bar(x, without_est, color='#4C78A8', label='w/o agent (est)')
    plt.bar(x, with_est, bottom=without_est, color='#E45756', label='w/ agent (est)')
    plt.xticks(x, labels, rotation=45, ha='right', fontsize=8)
    plt.ylabel('Total PRs (estimated split)')
    plt.title('Estimated split of total PRs per month: w/ agent vs w/o agent')
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_path, dpi=200)

def plot_total_prs(records, out_path: str):
    labels = [r['month'] for r in records]
    totals = [r['total_prs'] for r in records]
    x = list(range(len(labels)))
    plt.figure(figsize=(14, 4))
    plt.bar(x, totals, color='#4C78A8')
    plt.xticks(x, labels, rotation=45, ha='right', fontsize=8)
    plt.ylabel('PRs created')
    plt.title('PRs created per month')
    plt.tight_layout()
    plt.savefig(out_path, dpi=200)
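
# CLI: `crawl` queries GitHub, writes the monthly CSV, and renders both charts;
# `read` rebuilds the stacked chart from an existing CSV without hitting the API.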
def main():
    parser = argparse.ArgumentParser(description='Analyze PRs by month and agent involvement.')
    sub = parser.add_subparsers(dest='cmd', required=True)
    p_crawl = sub.add_parser('crawl', help='Crawl GitHub and write CSV and charts')
    p_crawl.add_argument('--user', default='neubig')
    p_crawl.add_argument('--months-back', type=int, default=24)
    p_crawl.add_argument('--sample-per-month', type=int, default=10)
    p_crawl.add_argument('--seed', type=int, default=42)
    p_crawl.add_argument('--csv', default='/workspace/project/neubig_prs_monthly.csv')
    p_crawl.add_argument('--out-total', default='/workspace/project/neubig_prs_per_month.png')
    p_crawl.add_argument('--out-stacked', default='/workspace/project/neubig_prs_stacked.png')
    p_read = sub.add_parser('read', help='Read CSV and make stacked bar chart')
    p_read.add_argument('--csv', default='/workspace/project/neubig_prs_monthly.csv')
    p_read.add_argument('--out-stacked', default='/workspace/project/neubig_prs_stacked.png')
    args = parser.parse_args()
    if args.cmd == 'crawl':
        recs = crawl_and_sample(args.user, args.months_back, args.sample_per_month, args.seed)
        write_csv(recs, args.csv)
        plot_total_prs(recs, args.out_total)
        plot_stacked_sample(recs, args.out_stacked)
        print(f'Wrote CSV to {args.csv}')
        print(f'Wrote total chart to {args.out_total}')
        print(f'Wrote stacked chart to {args.out_stacked}')
    elif args.cmd == 'read':
        recs = read_csv(args.csv)
        plot_stacked_sample(recs, args.out_stacked)
        print(f'Wrote stacked chart to {args.out_stacked}')


if __name__ == '__main__':
    main()