Last active
November 10, 2025 13:04
-
-
Save edsu/3a30bb66cf15165a5e7078d22c7a3082 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # | |
| # This program will introspect on an OpenAlex API filter call and try to | |
| # determine what record in the cursored result set is causing a problem. | |
| # | |
| # ./openalexbug.py https://api.openalex.org/works?filter=author.id:https://openalex.org/A5003671931&cursor=&per-page=200 | |
| # problem record: https://openalex.org/W3200281942 | |
| # 121 records | |
| # | |
| # /// script | |
| # dependencies = ["requests"] | |
| # /// | |
| # | |
| import sys | |
| import time | |
| from urllib.parse import urlparse, parse_qs | |
| import requests | |
API_HOST = 'api.openalex.org'

# Page size every fresh page request starts from; it is halved while the API
# keeps answering HTTP 500 and restored once a page comes back successfully.
MAX_PER_PAGE = 200


def build_request(api_url):
    """Turn a user-supplied OpenAlex API URL into a request URL and params.

    Args:
        api_url: Full URL of an OpenAlex API call, query string included.

    Returns:
        A ``(url, params)`` tuple. ``url`` is the endpoint without its query
        string. ``params`` is the parsed query (values are lists, as produced
        by ``parse_qs``) with ``cursor`` forced to ``'*'`` and ``per-page``
        forced to ``MAX_PER_PAGE``, whatever the caller passed.

    Raises:
        ValueError: If the URL's host is not ``api.openalex.org``.
    """
    parsed = urlparse(api_url)
    if parsed.netloc != API_HOST:
        raise ValueError(f"{api_url} isn't an OpenAlex API URL")

    # The parsed path already starts with "/", so strip it before joining to
    # avoid producing "https://api.openalex.org//works".
    url = f"https://{API_HOST}/{parsed.path.lstrip('/')}"

    params = parse_qs(parsed.query)
    params['cursor'] = '*'
    params['per-page'] = MAX_PER_PAGE
    return url, params


def find_problem_records(url, params, get=None, delay=1):
    """Walk a cursored result set and report records that trigger HTTP 500.

    Whenever a page request answers 500 the page size is halved and the same
    cursor is retried. Once a page holding a single record still fails, that
    record is the culprit: it is fetched again with ``select=id`` so its ID
    can be printed and the cursor can move past it.

    Args:
        url: Endpoint URL without a query string.
        params: Query parameters; must contain an integer ``per-page`` and a
            ``cursor``. The dict is mutated while paging.
        get: Callable invoked as ``get(url, params)`` that returns a
            response-like object (``status_code``, ``json()``,
            ``raise_for_status()``). Defaults to ``requests.get``.
        delay: Seconds to sleep before each page request.

    Returns:
        A ``(problem_ids, record_count)`` tuple: the IDs of the records that
        caused a 500, and the total number of records walked.

    Raises:
        Whatever ``raise_for_status()`` raises when a request fails with an
        HTTP error other than the 500s handled above.
    """
    if get is None:
        # Looked up lazily so a stand-in getter can be supplied instead.
        get = requests.get

    problem_ids = []
    record_count = 0

    while True:
        time.sleep(delay)
        resp = get(url, params)

        if resp.status_code == 500:
            if params['per-page'] > 1:
                # cut the page size in half to zero in on problematic record;
                # a one-record page must itself be tried before blaming the
                # record sitting at the cursor.
                params['per-page'] = max(1, params['per-page'] // 2)
                continue

            # just get the ID instead of the full record, so we can advance
            # the cursor
            params['select'] = 'id'
            resp = get(url, params)
            # this shouldn't fail when just getting the id; stop if it does
            resp.raise_for_status()
            del params['select']

            page = resp.json()
            for record in page['results']:
                problem_ids.append(record['id'])
                print(f"problem record: {record['id']}")
        else:
            # Surface any other HTTP error instead of trying to parse it as a
            # page of results.
            resp.raise_for_status()
            page = resp.json()

        record_count += len(page['results'])

        # A page got through: go back to full-size pages for the next request.
        params['per-page'] = MAX_PER_PAGE
        params['cursor'] = page['meta'].get('next_cursor')

        # if there's no cursor we're done!
        if not params['cursor']:
            print(f"{record_count} records")
            return problem_ids, record_count


def main(argv=None):
    """Command-line entry point: ``openalexbug.py <API URL>``.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Exits with a message when the argument count is wrong or the URL does not
    point at the OpenAlex API.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        sys.exit("usage: openalexbug.py <API URL>")

    try:
        url, params = build_request(argv[1])
    except ValueError as err:
        sys.exit(str(err))

    find_problem_records(url, params)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment