Last active
July 2, 2023 23:38
-
-
Save lgommans/1b766fd7a24b85f56be13b37c4400218 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
triviafile = 'trivia.csv' | |
apiurl = 'https://opentdb.com/' | |
sleep_between_requests = 0.1 | |
questionsPerRequest = 50 | |
import requests, os, hashlib, json, sys, csv, urllib, time | |
unquote = urllib.parse.unquote | |
def cget(url, cachedir = b'cache'): | |
''' Cached GET request. The URL must be an UTF-8 string and cachedir should be bytes. ''' | |
global _cache, _cacheindex | |
if not os.path.isdir(cachedir): | |
os.mkdir(cachedir) | |
if '_cache' not in globals(): | |
indexfile = os.path.join(cachedir, b'index') | |
if os.path.isfile(indexfile): | |
with open(indexfile) as f: | |
_cache = f.read().replace('\r', '').split('\n') | |
else: | |
_cache = {} | |
_cacheindex = open(indexfile, 'a') | |
h = b'c-' + bytes(hashlib.new('md5', bytes(url, 'utf-8')).hexdigest(), 'utf-8') | |
fname = os.path.join(cachedir, h) | |
if url in _cache: | |
with open(fname) as f: | |
return f.read() | |
print('Performing GET for {}'.format(url)) | |
response = requests.get(url) | |
if response.status_code != 200: | |
return False | |
with open(fname, 'w', encoding='utf-8') as f: | |
f.write(response.text) | |
_cache.append(url) | |
_cacheindex.write(url + '\n') | |
return response.text | |
#{'overall': {'total_num_of_questions': 6098, 'total_num_of_pending_questions': 1710, 'total_num_of_verified_questions': 3031, 'total_num_of_rejected_questions': 1357}, 'categories': {'9': {'total_num_of_questions': 822, 'total_num_of_pending_questions': 307, 'total_num_of_verified_questions': 208, 'total_num_of_rejected_questions': 307}, '10': {'total_num_of_questions': 134, 'total_num_of_pending_questions': 50, 'total_num_of_verified_questions': 69, 'total_num_of_rejected_questions': 15}, '11': {'total_num_of_questions': 296, 'total_num_of_pending_questions': 83, 'total_num_of_verified_questions': 179, 'total_num_of_rejected_questions': 34}, '12': {'total_num_of_questions': 459, 'total_num_of_pending_questions': 130, 'total_num_of_verified_questions': 270, 'total_num_of_rejected_questions': 59}, '13': {'total_num_of_questions': 37, 'total_num_of_pending_questions': 8, 'total_num_of_verified_questions': 22, 'total_num_of_rejected_questions': 7}, | |
counts = json.loads(cget('{}api_count_global.php'.format(apiurl))) | |
totalrows = int(counts['overall']['total_num_of_verified_questions']) | |
# {'trivia_categories': [{'id': 9, 'name': 'General Knowledge'}, {'id': 10, 'name': 'Entertainment: Books'}, {'id': 11, 'name': 'Entertainment: Film'}, {'id': 12, 'name': 'Entertainment: Music'}, {'id': 13, 'name': 'Entertainment: Musicals & Theatres'}, {'id': 14, 'name': 'Entertainment: Television'}, {'id': 15, 'name': 'Entertainment: Video Games'}, {'id': 16, 'name': 'Entertainment: Board Games'}, {'id': 17, 'name': 'Science & Nature'}, {'id': 18, 'name': 'Science: Computers'}, {'id': 19, 'name': 'Science: Mathematics'}, {'id': 20, 'name': 'Mythology'}, {'id': 21, 'name': 'Sports'}, {'id': 22, 'name': 'Geography'}, {'id': 23, 'name': 'History'}, {'id': 24, 'name': 'Politics'}, {'id': 25, 'name': 'Art'}, {'id': 26, 'name': 'Celebrities'}, {'id': 27, 'name': 'Animals'}, {'id': 28, 'name': 'Vehicles'}, {'id': 29, 'name': 'Entertainment: Comics'}, {'id': 30, 'name': 'Science: Gadgets'}, {'id': 31, 'name': 'Entertainment: Japanese Anime & Manga'}, {'id': 32, 'name': 'Entertainment: Cartoon & Animations'}]} | |
tmpcats = json.loads(cget('{}api_category.php'.format(apiurl))) | |
cats = {} | |
for cat in tmpcats['trivia_categories']: | |
cats[cat['id']] = cat['name'] | |
del tmpcats | |
if '-h' in sys.argv or '--help' in sys.argv: | |
print('''Usage: | |
getanswers.py | |
It will create a directory called "cache" and a file called "trivia.csv". | |
The file trivia.csv has its own headers, so the fields should be clear. | |
The first answer, 'Answer 1', is the correct answer to the question. | |
Answers 3 and 4 are empty if the type is 'boolean' (true/false question). | |
''') | |
exit(1) | |
out = csv.writer(open(triviafile, 'w', newline='', encoding='utf-8')) | |
out.writerow(['Category', 'Type', 'Difficulty', 'Question', 'Answer 1', 'Answer 2', 'Answer 3', 'Answer 4']) | |
token = requests.get('https://opentdb.com/api_token.php?command=request').json()['token'] | |
donerows = 0 | |
last = False | |
while True: | |
if totalrows - donerows <= 0: | |
questionsPerRequest = 1 | |
print('Should be done, adjusting questions per request to 1') | |
elif totalrows - donerows < questionsPerRequest: | |
questionsPerRequest = totalrows - donerows | |
print('Should be done soon, adjusting questions per request to {}'.format(questionsPerRequest)) | |
request = requests.get('{}api.php?amount={}&token={}&encode=url3986'.format(apiurl, questionsPerRequest, token)).json() | |
if request['response_code'] == 4: | |
last = True | |
elif request['response_code'] != 0: | |
print('Error: response code {}. Full response:\n{}'.format(request['response_code'], json.dumps(request))) | |
exit(2) | |
rows = [] | |
for question in request['results']: | |
cat = unquote(question['category']) | |
typ = unquote(question['type']) | |
dif = unquote(question['difficulty']) | |
q = unquote(question['question']) | |
answers = [unquote(question['correct_answer'])] + [unquote(_) for _ in question['incorrect_answers']] | |
rows.append([cat, typ, dif, q] + answers) | |
out.writerows(rows) | |
donerows += len(rows) | |
print('Did {}/{} ({}%)...'.format(donerows, totalrows, round(donerows / totalrows * 100, 2))) | |
if last: | |
print('The API indicated that we are done.') | |
break | |
time.sleep(sleep_between_requests) | |
#{'response_code': 0, 'results': [{'category': 'Science%20%26%20Nature', 'type': 'boolean', 'difficulty': 'easy', 'question': 'Igneous%20rocks%20are%20formed%20by%20excessive%20heat%20and%20pressure.', 'correct_answer': 'False', 'incorrect_answers': ['True']}, {'category': 'Entertainment%3A%20Cartoon%20%26%20Animations', 'type': 'multiple', 'difficulty': 'easy', 'question': 'In%20South%20Park%2C%20what%20is%20Stan%27s%20surname%3F', 'correct_answer': 'Marsh', 'incorrect_answers': ['Stotch', 'Broflovski', 'Tweak']}, etc |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Added
encoding='utf-8'
as a fix identified by Lukáš Bláha. Thanks, Lukáš!