leoblum · January 22, 2021 11:42 · leoblum · Jan 22, 2021
diff --git a/download_gmail.py b/download_gmail.py
 import requests
 import json
 import os
 from datetime import timedelta
 from time import time

 output_folder = '.data/gmail_u0'
 cache_file = os.path.join(output_folder, '.cache')
 os.makedirs(output_folder, exist_ok=True)

 # Paste the "Copy as cURL" output from Chrome Developer Tools between the
 # triple quotes below (filter the request list by typing: bv?).
 chrome_req = r'''

 '''


 def _between_quotes(text):
     """Return the part of *text* between its first and last single quote.

     Returns '' when *text* holds fewer than two single quotes.
     """
     first = text.find("'")
     last = text.rfind("'")
     return text[first + 1:last] if first != last else ''


 def _page_url_template(url):
     """Turn *url* into a %-format template with one ``%s`` slot for ``c``.

     Only the first query parameter literally named ``c`` is replaced, so
     parameters that merely end in "c" (e.g. ``sec=``) are left alone.  Every
     other ``%`` is doubled so URL-encoded characters such as ``%20`` survive
     the later ``template % value`` formatting.

     Raises:
         ValueError: if *url* has no ``c`` query parameter.
     """
     base, sep, query = url.partition('?')
     params = []
     found = False
     for param in query.split('&'):
         if not found and param.startswith('c='):
             params.append('c=%s')
             found = True
         else:
             params.append(param.replace('%', '%%'))
     if not found:
         raise ValueError("no 'c=' query parameter in copied URL: %r" % url)
     return base.replace('%', '%%') + sep + '&'.join(params)


 def parse_curl_command(text):
     """Extract the request pieces from a multi-line "Copy as cURL" string.

     Each line is stripped and classified by its prefix:

     * ``curl '...'``  -> the URL, converted by ``_page_url_template``;
     * ``-H 'name: value'`` -> one header (split on the first colon);
     * ``--data-binary '...'`` / ``--data-raw '...'`` -> the request body.

     Any other line (blank lines, extra flags) is ignored.

     Returns:
         A ``(url_template, headers, payload)`` tuple; the defaults are
         ``('', {}, '')`` for parts that are not present in *text*.

     Raises:
         ValueError: if a ``curl`` line is present but its URL has no ``c``
             query parameter.
     """
     url_template = ''
     parsed_headers = {}
     body = ''
     for line in text.split('\n'):
         line = line.strip()
         if line.startswith('-H'):
             name, _, value = _between_quotes(line).partition(':')
             parsed_headers[name.strip()] = value.strip()
         # Accept both spellings of the body flag.
         # NOTE(review): --data-raw is believed to be what newer Chrome builds
         # emit instead of --data-binary; confirm against your Chrome version.
         elif line.startswith(('--data-binary', '--data-raw')):
             body = _between_quotes(line)
         elif line.startswith('curl'):
             url_template = _page_url_template(_between_quotes(line))
     return url_template, parsed_headers, body


 req_url, headers, payload = parse_curl_command(chrome_req)


 def get_emails_ids(use_cache=False):
     """Collect message ids by replaying the captured list request.

     When *use_cache* is true and ``cache_file`` exists, the JSON stored in
     that file is returned and no request is sent.

     Otherwise the request described by ``req_url`` / ``headers`` / ``payload``
     is POSTed repeatedly with an increasing counter (presumably the page
     index -- TODO confirm) written both into the URL's ``%s`` slot and into
     ``['1']['10']`` of the JSON body.  Ids are read from ``['1']['20']`` of
     every entry under key ``'3'`` of the response and from ``['56']`` of
     every item in that entry's ``['1']['5']`` list.  Paging stops at the
     first response that contributes no new id.

     Returns:
         list: the collected ids (order not defined), or the cached JSON value.
     """
     if use_cache and os.path.exists(cache_file):
         with open(cache_file) as fp:
             return json.load(fp)

     # Parse the captured body once; only the counter changes per request.
     body = json.loads(payload)
     emails = set()
     for page in range(999999):
         body['1']['10'] = page
         # Serialise the *modified* body.  Dumping `payload` (already a JSON
         # string) would double-encode it and drop the updated counter.
         data = json.dumps(body)

         rep = requests.post(req_url % page, headers=headers, data=data)
         res = rep.json()

         known = len(emails)
         try:
             threads = res['3']
         except KeyError:
             # No entry list in this response: nothing new, so the loop ends.
             threads = ()
         for thread in threads:
             # A missing key only skips the rest of this entry, not the
             # remaining entries of the response.
             try:
                 emails.add(thread['1']['20'])
                 for message in thread['1']['5']:
                     emails.add(message['56'])
             except KeyError:
                 continue

         size_diff = len(emails) - known
         print('% 3d. Total: %d. New: %d' % (page, len(emails), size_diff))
         if size_diff == 0:
             break

     return list(emails)


 # Fetch (or load from cache) the ids, then persist them for the next run.
 emails = get_emails_ids(use_cache=True)
 with open(cache_file, 'w') as fp:
     json.dump(emails, fp)


 # Ids that already have an .eml file in the output folder.  Sets keep the
 # membership tests O(1); the original list scans made this step and the loop
 # below quadratic in the number of messages.
 wanted = set(emails)
 downloaded = {
     name.split('.eml')[0]
     for name in os.listdir(output_folder)
     if name.endswith('.eml')
 } & wanted

 start_time = time()
 last_time = start_time
 already_done = len(downloaded)
 counter = already_done
 for email_id in emails:
     if email_id in downloaded:
         continue

     url = 'https://mail.google.com/mail/u/0?view=att&th=%s&attid=0&disp=comp&safe=1&zw' % email_id
     rep = requests.get(url, headers=headers)

     if rep.status_code != 200:
         # Report and move on; the file is simply missing and will be retried
         # on the next run because no .eml is written for it.
         print('error on %s with code %s' % (email_id, rep.status_code))
         continue

     with open(os.path.join(output_folder, '%s.eml' % email_id), 'bw') as fp:
         fp.write(rep.content)

     counter += 1
     current_time = time()
     # Print progress at most once every 20 seconds.
     if current_time - last_time > 20:
         # Speed counts only files fetched during this run.
         avg_speed = (counter - already_done) / (current_time - start_time)
         time_left = (len(emails) - counter) / avg_speed
         time_left = timedelta(seconds=int(time_left))
         last_time = current_time
         print('%d of %d emails downloaded. time left: %s' % (counter, len(emails), time_left))
	import requests
	import json
	import os
	from datetime import timedelta
	from time import time

	output_folder = '.data/gmail_u0'
	cache_file = os.path.join(output_folder, '.cache')
	os.makedirs(output_folder, exist_ok=True)

	# Paste the "Copy as cURL" output from Chrome Developer Tools between the
	# triple quotes below (filter the request list by typing: bv?).
	chrome_req = r'''

	'''


	def _between_quotes(text):
	    """Return the part of *text* between its first and last single quote.

	    Returns '' when *text* holds fewer than two single quotes.
	    """
	    first = text.find("'")
	    last = text.rfind("'")
	    return text[first + 1:last] if first != last else ''


	def _page_url_template(url):
	    """Turn *url* into a %-format template with one ``%s`` slot for ``c``.

	    Only the first query parameter literally named ``c`` is replaced, so
	    parameters that merely end in "c" (e.g. ``sec=``) are left alone.  Every
	    other ``%`` is doubled so URL-encoded characters such as ``%20`` survive
	    the later ``template % value`` formatting.

	    Raises:
	        ValueError: if *url* has no ``c`` query parameter.
	    """
	    base, sep, query = url.partition('?')
	    params = []
	    found = False
	    for param in query.split('&'):
	        if not found and param.startswith('c='):
	            params.append('c=%s')
	            found = True
	        else:
	            params.append(param.replace('%', '%%'))
	    if not found:
	        raise ValueError("no 'c=' query parameter in copied URL: %r" % url)
	    return base.replace('%', '%%') + sep + '&'.join(params)


	def parse_curl_command(text):
	    """Extract the request pieces from a multi-line "Copy as cURL" string.

	    Each line is stripped and classified by its prefix:

	    * ``curl '...'``  -> the URL, converted by ``_page_url_template``;
	    * ``-H 'name: value'`` -> one header (split on the first colon);
	    * ``--data-binary '...'`` / ``--data-raw '...'`` -> the request body.

	    Any other line (blank lines, extra flags) is ignored.

	    Returns:
	        A ``(url_template, headers, payload)`` tuple; the defaults are
	        ``('', {}, '')`` for parts that are not present in *text*.

	    Raises:
	        ValueError: if a ``curl`` line is present but its URL has no ``c``
	            query parameter.
	    """
	    url_template = ''
	    parsed_headers = {}
	    body = ''
	    for line in text.split('\n'):
	        line = line.strip()
	        if line.startswith('-H'):
	            name, _, value = _between_quotes(line).partition(':')
	            parsed_headers[name.strip()] = value.strip()
	        # Accept both spellings of the body flag.
	        # NOTE(review): --data-raw is believed to be what newer Chrome builds
	        # emit instead of --data-binary; confirm against your Chrome version.
	        elif line.startswith(('--data-binary', '--data-raw')):
	            body = _between_quotes(line)
	        elif line.startswith('curl'):
	            url_template = _page_url_template(_between_quotes(line))
	    return url_template, parsed_headers, body


	req_url, headers, payload = parse_curl_command(chrome_req)


	def get_emails_ids(use_cache=False):
	    """Collect message ids by replaying the captured list request.

	    When *use_cache* is true and ``cache_file`` exists, the JSON stored in
	    that file is returned and no request is sent.

	    Otherwise the request described by ``req_url`` / ``headers`` / ``payload``
	    is POSTed repeatedly with an increasing counter (presumably the page
	    index -- TODO confirm) written both into the URL's ``%s`` slot and into
	    ``['1']['10']`` of the JSON body.  Ids are read from ``['1']['20']`` of
	    every entry under key ``'3'`` of the response and from ``['56']`` of
	    every item in that entry's ``['1']['5']`` list.  Paging stops at the
	    first response that contributes no new id.

	    Returns:
	        list: the collected ids (order not defined), or the cached JSON value.
	    """
	    if use_cache and os.path.exists(cache_file):
	        with open(cache_file) as fp:
	            return json.load(fp)

	    # Parse the captured body once; only the counter changes per request.
	    body = json.loads(payload)
	    emails = set()
	    for page in range(999999):
	        body['1']['10'] = page
	        # Serialise the *modified* body.  Dumping `payload` (already a JSON
	        # string) would double-encode it and drop the updated counter.
	        data = json.dumps(body)

	        rep = requests.post(req_url % page, headers=headers, data=data)
	        res = rep.json()

	        known = len(emails)
	        try:
	            threads = res['3']
	        except KeyError:
	            # No entry list in this response: nothing new, so the loop ends.
	            threads = ()
	        for thread in threads:
	            # A missing key only skips the rest of this entry, not the
	            # remaining entries of the response.
	            try:
	                emails.add(thread['1']['20'])
	                for message in thread['1']['5']:
	                    emails.add(message['56'])
	            except KeyError:
	                continue

	        size_diff = len(emails) - known
	        print('% 3d. Total: %d. New: %d' % (page, len(emails), size_diff))
	        if size_diff == 0:
	            break

	    return list(emails)


	# Fetch (or load from cache) the ids, then persist them for the next run.
	emails = get_emails_ids(use_cache=True)
	with open(cache_file, 'w') as fp:
	    json.dump(emails, fp)


	# Ids that already have an .eml file in the output folder.  Sets keep the
	# membership tests O(1); the original list scans made this step and the loop
	# below quadratic in the number of messages.
	wanted = set(emails)
	downloaded = {
	    name.split('.eml')[0]
	    for name in os.listdir(output_folder)
	    if name.endswith('.eml')
	} & wanted

	start_time = time()
	last_time = start_time
	already_done = len(downloaded)
	counter = already_done
	for email_id in emails:
	    if email_id in downloaded:
	        continue

	    url = 'https://mail.google.com/mail/u/0?view=att&th=%s&attid=0&disp=comp&safe=1&zw' % email_id
	    rep = requests.get(url, headers=headers)

	    if rep.status_code != 200:
	        # Report and move on; the file is simply missing and will be retried
	        # on the next run because no .eml is written for it.
	        print('error on %s with code %s' % (email_id, rep.status_code))
	        continue

	    with open(os.path.join(output_folder, '%s.eml' % email_id), 'bw') as fp:
	        fp.write(rep.content)

	    counter += 1
	    current_time = time()
	    # Print progress at most once every 20 seconds.
	    if current_time - last_time > 20:
	        # Speed counts only files fetched during this run.
	        avg_speed = (counter - already_done) / (current_time - start_time)
	        time_left = (len(emails) - counter) / avg_speed
	        time_left = timedelta(seconds=int(time_left))
	        last_time = current_time
	        print('%d of %d emails downloaded. time left: %s' % (counter, len(emails), time_left))