Last active
December 17, 2019 14:54
-
-
Save snopoke/85af6f30e114ea1f8d8d410c33dc59f1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import gzip
import os
import shutil
from collections import Counter
from datetime import datetime, timedelta

from elasticsearch.exceptions import ConnectionTimeout

from corehq.apps.es import FormES
from corehq.util.timezones.utils import parse_date
# Script: for every hour between ``start`` and ``end``, scroll the forms of the
# 'icds-cas' domain matched by ``completed(gte=..., lt=...)`` and record, per
# calendar day, how many days lie between each form's ``form.meta.timeEnd``
# and its ``received_on``.  One ``<YYYY-MM-DD>_summary.csv`` is written per
# day; with ``dump_raw`` enabled the individual rows are also appended to a
# gzipped CSV per day.
path = '/home/cchq/form_dates'

# Always start from an empty output directory.
try:
    shutil.rmtree(path)
except FileNotFoundError:
    pass
os.makedirs(path)

gstart = datetime.utcnow()  # used only for the "Runtime" figure in the progress line
start = datetime(2019, 10, 1)
end = datetime(2019, 11, 1)
tc = 0  # running total of forms seen across all batches
day_summary = {}  # date -> Counter mapping days_diff -> number of forms
dump_raw = False  # when True, also dump every row to '<day>.csv.gz'


def _write_day_summary(summary_day, summary):
    """Write the ``days_diff`` histogram of one day to ``<day>_summary.csv``.

    Nothing is written when ``summary`` is ``None`` or empty, so days without
    any forms produce no file.
    """
    if not summary:
        return
    summary_filename = f"{summary_day.strftime('%Y-%m-%d')}_summary.csv"
    print(f' Writing summary {summary_filename}')
    # newline='' lets the csv module control line endings itself.
    with open(os.path.join(path, summary_filename), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['days_diff', 'form_count'])
        writer.writerows(sorted(summary.items()))


while start < end:
    day = start.date()
    if day not in day_summary:
        # First batch of a new day: open its counter and flush the day before.
        day_summary[day] = Counter()
        prev_day = day - timedelta(days=1)
        _write_day_summary(prev_day, day_summary.pop(prev_day, None))

    count = 0
    se = start + timedelta(hours=1)
    rows = []
    try:
        forms = (
            FormES()
            .domain('icds-cas')
            .completed(gte=start, lt=se)
            .source(['form.meta.timeEnd', 'received_on'])
            .scroll()
        )
        for form in forms:
            count += 1
            completed_on = form['form']['meta']['timeEnd']
            received_on = form['received_on']
            # Difference in whole calendar days, not elapsed 24h periods.
            days_diff = parse_date(received_on).date() - parse_date(completed_on).date()
            rows.append([form['_id'], completed_on, received_on, days_diff.days])
    except ConnectionTimeout:
        # ``start`` is not advanced, so the same hour is fetched again;
        # ``rows`` and ``count`` are reset at the top of the loop.
        print(f' retrying batch: {start} to {se}')
        continue

    if dump_raw:
        all_data_path = os.path.join(path, f"{start.strftime('%Y-%m-%d')}.csv.gz")
        # Header row goes only into a file that does not exist yet.
        write_headers = not os.path.isfile(all_data_path)
        with gzip.open(all_data_path, 'at', newline='') as f:
            writer = csv.writer(f)
            if write_headers:
                writer.writerow(['form_id', 'completed_on', 'received_on', 'days_diff'])
            writer.writerows(rows)

    day_summary[day].update([row[3] for row in rows])
    tc += count
    print(f'[{datetime.utcnow()}] Runtime: {datetime.utcnow() - gstart}, Progress: {start} to {se}: {count} ({tc})')
    start = se

# A day's summary is normally flushed when the following day begins, which
# never happens for the last day in the range; write whatever is left.
for remaining_day in sorted(day_summary):
    _write_day_summary(remaining_day, day_summary[remaining_day])
day_summary.clear()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment