Created
December 7, 2019 02:52
-
-
Save theiostream/53d191fa7ee258a43fbcc7fb0567b7b2 to your computer and use it in GitHub Desktop.
Tentatively converts IdeaList .TEX file to CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Converts a Blackwell IdeaList .TEX file to CSV.
# (c) 2019 Daniel Ferreira
import csv
from array import array
from datetime import date
# Kinds of field payload the parser distinguishes.
FIELD_TYPE_STRING = 0
FIELD_TYPE_DATE = 1

# States of the record parser's state machine.
STATE_START = 0                # scanning for the start of the next record
STATE_READING_FIELD_SPEC = 1   # positioned on a field's type/length spec
STATE_READING_FIELD_DATA = 2   # positioned on a field's payload bytes
# Maps the hex form of the two-byte tag that follows each field's payload
# to the column name used for that field in the CSV output.
NICE_KEY = {
    '8506': 'name',
    '0b7f': 'title',
    'c903': 'christname',
    '2103': 'date',
    'd619': 'type',
    'ab3d': 'subject',
    '0203': 'bill_name',
    '0433': 'hoppit_subj',
    '840c': 'question',
    'a807': 'procon',
    '0000': 'other',
}
def round_down(n, div):
    """Return the largest multiple of *div* that does not exceed *n*.

    Computed as ``n - (n % div)``; raises ZeroDivisionError when *div* is 0.
    """
    return n - (n % div)
def parse_date(dd):
    """Decode a date field payload into a ``'YYYY-MM-DD'`` string.

    Only ``dd[2]`` and ``dd[3]`` are read; both are expected to be integer
    byte values. The layout, as reverse-engineered by this script, is:

    * year  = ``(dd[2] - 0x34) // 2 + 1690``
    * month = ``dd[3] // 0x20``, plus 8 when ``dd[2]`` is odd
    * day   = ``dd[3] % 0x20``

    Raises IndexError if *dd* has fewer than four items and ValueError if
    the decoded values do not form a valid calendar date.
    """
    year_byte = dd[2]
    month_day_byte = dd[3]

    # The low bit of the year byte is the high bit of the month, so split
    # it off: the quotient is the year offset, the remainder (0 or 1)
    # says whether 8 must be added to the month.
    year_offset, month_high_bit = divmod(year_byte - 0x34, 2)
    year = 1690 + year_offset

    # Upper three bits carry the rest of the month, lower five the day.
    month_low_bits, day = divmod(month_day_byte, 0x20)
    month = month_low_bits + 8 * month_high_bit

    return date(year, month, day).isoformat()
# Read the whole database file into memory as a sequence of unsigned bytes.
with open('JOURNALS.TEX', 'rb') as f:
    data = array('B', f.read())

final_len = len(data)
output = []       # completed records, one dict per record
cum_obj = {}      # record currently being assembled
state = STATE_START
idx = 0
field_type = FIELD_TYPE_STRING
field_len = 0

while idx < final_len:
    if state == STATE_START:
        # A record start is recognised by the byte 0xab at offset 8. If
        # that offset lies past the end of the file, no further record can
        # begin here, so stop instead of indexing out of range.
        if idx + 8 >= final_len:
            break
        if data[idx + 8] != 0xab:
            idx += 1
            continue
        # The first three bytes of the header are kept as the record id.
        cum_obj['id'] = bytes(data[idx:idx + 3]).hex()
        state = STATE_READING_FIELD_SPEC
        idx += 10
    elif state == STATE_READING_FIELD_SPEC:
        # Field spec: two type bytes, one length byte, then one byte that
        # is skipped without being inspected.
        if data[idx] == 0x57 and data[idx + 1] == 0x20:
            field_type = FIELD_TYPE_STRING
        elif data[idx] == 0x44 and data[idx + 1] == 0x01:
            field_type = FIELD_TYPE_DATE
        else:
            raise Exception('bad field type')
        field_len = data[idx + 2]
        state = STATE_READING_FIELD_DATA
        idx += 4
    elif state == STATE_READING_FIELD_DATA:
        # The two bytes right after the payload identify the field.
        key = NICE_KEY[bytes(data[idx + field_len:idx + field_len + 2]).hex()]
        if field_type == FIELD_TYPE_STRING:
            cum_obj[key] = bytes(data[idx:idx + field_len]).decode('utf-8', errors='ignore')
        elif field_type == FIELD_TYPE_DATE:
            cum_obj[key] = parse_date(data[idx:idx + field_len])
        else:
            raise Exception('Bad field type')
        # A zero byte after the field tag ends the record; anything else is
        # taken to be the first byte of the next field spec.
        if data[idx + field_len + 2] == 0x00:
            output.append(cum_obj)
            cum_obj = {}
            state = STATE_START
            idx += field_len + 2 + 13
        else:
            state = STATE_READING_FIELD_SPEC
            idx += field_len + 2

# Use every field name seen in any record, in first-seen order, so records
# with differing field sets (or an empty result) do not make the CSV
# writer fail; fields a record lacks are written as empty cells.
keys = list(dict.fromkeys(key for record in output for key in record))

# newline='' is what the csv module requires of the files it writes to;
# the explicit encoding matches the UTF-8 decoding applied to string fields.
with open('output.csv', 'w', newline='', encoding='utf-8') as oupf:
    dict_writer = csv.DictWriter(oupf, keys)
    dict_writer.writeheader()
    dict_writer.writerows(output)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment