|
import os |
|
import argparse |
|
import logging |
|
import re |
|
import pypff |
|
|
|
|
|
def process_folder(folder, path):
    """Export every message of *folder*, then recurse into its sub-folders.

    Each message is rendered with ``process_message`` and written to the
    module-level ``output_directory`` as
    ``<sanitised folder path>_<message index>.eml``.

    Args:
        folder: Folder item; ``name``, ``number_of_sub_messages``,
            ``number_of_sub_folders``, ``sub_messages`` and ``sub_folders``
            are read from it.
        path: Slash-separated path of the parent folder (the caller passes
            the PST file name for the root folder).
    """
    folder_path = path + '/' + (folder.name or 'root')
    n_msg = folder.number_of_sub_messages
    logging.debug(
        'Processing folder "%s" with %s sub-folders and %s messages; full path: "%s"',
        folder.name, folder.number_of_sub_folders, n_msg, folder_path)

    # File-name prefix: lower-case the path, keep only [a-z0-9], spaces and
    # slashes, then turn the remaining spaces and slashes into underscores.
    # NOTE: folder names that differ only in stripped characters map to the
    # same prefix, so their messages overwrite each other.
    safe_path = re.sub(r'[ /]', '_', re.sub(r'[^a-z0-9 /]', '', folder_path.lower()))
    for mi, message in enumerate(folder.sub_messages):
        logging.debug('%s/%s > Processing message by %s with subject: %s',
                      mi, n_msg, message.sender_name, message.subject)
        msg = process_message(message)
        fname = os.path.join(output_directory, safe_path + '_' + str(mi) + '.eml')
        logging.debug(' -- saving as %s', fname)
        # The rendered message already contains explicit CRLF line endings:
        # newline='' stops text mode from translating them again (which would
        # yield "\r\r\n" on Windows), and a fixed encoding keeps non-ASCII
        # content from failing under a non-UTF-8 locale.
        with open(fname, 'w', encoding='utf-8', newline='') as f:
            f.write(msg)

    for sub_folder in folder.sub_folders:
        process_folder(sub_folder, folder_path)
|
|
|
|
|
def get_body(msg):
    """Return the first non-empty body of *msg* with a label for its format.

    The plain-text, HTML and RTF bodies are tried in that order; a body that
    is missing or contains only whitespace is skipped.

    Args:
        msg: Object exposing ``plain_text_body``, ``html_body`` and
            ``rtf_body`` attributes, each ``bytes``, ``str`` or ``None``.

    Returns:
        A ``(kind, text)`` tuple where *kind* is ``'plain-text'``, ``'html'``
        or ``'rtf'`` and *text* is the stripped body. When no body is usable
        the result is ``('plain-text', '')``.
    """
    def prep(raw):
        """Decode *raw* when it is bytes and strip it; falsy input gives None."""
        if isinstance(raw, bytes):
            # A strict decode would raise on the first invalid byte and abort
            # the whole export; substitute U+FFFD for undecodable bytes.
            raw = raw.decode('utf-8', errors='replace')
        return raw.strip() if raw else None

    candidates = (
        ('plain-text', 'plain_text_body'),
        ('html', 'html_body'),
        ('rtf', 'rtf_body'),
    )
    # Attributes are read lazily: later bodies are only touched when the
    # earlier ones turned out to be empty.
    for kind, attr in candidates:
        text = prep(getattr(msg, attr))
        if text:
            return kind, text

    return 'plain-text', ''
|
|
|
|
|
def process_message(message):
    """Render *message* as ``.eml``-style text (headers, blank line, body).

    The transport headers are re-emitted one per line with the header name
    passed through ``str.capitalize``; when a name repeats, the repeat is
    prefixed with ``X-``. A set of ``X-`` headers derived from the message
    properties follows, then an empty line and the body chosen by
    ``get_body``. All line breaks written here are CRLF.

    Args:
        message: Message item; ``transport_headers``, ``sender_name``,
            ``delivery_time``, ``creation_time``, ``client_submit_time``,
            ``subject`` and ``number_of_attachments`` are read from it, and
            it is passed on to ``get_body``.

    Returns:
        str: The rendered message text.
    """
    lines = []
    seen = set()
    # A missing (None) header block is treated as empty instead of raising.
    for raw_line in (message.transport_headers or '').split('\n'):
        # Accept both CRLF and bare LF terminated header lines: drop exactly
        # one trailing CR if present before matching.
        line = raw_line[:-1] if raw_line.endswith('\r') else raw_line
        match = re.match(r'([^:]+): (.+)$', line)
        if not match:
            # Not a "Name: value" line (e.g. a blank line) - skipped.
            continue

        key = match.group(1).capitalize()
        if key in seen:
            key = 'X-' + key
        seen.add(key)
        val = match.group(2)

        if key == 'Date':
            # Keep only the first two comma-separated fields of the date.
            val = ', '.join(val.split(',')[:2])

        lines.append(key + ': ' + val)

    # sender_name and subject are concatenated as text, so fall back to an
    # empty string when they are None rather than failing with TypeError.
    lines.append('X-Sender-Name: ' + (message.sender_name or ''))
    lines.append('X-Delivery-Time: ' + str(message.delivery_time))
    lines.append('X-Creation-Time: ' + str(message.creation_time))
    lines.append('X-Client-Submit-Time: ' + str(message.client_submit_time))
    lines.append('X-Subject: ' + (message.subject or ''))
    lines.append('X-Attachments: ' + str(message.number_of_attachments))

    btype, body = get_body(message)
    lines.append('X-Body-Type: ' + btype)

    # Every header line ends in CRLF; one more CRLF separates headers and body.
    return '\r\n'.join(lines) + '\r\n' + '\r\n' + body
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: export every message of a PST file as .eml
    # text files into OUTPUT_DIR, optionally logging to a file.
    parser = argparse.ArgumentParser()
    parser.add_argument('PST_FILE', help="PST File Format from Microsoft Outlook")
    parser.add_argument('OUTPUT_DIR', help="Directory of output for temporary and report files.")
    # The value is used as a directory (created if needed), not as a file
    # path; the help text now says so.
    parser.add_argument('--logfile', default=None,
                        help='Directory in which the log file (pst_indexer.log) is written.')
    args = parser.parse_args()

    # Deliberately a module-level name: process_folder() reads this global.
    output_directory = os.path.abspath(args.OUTPUT_DIR)
    os.makedirs(output_directory, exist_ok=True)

    if args.logfile:
        os.makedirs(args.logfile, exist_ok=True)
        log_path = os.path.join(args.logfile, 'pst_indexer.log')
    else:
        # filename=None makes basicConfig log to the default stream.
        log_path = None
    logging.basicConfig(level=logging.DEBUG, filename=log_path,
                        format='%(asctime)s | %(levelname)s | %(message)s', filemode='w')

    logging.info('Starting Script...')
    pst_file = args.PST_FILE

    # Create and open the file outside the try/finally below, so a failure
    # here is not followed by close() on a handle that was never opened
    # (or never created), which would mask the original error.
    pff_file = pypff.file()
    try:
        pff_file.open(pst_file)
    except Exception:
        logging.exception('Could not open PST file %s', pst_file)
        raise

    try:
        process_folder(pff_file.root_folder, os.path.basename(pst_file))
    except Exception:
        # Record the traceback in the log before letting the error propagate.
        logging.exception('Failed while processing %s', pst_file)
        raise
    finally:
        pff_file.close()

    logging.info('Script Complete')
# Note: in case anyone else ends up here, see libyal/libpff#2 to check the status of the Python bindings for attachments.