Created
November 3, 2021 05:38
-
-
Save kadin2048/ad0feef9f4f4230cd207907eceb17452 to your computer and use it in GitHub Desktop.
Turn a Pidgin HTML chatlog into a Thunderbird-compatible .eml file so that it can be imported into IMAP for archive purposes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Turn a Pidgin HTML chatlog into a Thunderbird-compatible .eml file
# so that it can be imported into Gmail for archive purposes.
# | |
# Syntax: $ python pidgintoeml.py pidginlogfile.html [outputfilename.eml] | |
# | |
# Version: 2011-09-28 | |
# | |
import sys | |
import datetime | |
import os.path | |
from email.MIMEMultipart import MIMEMultipart | |
from email.MIMEText import MIMEText | |
# Debug flag; set to True to print progress information to stdout,
# leave as False to suppress it.  (A module-level `global` statement is a
# no-op, so the flag is simply assigned here; functions only read it.)
debug = False
def main():
    """Convert the chat log named on the command line into an .eml file.

    Command line: ``pidgintoeml.py pidginlogfile.html [outputfilename.eml]``.
    When no output name is given, ``.eml`` is appended to the input name.

    The message is built completely in memory before the output file is
    created, so a failed conversion never leaves an empty .eml behind (which
    would make every later run stop with "already exists").

    Returns:
        0 on success, 1 on any error (a message is written to stderr).
    """
    args = sys.argv[1:]

    # First argument is the input file; an empty string counts as missing.
    if not args or not args[0]:
        sys.stderr.write("No input file specified.\n")
        return 1
    filename = args[0]

    # Second arg, if present, is the output file
    if len(args) > 1:
        outfilename = args[1]
    else:
        outfilename = filename + '.eml'

    # Single-argument print(...) parses the same way on Python 2 and 3.
    if debug:
        print("-- Reading from " + filename)
        print("-- Writing to " + outfilename)

    # Test to see if the output file already exists (processed already)
    if os.path.isfile(outfilename):
        sys.stderr.write("Output file " + outfilename + " already exists. Terminating.\n")
        return 1

    extension = os.path.splitext(filename)[1]

    # Create a message object
    msg_base = MIMEMultipart('mixed')

    try:
        fi = open(filename, 'r')
    except IOError:
        sys.stderr.write("IO Error while opening files.\n")
        return 1

    # `with` guarantees the input is closed on every exit path below.
    with fi:
        if extension == '.html':
            # For probable Pidgin logs (ending in .html)...
            # Process the first line of the input file to determine the eml headers
            try:
                determineHTMLLogHeaders(fi.readline(), msg_base)
            except ValueError as err:
                sys.stderr.write("Could not parse log header: " + str(err) + "\n")
                return 1
            fi.seek(0)  # rewind so the payload contains the whole file
            if debug:
                print("-- Headers after parsing first line are...")
                for key, value in msg_base.items():
                    print(key + ": " + value)
            # Create the HTML payload using the entire file
            doctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">\n'
            ht = doctype + fi.read()
            if debug:
                print("-- HTML body is of type:")
                print(type(ht))
            # Attach the HTML to the root
            msg_base.attach(MIMEText(ht, 'html'))
        if extension == '.chatlog':
            # For XML formatted chat logs
            sys.stderr.write("XML chatlog processing not implemented. Terminating.\n")
            return 1

    if debug:
        print("-- Ready to flatten and write message...")

    # Flatten first: if this raises, no output file has been created yet.
    flattened = msg_base.as_string()

    try:
        fo = open(outfilename, 'w')
    except IOError:
        sys.stderr.write("IO Error while opening files.\n")
        return 1
    with fo:
        fo.write(flattened)

    if debug:
        print("-- Complete.")
    return 0
def determineHTMLLogHeaders(firstline, msg_base, utc_offset="-0500 (EST)"):
    """Set From, To, Date and Subject on msg_base from a log's first line.

    The <title> element in firstline is expected to read
    ``Conversation with <buddy> at <MM/DD/YYYY HH:MM:SS AM|PM> on <account>``.
    Everything is parsed before msg_base is touched, so a failure leaves the
    message without partially-filled headers.

    Args:
        firstline: First line of the HTML log, containing the <title> element.
        msg_base: Message object (or any mapping supporting item assignment)
            that receives the headers.
        utc_offset: Zone suffix appended to the Date header.  The timestamp in
            the title carries no zone information, so it cannot be derived
            from the log; the default keeps the historical fixed value.

    Raises:
        ValueError: If the <title> element or any of its expected parts is
            missing, or the timestamp does not match the expected format.
    """
    open_tag = "<title>"
    start = firstline.find(open_tag)
    end = firstline.find("</title>")
    # str.find returns -1 when absent; slicing with that yields garbage.
    if start == -1 or end < start:
        raise ValueError("no <title> element found in first line")
    title = firstline[start + len(open_tag):end]
    if debug:
        print("<title>: " + title)

    # Walk the title left to right so that each marker is only searched for
    # after the previous one (a buddy name containing " on " no longer
    # confuses the account/date split).
    intro = "Conversation with "
    lead, has_intro, remainder = title.partition(intro)
    sender, has_at, remainder = remainder.partition(" at ")
    logdate, has_on, recipient = remainder.partition(" on ")
    if not (has_intro and has_at and has_on):
        raise ValueError("unrecognized log title: " + title)

    # Turn the timestamp into a datetime object (raises ValueError on mismatch)
    d = datetime.datetime.strptime(logdate, '%m/%d/%Y %I:%M:%S %p')

    # Determine the 'From' address of the chat
    msg_base['From'] = sender
    if debug:
        print("-- From is: " + sender)

    # Determine the 'To' address
    msg_base['To'] = recipient
    if debug:
        print("-- To is: " + recipient)

    # Write the date out in RFC822 format.  The zone suffix is appended after
    # strftime so that a '%' in utc_offset is never treated as a directive.
    msg_base['Date'] = d.strftime("%a, %d %b %Y %H:%M:%S") + " " + utc_offset

    # And the message subject: everything in the title before the account
    msg_base['Subject'] = lead + intro + sender + " at " + logdate
if __name__ == "__main__":
    # Run the converter and hand main()'s return value to the shell as the
    # process exit status.
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Please note this is for Python 2.x and will not work on Python 3 without some modifications.