Last active
April 10, 2025 20:35
-
-
Save benwattsjones/060ad83efd2b3afc8b229d41f9b246c4 to your computer and use it in GitHub Desktop.
Quick python code to parse mbox files, specifically those used by GMail. Extracts sender, date, plain text contents etc., ignores base64 attachments.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import mailbox

import bs4
def get_html_text(html):
    """Return the visible text of an HTML document, or None if it has none.

    Args:
        html: HTML markup to parse with BeautifulSoup's 'lxml' parser.

    Returns:
        The text of the parsed ``<body>`` element, with each text fragment
        stripped and the fragments joined by single spaces. ``None`` is
        returned when ``html`` is empty/``None`` or when the parsed document
        has no ``<body>`` element (message contents empty).
    """
    # Nothing to parse: short-circuit instead of handing None/'' to the parser.
    if not html:
        return None
    body = bs4.BeautifulSoup(html, 'lxml').body
    if body is None:  # message contents empty
        return None
    return body.get_text(' ', strip=True)
class GmailMboxMessage():
    """Wrap a ``mailbox.mboxMessage`` and extract its headers and text parts.

    Base64-encoded parts are not decoded; their text is reported as ``None``.
    """

    def __init__(self, email_data):
        """Store the message to parse.

        Args:
            email_data: The ``mailbox.mboxMessage`` to wrap.

        Raises:
            TypeError: If ``email_data`` is not a ``mailbox.mboxMessage``.
        """
        if not isinstance(email_data, mailbox.mboxMessage):
            raise TypeError('Variable must be type mailbox.mboxMessage')
        self.email_data = email_data

    def parse_email(self):
        """Collect the message headers and text parts.

        Returns:
            A dict with the keys ``'labels'`` (the ``X-Gmail-Labels``
            header), ``'date'``, ``'from'``, ``'to'``, ``'subject'`` and
            ``'text'`` (the result of ``read_email_payload``). Header values
            are ``None`` when the header is absent.
        """
        return {
            'labels': self.email_data['X-Gmail-Labels'],
            'date': self.email_data['Date'],
            'from': self.email_data['From'],
            'to': self.email_data['To'],
            'subject': self.email_data['Subject'],
            'text': self.read_email_payload(),
        }

    def read_email_payload(self):
        """Return one ``(content_type, encoding, text)`` tuple per leaf part.

        A multipart message is flattened into its non-multipart parts. A
        non-multipart message is treated as a single part, so that its own
        Content-Type and Content-Transfer-Encoding headers are honoured.
        """
        if self.email_data.is_multipart():
            parts = self._get_email_messages(self.email_data.get_payload())
        else:
            # Pass the message itself rather than its bare payload string,
            # otherwise the content type and transfer encoding are lost.
            parts = [self.email_data]
        return [self._read_email_text(part) for part in parts]

    def _get_email_messages(self, email_payload):
        """Yield every non-multipart part found in ``email_payload``.

        Nested lists/tuples and nested multipart parts are walked
        recursively, in order.
        """
        for msg in email_payload:
            if isinstance(msg, (list, tuple)):
                yield from self._get_email_messages(msg)
            elif msg.is_multipart():
                yield from self._get_email_messages(msg.get_payload())
            else:
                yield msg

    @staticmethod
    def _decode_text_payload(msg):
        """Return the payload of ``msg``, decoding quoted-printable bodies.

        Payloads with any other transfer encoding are returned exactly as
        ``msg.get_payload()`` gives them.
        """
        payload = msg.get_payload()
        encoding = str(msg.get('Content-Transfer-Encoding', '')).strip().lower()
        if not isinstance(payload, str) or encoding != 'quoted-printable':
            return payload
        raw = msg.get_payload(decode=True)
        if raw is None:
            return payload
        charset = msg.get_content_charset() or 'utf-8'
        try:
            return raw.decode(charset, errors='replace')
        except LookupError:  # charset name unknown to Python
            return raw.decode('utf-8', errors='replace')

    def _read_email_text(self, msg):
        """Extract the text of a single message part.

        Args:
            msg: A message part, or a plain string of unknown content type.

        Returns:
            A tuple ``(content_type, encoding, msg_text)``. For a string,
            both ``content_type`` and ``encoding`` are ``'NA'`` and the text
            is the result of ``get_html_text``. For a message part,
            ``msg_text`` is the payload for ``text/plain``, the
            ``get_html_text`` result for ``text/html``, and ``None`` for
            any other content type or for base64-encoded parts.
        """
        if isinstance(msg, str):
            return ('NA', 'NA', get_html_text(msg))

        content_type = msg.get_content_type()
        encoding = msg.get('Content-Transfer-Encoding', 'NA')
        # Header values are case-insensitive, so compare in lower case while
        # still reporting the header exactly as it appeared.
        if 'base64' in str(encoding).lower():
            msg_text = None
        elif 'text/plain' in content_type:
            msg_text = self._decode_text_payload(msg)
        elif 'text/html' in content_type:
            msg_text = get_html_text(self._decode_text_payload(msg))
        else:
            msg_text = None
        return (content_type, encoding, msg_text)
######################### End of library, example of use below
# Example: parse every message of a Gmail mbox export and report progress.
# create=False makes a mistyped path raise mailbox.NoSuchMailboxError instead
# of silently creating a new, empty mailbox file.
mbox_obj = mailbox.mbox('path/to/your-mbox-file-from-gmail.mbox', create=False)
try:
    num_entries = len(mbox_obj)
    # Count from 1 so the final message reads "N of N".
    for idx, email_obj in enumerate(mbox_obj, start=1):
        email_data = GmailMboxMessage(email_obj)
        email_data.parse_email()
        print('Parsing email {0} of {1}'.format(idx, num_entries))
finally:
    # Release the file handle held by the mailbox.
    mbox_obj.close()
"Is there a way" yes; but it depends on what exactly you mean. A common solution to extracting just the text from HTML payloads is to run Beautifulsoup on the HTML. If you want to trim off quoted text from earlier messages in a thread, I don't know of any existing libraries for that (but that doesn't mean there aren't any). Similarly, you might want to trim signature blocks (honest-to-RFC signatures start with newline, dash, dash, space, newline; but very few modern signatures adhere to this convention).
See also:
For anyone who wants useful output out of the HTML: you'll want
msg.get_payload(decode=True).decode()
This would have saved me a lot of heartburn.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
email_text does not contain only the plain email text; it also contains noise (such as CSS rules). Is there a way to extract just the plain text?