Last active
July 10, 2023 12:46
-
-
Save HEKUCHAN/b0314e80eba8b94e4da89c4945b3ce58 to your computer and use it in GitHub Desktop.
Count by type of encoding in mbox
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pprint\n", | |
"import mailbox\n", | |
"import collections\n", | |
"from email.header import decode_header" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Path of the mbox archive to analyse.\n", | |
"mbox_path = \"./sample_mbox/all_mail.mbox\"\n", | |
"\n", | |
"# NOTE(review): `emails` is not populated by any cell visible in this notebook.\n", | |
"emails = []\n", | |
"\n", | |
"# Charset labels gathered by the scanning cell: combined, body-only, header-only.\n", | |
"encoding_list, body_encoding_list, header_encoding_list = [], [], []" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Open the mbox archive at `mbox_path`.\n", | |
"# create=False: mailbox.mbox defaults to create=True, which silently creates an\n", | |
"# empty file when the path is wrong and makes every count below come out empty.\n", | |
"# With create=False a missing file raises mailbox.NoSuchMailboxError instead.\n", | |
"mail_box = mailbox.mbox(mbox_path, create=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Collect the charset label of every text body part and every decoded header\n", | |
"# chunk in the mailbox. Labels are stored as reported (None when no charset is\n", | |
"# given), so aliases such as 'utf8' and 'utf-8' are counted separately.\n", | |
"\n", | |
"# Empty the accumulators in place so that re-running this cell does not\n", | |
"# append a second copy of every label and double the counts.\n", | |
"encoding_list.clear()\n", | |
"body_encoding_list.clear()\n", | |
"header_encoding_list.clear()\n", | |
"\n", | |
"for email in mail_box:\n", | |
"    # body charset: only text/plain and text/html parts are considered\n", | |
"    for part in email.walk():\n", | |
"        if part.get_content_type() in [\"text/plain\", \"text/html\"]:\n", | |
"            encoding = part.get_content_charset()\n", | |
"            encoding_list.append(encoding)\n", | |
"            body_encoding_list.append(encoding)\n", | |
"\n", | |
"    # header charset: decode_header yields one (value, charset) pair per chunk\n", | |
"    for header_value in email.values():\n", | |
"        for _byte, encoding in decode_header(header_value):\n", | |
"            encoding_list.append(encoding)\n", | |
"            header_encoding_list.append(encoding)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Counter({None: 632678,\n", | |
" 'utf-8': 91374,\n", | |
" 'iso-2022-jp': 8583,\n", | |
" 'us-ascii': 6297,\n", | |
" 'shift_jis': 1348,\n", | |
" 'ascii': 46,\n", | |
" 'iso-8859-1': 37,\n", | |
" 'windows-1252': 24,\n", | |
" 'unknown-8bit': 9,\n", | |
" 'utf8': 8,\n", | |
" 'gb2312': 1})\n" | |
] | |
} | |
], | |
"source": [ | |
"# Tally of every charset label collected from headers and bodies together.\n", | |
"combined_counts = collections.Counter(encoding_list)\n", | |
"pprint.pprint(combined_counts)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Counter({None: 632620,\n", | |
" 'utf-8': 55939,\n", | |
" 'us-ascii': 6226,\n", | |
" 'iso-2022-jp': 4873,\n", | |
" 'shift_jis': 467,\n", | |
" 'iso-8859-1': 29,\n", | |
" 'unknown-8bit': 9,\n", | |
" 'utf8': 4,\n", | |
" 'gb2312': 1})\n" | |
] | |
} | |
], | |
"source": [ | |
"# Tally of the charset labels found in message headers only.\n", | |
"header_counts = collections.Counter(header_encoding_list)\n", | |
"pprint.pprint(header_counts)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Counter({'utf-8': 35435,\n", | |
" 'iso-2022-jp': 3710,\n", | |
" 'shift_jis': 881,\n", | |
" 'us-ascii': 71,\n", | |
" None: 58,\n", | |
" 'ascii': 46,\n", | |
" 'windows-1252': 24,\n", | |
" 'iso-8859-1': 8,\n", | |
" 'utf8': 4})\n" | |
] | |
} | |
], | |
"source": [ | |
"# Tally of the charset labels declared by text/plain and text/html body parts.\n", | |
"body_counts = collections.Counter(body_encoding_list)\n", | |
"pprint.pprint(body_counts)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"celltoolbar": "Raw Cell Format", | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.11.0" | |
}, | |
"vscode": { | |
"interpreter": { | |
"hash": "de1c2a847b7180ec204216c2290a1b53c5177eb084c4a313a8105b305ac08b1c" | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment