Skip to content

Instantly share code, notes, and snippets.

@HEKUCHAN
Last active July 10, 2023 12:46
Show Gist options
  • Save HEKUCHAN/b0314e80eba8b94e4da89c4945b3ce58 to your computer and use it in GitHub Desktop.
Save HEKUCHAN/b0314e80eba8b94e4da89c4945b3ce58 to your computer and use it in GitHub Desktop.
Count by type of encoding in mbox
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pprint\n",
"import mailbox\n",
"import collections\n",
"from email.header import decode_header"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Path of the mbox archive to scan\n",
"mbox_path = \"./sample_mbox/all_mail.mbox\"\n",
"\n",
"# Not referenced by the other visible cells; kept as an empty list\n",
"emails = list()\n",
"\n",
"# Charset accumulators filled by the scanning loop:\n",
"# encoding_list receives every header and body charset,\n",
"# the other two receive body-only and header-only charsets.\n",
"encoding_list = list()\n",
"body_encoding_list = list()\n",
"header_encoding_list = list()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Open the existing mbox archive for reading.\n",
"# create=False: mailbox.mbox would otherwise create an empty file when the\n",
"# path is wrong, and every count below would silently come out empty.\n",
"# With create=False a missing file raises mailbox.NoSuchMailboxError instead.\n",
"mail_box = mailbox.mbox(mbox_path, create=False)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Start from empty accumulators so re-running this cell does not double-count.\n",
"encoding_list.clear()\n",
"body_encoding_list.clear()\n",
"header_encoding_list.clear()\n",
"\n",
"for email in mail_box:\n",
"    # Body charset: one entry per text/plain or text/html part.\n",
"    # get_content_charset() yields None when the part declares no charset.\n",
"    for part in email.walk():\n",
"        if part.get_content_type() in [\"text/plain\", \"text/html\"]:\n",
"            encoding = part.get_content_charset()\n",
"            encoding_list.append(encoding)\n",
"            body_encoding_list.append(encoding)\n",
"\n",
"    # Header charset: decode_header() splits each header value into\n",
"    # (decoded_chunk, charset) pairs; charset is None for unencoded chunks,\n",
"    # so one header may contribute several entries.\n",
"    for header_value in email.values():\n",
"        for _byte, encoding in decode_header(header_value):\n",
"            encoding_list.append(encoding)\n",
"            header_encoding_list.append(encoding)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Counter({None: 632678,\n",
" 'utf-8': 91374,\n",
" 'iso-2022-jp': 8583,\n",
" 'us-ascii': 6297,\n",
" 'shift_jis': 1348,\n",
" 'ascii': 46,\n",
" 'iso-8859-1': 37,\n",
" 'windows-1252': 24,\n",
" 'unknown-8bit': 9,\n",
" 'utf8': 8,\n",
" 'gb2312': 1})\n"
]
}
],
"source": [
"# Charset frequencies over headers and bodies combined\n",
"combined_counts = collections.Counter(encoding_list)\n",
"pprint.pprint(combined_counts)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Counter({None: 632620,\n",
" 'utf-8': 55939,\n",
" 'us-ascii': 6226,\n",
" 'iso-2022-jp': 4873,\n",
" 'shift_jis': 467,\n",
" 'iso-8859-1': 29,\n",
" 'unknown-8bit': 9,\n",
" 'utf8': 4,\n",
" 'gb2312': 1})\n"
]
}
],
"source": [
"# Charset frequencies for header chunks only\n",
"header_counts = collections.Counter(header_encoding_list)\n",
"pprint.pprint(header_counts)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Counter({'utf-8': 35435,\n",
" 'iso-2022-jp': 3710,\n",
" 'shift_jis': 881,\n",
" 'us-ascii': 71,\n",
" None: 58,\n",
" 'ascii': 46,\n",
" 'windows-1252': 24,\n",
" 'iso-8859-1': 8,\n",
" 'utf8': 4})\n"
]
}
],
"source": [
"# Charset frequencies for text/plain and text/html body parts only\n",
"body_counts = collections.Counter(body_encoding_list)\n",
"pprint.pprint(body_counts)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"celltoolbar": "Raw Cell Format",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
},
"vscode": {
"interpreter": {
"hash": "de1c2a847b7180ec204216c2290a1b53c5177eb084c4a313a8105b305ac08b1c"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment