Mix.install([
{:explorer, "~> 0.10.1"},
{:kino, "~> 0.15.3"},
{:kino_explorer, "~> 0.1.24"}
])
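Go to Google Takeout and start an export of your mail.
Preparing the export can take a while.
Once you have downloaded and unzipped/untarred the archive, update the `path`
variable below so that it points to the extracted .mbox file.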
Experiment with the number passed to `Enum.take`
below: this Livebook is not particularly efficient, and processing a large .mbox
file can cause timeouts.
import Kino.Shorts
alias Explorer.DataFrame, as: DF
:ok
# Absolute location of the mbox export; `Path.expand/1` resolves the leading `~`.
path = Path.expand("~/Documents/Takeout/Mail/All mail Including Spam and Trash.mbox")
# Sanity check: read only the first ten lines of the file, strip surrounding
# whitespace from each, and render them so we can see the path is correct.
first_few =
  path
  |> File.stream!()
  |> Enum.take(10)
  |> Enum.map(&String.trim/1)

tree(first_few)
# Chunking step for `Stream.chunk_while/4`.
#
# The accumulator holds the lines of the message currently being collected,
# newest first. A line starting with "From " marks the start of a new message:
# the collected lines (if any) are emitted in reading order and the separator
# line seeds the next accumulator. Any other line is pushed onto the accumulator.
chunk_fun = fn line, collected ->
  cond do
    not String.starts_with?(line, "From ") ->
      {:cont, [line | collected]}

    collected == [] ->
      {:cont, [line]}

    true ->
      {:cont, Enum.reverse(collected), [line]}
  end
end
# Final step for `Stream.chunk_while/4`: decides what to do with the lines
# still held in the accumulator once the input is exhausted.
#
# Returns `{:cont, acc}` to emit nothing, or `{:cont, chunk, acc}` to emit a
# last chunk.
after_fun = fn
  # The accumulator is only empty when no line was ever read (e.g. an empty
  # file). There is nothing to flush, so finish cleanly instead of raising.
  [] -> {:cont, []}
  # A lone trailing "From " separator with no message lines after it is not
  # emitted as a message.
  [<<"From ", _rest::binary>> = line] -> {:cont, [line]}
  # The accumulator is newest-first; emit the last message in reading order.
  acc -> {:cont, Enum.reverse(acc), []}
end
# Lazily reads the mbox file line by line and groups the lines into one list
# per message (see `chunk_fun` / `after_fun`).
#
# Line terminators are removed before chunking. Lines may end in "\r\n" rather
# than just "\n"; stripping only "\n" would leave a stray "\r" at the end of
# every such line (and therefore in every extracted header value), so an
# optional trailing "\r" is removed as well.
stream =
  path
  |> File.stream!()
  |> Stream.map(fn line ->
    line
    |> String.trim_trailing("\n")
    |> String.trim_trailing("\r")
  end)
  |> Stream.chunk_while([], chunk_fun, after_fun)
# Template row: every extracted field defaults to nil so that all messages
# end up with the same set of keys.
empty_msg_map = %{delivered_to: nil, from: nil, to: nil, subject: nil}
# Maps a single line to a (possibly empty) keyword list.
#
# If the line starts with one of the known header prefixes (matched exactly,
# including case and the space after the colon), the result is a one-element
# keyword list holding the remainder of the line under the matching key.
# Every other line, including the "From " separator line, yields [].
lines_to_keep = fn line ->
  header_prefixes = [
    {"Delivered-To: ", :delivered_to},
    {"From: ", :from},
    {"To: ", :to},
    {"Subject: ", :subject}
  ]

  Enum.find_value(header_prefixes, [], fn {prefix, key} ->
    if String.starts_with?(line, prefix) do
      [{key, String.replace_prefix(line, prefix, "")}]
    end
  end)
end
# Turns each message (a list of lines) into a map with the keys of
# `empty_msg_map`, filled from the message's headers.
#
# Only the header section is scanned: headers end at the first blank line,
# and everything after it is body text. Scanning the body as well would let a
# quoted or forwarded "From: ..." / "Subject: ..." line overwrite the real
# header, because later entries win when the keyword list is merged into the
# map. A line holding only "\r" is also treated as blank so this works whether
# or not carriage returns were stripped upstream.
formatted_stream =
  Stream.map(stream, fn lines ->
    lines
    |> Enum.take_while(&(&1 not in ["", "\r"]))
    |> Enum.flat_map(lines_to_keep)
    |> Enum.into(empty_msg_map)
  end)
# Materialise at most the first 4000 messages and load them into a data frame.
# Lower this number if the cell times out on a large export.
df = DF.new(Enum.take(formatted_stream, 4000))
require Explorer.DataFrame

# Count how many messages each "from" value has, most frequent first.
grouped_by_sender =
  df
  |> DF.lazy()
  |> DF.group_by("from")

sender_counts = DF.summarise(grouped_by_sender, from_count: count(from))

DF.sort_by(sender_counts, desc: from_count)