Skip to content

Instantly share code, notes, and snippets.

@petermueller
Last active April 17, 2025 05:39
Show Gist options
  • Save petermueller/a664ef33f38cb2726bf3e0239798beb7 to your computer and use it in GitHub Desktop.
Save petermueller/a664ef33f38cb2726bf3e0239798beb7 to your computer and use it in GitHub Desktop.
Google Takeout Mbox Parsing w/ Livebook

Mbox Parsing

Mix.install([
  {:explorer, "~> 0.10.1"},
  {:kino, "~> 0.15.3"},
  {:kino_explorer, "~> 0.1.24"}
])

Analyze your mailbox to find the junk!

Run in Livebook

Go to Google Takeout and initiate an export.

It'll take a little bit.

Once unzipped/untarred, update the path variable below.

Experiment with the size passed to Enum.take below (start small and increase it): this Livebook is not particularly efficient, and a large .mbox file can cause timeouts.

import Kino.Shorts

alias Explorer.DataFrame, as: DF

:ok
# Absolute location of the Takeout mbox export; the leading "~" is expanded
# to the current user's home directory.
path = Path.expand("~/Documents/Takeout/Mail/All mail Including Spam and Trash.mbox")

# Sanity check: read only the first ten lines of the mbox file and strip
# surrounding whitespace so they can be inspected before the full parse.
first_few =
  path
  |> File.stream!()
  |> Enum.take(10)
  |> Enum.map(&String.trim/1)

# Render the sampled lines with Kino's tree inspector.
tree(first_few)
# Chunking step for Stream.chunk_while/4.
#
# `pending` holds the lines of the message currently being collected, in
# reverse order (newest first). A line starting with "From " (the mbox
# envelope separator) begins a new message:
#   * nothing pending      -> start collecting, emit nothing
#   * something pending    -> emit the pending lines in file order and start
#                             a new accumulator with the separator line
# Any other line is pushed onto the pending message.
chunk_fun = fn line, pending ->
  case {line, pending} do
    {"From " <> _, []} ->
      {:cont, [line]}

    {"From " <> _, _} ->
      {:cont, Enum.reverse(pending), [line]}

    _ ->
      {:cont, [line | pending]}
  end
end

# Finalizer for Stream.chunk_while/4, called once with whatever is still
# accumulated when the input is exhausted.
#
#   * []  - the input had no lines at all (e.g. an empty mbox file); there is
#           nothing to emit, so finish cleanly instead of raising.
#   * acc - lines of the last message in reverse order; emit them in file
#           order. This includes a final message consisting solely of its
#           "From " envelope line, so the end-of-input behavior matches what
#           chunk_fun does for the same message in the middle of the file.
#
# Returning `{:cont, acc}` emits nothing, while `{:cont, chunk, acc}` emits
# `chunk` as the final element of the stream.
after_fun = fn
  [] -> {:cont, []}
  acc -> {:cont, Enum.reverse(acc), []}
end

# Lazily read the mbox file line by line, drop each line's trailing newline,
# and group consecutive lines into one list per message (split on "From "
# envelope lines by chunk_fun / after_fun).
stream =
  path
  |> File.stream!()
  |> Stream.map(fn raw_line -> String.trim_trailing(raw_line, "\n") end)
  |> Stream.chunk_while([], chunk_fun, after_fun)
# Template row: every column the data frame will have, defaulting to nil so
# messages that lack a header still produce a complete row.
empty_msg_map = %{delivered_to: nil, from: nil, to: nil, subject: nil}

# Map a single line to a (possibly empty) keyword list.
#
# Lines that begin with one of the headers of interest yield a one-entry
# keyword list holding the text after the "Name: " prefix; every other line
# (including the "From " envelope separator) yields []. Matching is
# case-sensitive and requires the single space after the colon.
lines_to_keep = fn line ->
  case line do
    "Delivered-To: " <> value -> [delivered_to: value]
    "From: " <> value -> [from: value]
    "To: " <> value -> [to: value]
    "Subject: " <> value -> [subject: value]
    _other -> []
  end
end

# Turn each message (a list of lines) into a map with the keys of
# empty_msg_map, filled from the message's headers.
#
# Only the header section is scanned: headers end at the first blank line,
# and everything after it is the body. Scanning the body as well would let a
# line such as "From: ..." or "Subject: ..." inside a forwarded or quoted
# message overwrite the real header, because later duplicate keys win when
# the keyword list is collected into the map.
#
# A lone "\r" is also treated as the blank separator line, in case the file
# uses CRLF line endings and only the "\n" has been stripped upstream.
formatted_stream =
  Stream.map(stream, fn lines ->
    lines
    |> Enum.take_while(fn line -> line not in ["", "\r"] end)
    |> Enum.flat_map(lines_to_keep)
    |> Enum.into(empty_msg_map)
  end)

# Materialize at most the first 4000 parsed messages and load them into an
# Explorer data frame. The cap bounds how much of the mbox file is read.
sampled_messages = Enum.take(formatted_stream, 4000)
df = DF.new(sampled_messages)
# The `require` makes Explorer.DataFrame's macros available; the bare column
# references (`from`, `from_count`) in summarise/sort_by below rely on it.
require Explorer.DataFrame

# Rank senders by volume: group the rows by the "from" column, compute a
# per-sender `from_count`, and sort with the largest counts first.
# NOTE(review): the frame is switched to lazy mode and nothing in this cell
# collects it back — presumably the Livebook/kino_explorer rendering handles
# that; confirm if the result is consumed programmatically.
df
|> DF.lazy()
|> DF.group_by("from")
|> DF.summarise(from_count: count(from))
|> DF.sort_by(desc: from_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment