Skip to content

Instantly share code, notes, and snippets.

@joshleecreates
Last active February 7, 2017 04:14
Show Gist options
  • Save joshleecreates/719f9fe1d91a9acd318697b65604daf0 to your computer and use it in GitHub Desktop.
Save joshleecreates/719f9fe1d91a9acd318697b65604daf0 to your computer and use it in GitHub Desktop.
An Elixir helper for classifying text
defmodule ClassificationHelper do
  @moduledoc """
  Helpers for turning the text fields of a collection of items into
  per-item word counts and a tabular representation of those counts.

  ## Example

      items = [
        %{name: "thing", description: "foo bar baz"},
        %{name: "thing two", description: "biz bump boozle biz"}
      ]

      word_counts = ClassificationHelper.maps_to_word_counts(items, :name, [:description])
      word_list = ClassificationHelper.generate_word_list(word_counts, 0.1, 0.75)
      ClassificationHelper.table(:counts, word_counts, word_list)
  """

  require Stemmer

  # Stop words are stored without apostrophes because `tokenize/1` strips all
  # non-letter characters before the stop-word filter runs.
  @stop_words ~w(
    a about above after again against all am an and any are arent as at be
    because been before being below between both but by cant cannot could
    couldnt did didnt do does doesnt doing dont down during each few for from
    further had hadnt has hasnt have havent having he hed hell hes her here
    heres hers herself him himself his how hows i id ill im ive if in into
    is isnt it its its itself lets me more most mustnt my myself no nor not of
    off on once only or other ought our ours ourselves out over own same shant
    she shed shell shes should shouldnt so some such than that thats the
    their theirs them themselves then there theres these they theyd theyll
    theyre theyve this those through to too under until up very was wasnt we
    wed well were weve were werent what whats when whens where wheres
    which while who whos whom why whys with wont would wouldnt you youd
    youll youre youve your yours yourself yourselves
  )

  @doc """
  Tokenizes `text` with `tokenize/1` and adds `weight` to the count of every
  resulting word in `state`.

  ## Parameters

    * `text` - the string to count words in.
    * `state` - an existing map of counts to accumulate into (default `%{}`).
    * `weight` - the amount added per occurrence (default `1`).

  Returns the updated map of `word_atom => count`.
  """
  @spec count_words(String.t(), map(), number()) :: map()
  def count_words(text, state \\ %{}, weight \\ 1) do
    text
    |> tokenize()
    |> Enum.reduce(state, fn word, acc ->
      # NOTE(review): String.to_atom/1 on arbitrary text grows the atom table
      # without bound (atoms are never garbage collected). Atom keys are kept
      # here so the shape of the returned map does not change for callers;
      # consider switching to string keys if the input is untrusted.
      #
      # The first occurrence must be seeded with `weight` (not 0), otherwise
      # every word is under-counted by one occurrence.
      Map.update(acc, String.to_atom(word), weight, &(&1 + weight))
    end)
  end

  @doc """
  Returns the words whose appearance ratio lies strictly between
  `lower_bound` and `upper_bound`.

  The ratio of a word is the number of rows of `word_counts_map` that contain
  the word (see `get_appearances/1`) divided by the total number of rows.
  Both bounds are exclusive.
  """
  @spec generate_word_list(Enumerable.t(), number(), number()) :: list()
  def generate_word_list(word_counts_map, lower_bound, upper_bound) do
    n = Enum.count(word_counts_map)

    # When there are no rows there are no appearances either, so the division
    # below is never evaluated with n == 0.
    for {word, appearances} <- get_appearances(word_counts_map),
        ratio = appearances / n,
        ratio > lower_bound and ratio < upper_bound,
        do: word
  end

  @doc """
  Counts, for every word, in how many rows of `word_counts_map` it occurs.

  `word_counts_map` is an enumerable of `{item, row}` pairs where `row` is a
  map keyed by word. A word counts once per row regardless of the value
  stored under it. Returns a map of `word => number_of_rows`.
  """
  @spec get_appearances(Enumerable.t()) :: map()
  def get_appearances(word_counts_map) do
    Enum.reduce(word_counts_map, %{}, fn {_item, row}, acc ->
      row
      |> Map.keys()
      |> Enum.reduce(acc, fn word, inner -> Map.update(inner, word, 1, &(&1 + 1)) end)
    end)
  end

  @doc """
  Builds a map of `item_name => word_counts` for a collection of items.

  ## Parameters

    * `items` - an enumerable whose entries are either `{key, item}` pairs
      (for example a map of `id => item`) or bare item maps.
    * `name_field` - the key of each item whose value becomes the key of the
      result map. Items sharing the same name overwrite one another.
    * `fields` - the item keys holding the text to count. Each entry is either
      a plain key (weight `1`) or a `{key, weight}` tuple.

  Fields that are absent (or `nil`) on an item are skipped.
  """
  @spec maps_to_word_counts(Enumerable.t(), term(), list()) :: map()
  def maps_to_word_counts(items, name_field, fields) do
    Map.new(items, fn entry ->
      item = unwrap_item(entry)
      counts = Enum.reduce(fields, %{}, &count_field_weighted(&1, item, &2))
      {Map.get(item, name_field), counts}
    end)
  end

  @doc """
  Renders word counts as a table (a list of rows).

  Only the `:counts` format is implemented. Its first row is the header
  `["" | word_list]`; every following row is `[name | values]`, one value per
  entry of `word_list`. An entry of `word_list` may be a plain column key or
  a `{column, weight}` tuple, in which case the stored count is multiplied by
  `weight`. Words missing from a row count as `0`.

  The `:bow` and `:tfidf` formats are placeholders and return `nil`.
  """
  @spec table(:bow | :tfidf | :counts, Enumerable.t(), list()) :: [list()] | nil
  def table(:bow, _word_count_maps, _word_list), do: nil

  def table(:tfidf, _word_count_maps, _word_list), do: nil

  def table(:counts, word_count_maps, word_list) do
    rows =
      Enum.map(word_count_maps, fn {name, row} ->
        [name | Enum.map(word_list, &get_column_weighted(&1, row))]
      end)

    [["" | word_list] | rows]
  end

  @doc """
  Splits `text` into normalized words.

  Every character that is not an ASCII letter or whitespace is removed, the
  text is downcased and split on whitespace, stop words are dropped, and the
  remaining words are passed to `Stemmer.stem/1`.
  """
  @spec tokenize(String.t()) :: [String.t()]
  def tokenize(text) do
    text
    # `A-Za-z` rather than `A-z`: the latter also spans the ASCII punctuation
    # between "Z" and "a" ([ \ ] ^ _ `), which would then survive the strip.
    |> String.replace(~r/[^A-Za-z\s]/u, "")
    |> String.downcase()
    |> String.split()
    |> Enum.reject(&(&1 in @stop_words))
    |> Stemmer.stem()
  end

  # private

  # Accepts both `{key, item}` entries and bare item maps.
  defp unwrap_item({_key, item}), do: item
  defp unwrap_item(item) when is_map(item), do: item

  # Adds the weighted word counts of one field of `item` to `acc`.
  # A field given as a plain key is counted with weight 1.
  defp count_field_weighted({field, weight}, item, acc) do
    case Map.get(item, field) do
      # A missing field contributes no words instead of crashing tokenize/1.
      nil -> acc
      text -> count_words(text, acc, weight)
    end
  end

  defp count_field_weighted(field, item, acc) do
    count_field_weighted({field, 1}, item, acc)
  end

  # Reads one column of `row` (0 when absent), scaled by the column weight.
  # A column given as a plain key has weight 1.
  defp get_column_weighted({col, weight}, row), do: Map.get(row, col, 0) * weight
  defp get_column_weighted(col, row), do: get_column_weighted({col, 1}, row)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment