An Elixir helper for classifying text
@doc """ | |
Example Usage | |
``` | |
items = [ | |
%{ | |
name: "thing", | |
description: "foo bar baz" | |
}, | |
%{ | |
name: "thing two", | |
description: "biz bump boozle biz" | |
}, | |
] | |
word_counts = maps_to_word_counts(items) | |
word_list = generate_word_list(word_counts, 0.1, 0.5) | |
table(:counts, word_counts, word_list) | |
``` | |
""" | |
defmodule ClassificationHelper do | |
require Stemmer | |
  @stop_words ~w(
    a about above after again against all am an and any are arent as at be
    because been before being below between both but by cant cannot could
    couldnt did didnt do does doesnt doing dont down during each few for from
    further had hadnt has hasnt have havent having he hed hell hes her here
    heres hers herself him himself his how hows i id ill im ive if in into
    is isnt it its its itself lets me more most mustnt my myself no nor not of
    off on once only or other ought our ours ourselves out over own same shant
    she shed shell shes should shouldnt so some such than that thats the
    their theirs them themselves then there theres these they theyd theyll
    theyre theyve this those through to too under until up very was wasnt we
    wed well were weve were werent what whats when whens where wheres
    which while who whos whom why whys with wont would wouldnt you youd
    youll youre youve your yours yourself yourselves
  )

  # Counts the words in `text`, adding `weight` to the running total for each
  # occurrence. Counts accumulate into `state`, so repeated calls can build up
  # counts across several fields.
  def count_words(text, state \\ %{}, weight \\ 1) do
    text
    |> tokenize()
    |> Enum.reduce(state, fn word, acc ->
      Map.update(acc, String.to_atom(word), weight, &(&1 + weight))
    end)
  end
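
  # Illustrative example (assumes the Stemmer package stems "running" to "run"
  # and that tokenize/1 below drops the stop word "and"):
  #
  #     count_words("Running and running", %{}, 2)
  #     #=> %{run: 4}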

  # Builds the list of words whose document frequency lies strictly between
  # `lower_bound` and `upper_bound`, expressed as fractions of the number of
  # documents in `word_counts_map`.
  def generate_word_list(word_counts_map, lower_bound, upper_bound) do
    n = Enum.count(word_counts_map)

    word_counts_map
    |> get_appearances()
    |> Enum.filter(fn {_word, appearances} ->
      appearances / n > lower_bound && appearances / n < upper_bound
    end)
    |> Enum.map(fn {word, _appearances} -> word end)
  end
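
  # Illustrative example: with 10 documents, generate_word_list(word_counts, 0.1, 0.5)
  # keeps only the words that appear in 2 to 4 distinct documents.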

  # Counts, for each word, how many documents (rows) it appears in at least once.
  def get_appearances(word_counts_map) do
    Enum.reduce(word_counts_map, %{}, fn {_item, row}, acc ->
      row_appearances =
        row
        |> Map.keys()
        |> Enum.map(&{&1, Map.get(acc, &1, 0) + 1})
        |> Enum.into(%{})

      Map.merge(acc, row_appearances)
    end)
  end
@doc """ | |
""" | |
def maps_to_word_counts(items, name_field, fields) do | |
items | |
|> Enum.map(fn({_, item}) -> | |
{ | |
Map.get(item, name_field), | |
Enum.reduce(fields, %{}, &(count_field_weighted(&1, item, &2))) | |
} | |
end) | |
|> Enum.into(%{}) | |
end | |
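
  # Illustrative example (atom keys come from String.to_atom/1 in count_words/3;
  # "foo", "bar" and "baz" pass through the stemmer unchanged):
  #
  #     maps_to_word_counts(%{1 => %{name: "thing", description: "foo bar baz"}},
  #                         :name, [:description])
  #     #=> %{"thing" => %{foo: 1, bar: 1, baz: 1}}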

  # Bag-of-words and tf-idf representations are placeholders for now.
  def table(:bow, _word_count_maps, _word_list) do
  end

  def table(:tfidf, _word_count_maps, _word_list) do
  end

  # Builds a table of raw counts: a header row of the word list followed by
  # one `[name | counts]` row per document.
  def table(:counts, word_count_maps, word_list) do
    rows =
      Enum.map(word_count_maps, fn {name, row} ->
        [name | Enum.map(word_list, &get_column_weighted(&1, row))]
      end)

    [["" | word_list] | rows]
  end

  # Normalises `text` into a list of stemmed, lowercased words, with
  # punctuation and stop words removed.
  def tokenize(text) do
    text
    |> String.replace(~r/[^A-Za-z\s]/u, "")
    |> String.downcase()
    |> String.split()
    |> Enum.reject(&(&1 in @stop_words))
    |> Stemmer.stem()
  end
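
  # Illustrative example (assumes the Stemmer package reduces "cats" to "cat"
  # and "running" to "run"):
  #
  #     tokenize("The cats are running!")
  #     #=> ["cat", "run"]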

  # Private helpers

  defp count_field_weighted({field, weight}, item, acc) do
    count_words(Map.get(item, field), acc, weight)
  end

  defp count_field_weighted(field, item, acc) do
    count_field_weighted({field, 1}, item, acc)
  end

  defp get_column_weighted({col, weight}, row), do: Map.get(row, col, 0) * weight
  defp get_column_weighted(col, row), do: get_column_weighted({col, 1}, row)
end
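
A minimal end-to-end sketch of how the module might be used, assuming the `stemmer` Hex package is available as a dependency. The item data and field names are made up, and the exact stemmed keys (e.g. `:appl`, `:orang`) depend on the stemmer's output:

```
items = %{
  1 => %{name: "doc one", description: "apples and oranges"},
  2 => %{name: "doc two", description: "oranges and pears"},
  3 => %{name: "doc three", description: "pears and plums"}
}

# Word counts per item, keyed by the :name field.
word_counts = ClassificationHelper.maps_to_word_counts(items, :name, [:description])

# With 3 documents, bounds of 0.1 and 0.9 keep words that appear in 1 or 2 of them.
word_list = ClassificationHelper.generate_word_list(word_counts, 0.1, 0.9)

# A header row of words followed by one row of raw counts per document.
ClassificationHelper.table(:counts, word_counts, word_list)
```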