Last active
March 20, 2017 02:04
-
-
Save antoine/c0ede64f000cf316ace367bbd9cf524c to your computer and use it in GitHub Desktop.
implementation of https://www.futurelearn.com/courses/functional-programming-erlang/1/assignments/161822/submission/new
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(index). | |
-export([get_file_contents/1, | |
show_file_contents/1, | |
index_file/1, | |
index_sample_files/0]). | |
-include_lib("eunit/include/eunit.hrl"). | |
%% Index each of the two sample texts in turn and pretty-print the
%% resulting index to standard output.
index_sample_files() ->
    lists:foreach(
        fun(FileName) ->
            io:format("~p~n", [index_file(FileName)])
        end,
        ["gettysburg-address.txt", "dickens-christmas.txt"]).
%% Words that index_words/3 leaves out of the index regardless of length.
commonWords() ->
    Stopwords = ["on", "the", "that"],
    Stopwords.
%% Compute the word index of the file called Name.
%% The file is read into lines, each line is normalised into words, every
%% indexed word is paired with its collated line ranges, and the entries
%% are returned sorted by the word itself.
index_file(Name) ->
    Lines = get_file_contents(Name),
    WordsPerLine = words(Lines),
    Entries = collate(index_words(1, WordsPerLine, WordsPerLine)),
    %% Each entry is {Word, Ranges}; sorting on element 1 orders by word.
    lists:keysort(1, Entries).
%% Turn every {Word, LineNumbers} entry into {Word, Ranges}.
%% The first line number seeds collate_lines/3 as both ends of the current
%% range; the list it returns is reversed before being stored.
%% An entry with an empty line-number list is not accepted and crashes.
collate(Entries) ->
    lists:map(
        fun({Word, [First | Others]}) ->
            Ranges = collate_lines(First, First, Others),
            {Word, lists:reverse(Ranges)}
        end,
        Entries).
%% Collapse a list of line numbers into {Current, Begin} ranges.
%% Begin is the number that opened the current range and Current the last
%% number added to it. A following number N extends the range only when
%% Current - N is exactly 1 (i.e. the list counts down by one); any other
%% difference closes the range and starts a new one at N.
collate_lines(Begin, Current, []) ->
    [{Current, Begin}];
collate_lines(Begin, Current, [Next | Rest]) when Current - Next =:= 1 ->
    collate_lines(Begin, Next, Rest);
collate_lines(Begin, Current, [Next | Rest]) ->
    [{Current, Begin} | collate_lines(Next, Next, Rest)].
%% Normalise every line and split it into its words.
%% Punctuation is removed first, then capitals are lowered, then the line
%% is split on spaces. An empty line yields an empty word list, so the
%% result has exactly one entry per input line.
words(Lines) ->
    [split_on_space(nocaps(nopunc(Line)), []) || Line <- Lines].
%% Build the raw index entries for the text.
%%
%% LNb      - line number given to the first line of AllLines; it is passed
%%            unchanged to index_word/4.
%% Lines    - the lines (lists of words) still to be scanned.
%% AllLines - every line of the text, minus the words already indexed.
%%
%% Words of at most two characters and words listed in commonWords/0 are
%% skipped. Every other word produces one {Word, LineNumbers} entry and is
%% then removed from both Lines and AllLines so it is only indexed once.
%%
%% Returns the entries in order of first appearance. An empty list of lines
%% (e.g. an empty file) yields an empty index.
index_words(_LNb, [], _AllLines) ->
    [];
index_words(LNb, [[] | Lines], AllLines) ->
    %% Current line is exhausted: move on to the next one.
    index_words(LNb, Lines, AllLines);
index_words(LNb, [[Word | Line] | Lines], AllLines) ->
    %% Cheap length test first; orelse skips the list search for short words.
    case length(Word) =< 2 orelse member(Word, commonWords()) of
        true ->
            index_words(LNb, [Line | Lines], AllLines);
        false ->
            [index_word(LNb, Word, AllLines, [])
             | index_words(LNb,
                           remove_word_lines(Word, [Line | Lines]),
                           remove_word_lines(Word, AllLines))]
    end.
%% Collect the numbers of the lines in which Word occurs.
%% LNb is the number of the first line of the list; it grows by one per
%% line. Matching numbers are pushed onto the front of the accumulator, so
%% the result {Word, Numbers} lists the highest line number first.
index_word(_LNb, Word, [], Found) ->
    {Word, Found};
index_word(LNb, Word, [Line | Rest], Found) ->
    Updated =
        case member(Word, Line) of
            true -> [LNb | Found];
            false -> Found
        end,
    index_word(LNb + 1, Word, Rest, Updated).
%% Remove every occurrence of W from each line of a list of lines.
%% Lines are kept in place even when they end up empty.
remove_word_lines(W, Lines) ->
    [remove_word_line(W, Line) || Line <- Lines].
%% Remove every occurrence of W from a single line (a list of words),
%% keeping the remaining words in their original order.
remove_word_line(W, Line) ->
    [Other || Other <- Line, Other =/= W].
%% EUnit test: removing "A" strips it from every line and keeps a line that
%% becomes empty as [].
remove_word_lines_test() ->
    Input = [["A", "B"], ["C", "D"], ["A", "A"]],
    Expected = [["B"], ["C", "D"], []],
    Expected = remove_word_lines("A", Input).
%% Split a string into words on the space character.
%% The second argument accumulates the characters of the word being read,
%% in reverse order; callers start with []. Runs of spaces never produce
%% empty words.
split_on_space([], []) ->
    [];
split_on_space([], Pending) ->
    %% End of input: flush the word in progress.
    [lists:reverse(Pending)];
split_on_space([$\s | Rest], []) ->
    %% Space with no word in progress: skip it.
    split_on_space(Rest, []);
split_on_space([$\s | Rest], Pending) ->
    %% Space ends the current word.
    [lists:reverse(Pending) | split_on_space(Rest, [])];
split_on_space([Char | Rest], Pending) ->
    split_on_space(Rest, [Char | Pending]).
%% Remove punctuation characters from a string.
%% Besides , - . \ ' ( ) [ ] ` and " this also drops ; : ! and ?, so that a
%% word followed by one of those marks is indexed as the same word as its
%% bare form.
nopunc(Chars) ->
    [C || C <- Chars, not lists:member(C, ",-.\\'()[]`\";:!?")].
%% Apply nocap/1 to every character of a string.
nocaps(Chars) ->
    [nocap(Char) || Char <- Chars].
%% Lower-case a single character: values in the range $A..$Z are shifted
%% by 32, everything else is returned unchanged.
nocap(X) when $A =< X, X =< $Z ->
    X + 32;
nocap(X) ->
    X.
%% true when Item is exactly equal to some element of the list,
%% false otherwise.
member(_Item, []) ->
    false;
member(Item, [Head | Tail]) ->
    Item =:= Head orelse member(Item, Tail).
%% Read a text file into a list of lines, in file order.
%% Example files:
%%   gettysburg-address.txt (short)
%%   dickens-christmas.txt (long)
%% Line endings are handled by get_all_lines/2, which collects the lines
%% in reverse; the result is put back into file order here.
%% Crashes with a badmatch if the file cannot be opened.
get_file_contents(Name) ->
    {ok, Handle} = file:open(Name, [read]),
    lists:reverse(get_all_lines(Handle, [])).
%% Auxiliary function for get_file_contents/1. Not exported.
%% Reads the remaining lines of File and pushes each one onto the front of
%% Partial, so the result holds the lines in reverse order. The file is
%% closed once end of file is reached.
%% A trailing newline is removed only when the line actually ends with one,
%% so a final line without a newline keeps its last character.
%% A read error closes the file and raises {read_failed, Reason}.
get_all_lines(File, Partial) ->
    case io:get_line(File, "") of
        eof ->
            file:close(File),
            Partial;
        {error, Reason} ->
            file:close(File),
            error({read_failed, Reason});
        Line ->
            get_all_lines(File, [strip_newline(Line) | Partial])
    end.

%% Drop one trailing newline character from Line, if present.
strip_newline(Line) ->
    case lists:reverse(Line) of
        [$\n | Reversed] -> lists:reverse(Reversed);
        _ -> Line
    end.
%% Print each string of the list on its own line and return ok.
%% Can be used to check the result of get_file_contents/1.
show_file_contents(Lines) ->
    lists:foreach(
        fun(Line) -> io:format("~s~n", [Line]) end,
        Lines).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment