antoine · March 20, 2017 02:04
diff --git a/index.erl b/index.erl
 -module(index).
 -export([get_file_contents/1,
         show_file_contents/1,
         index_file/1,
         index_sample_files/0]).
 -include_lib("eunit/include/eunit.hrl").

 %% Indexes the two sample text files named below, one after the other, and
 %% pretty-prints each resulting index with ~p. Returns ok.
 index_sample_files() ->
     lists:foreach(
         fun(FileName) -> io:format("~p~n", [index_file(FileName)]) end,
         ["gettysburg-address.txt", "dickens-christmas.txt"]
     ).

 %% Stop words: index_words/3 skips any word that appears in this list.
 commonWords() ->
     string:lexemes("on the that", " ").

 %% Reads the file Name, indexes its words and returns the index entries
 %% ordered by word. Each entry is a {Word, Ranges} tuple as produced by
 %% collate/1.
 index_file(Name) ->
     Lines = get_file_contents(Name),
     Words = words(Lines),
     %% Line numbering starts at 1; the word matrix is passed twice because
     %% index_words/3 consumes one copy while scanning the other for matches.
     RawIndex = index_words(1, Words, Words),
     lists:keysort(1, collate(RawIndex)).

 %% Converts every raw {Word, LineNumbers} entry into {Word, Ranges}.
 %% The ranges come from collate_lines/3 and are reversed before being
 %% stored. LineNumbers must be non-empty.
 collate(Entries) ->
     lists:map(
         fun({Word, [N | Ns]}) ->
             {Word, lists:reverse(collate_lines(N, N, Ns))}
         end,
         Entries
     ).

 %% Groups a list of line numbers into {Current, Begin} pairs, one per run.
 %% Begin is the number that opened the current run and Current the most
 %% recently absorbed one. A number exactly one below Current extends the
 %% run; any other number closes the run and starts a new one.
 %% index_word/4 builds its list by prepending increasing line numbers, so
 %% with that input each pair reads {FirstLine, LastLine}.
 collate_lines(Begin, Current, []) ->
     [{Current, Begin}];
 collate_lines(Begin, Current, [Next | Rest]) ->
     Gap = Current - Next,
     if
         Gap =:= 1 ->
             collate_lines(Begin, Next, Rest);
         true ->
             [{Current, Begin} | collate_lines(Next, Next, Rest)]
     end.

 %% Normalises every line (punctuation removed via nopunc/1, upper case
 %% folded via nocaps/1) and splits it into a list of words.
 %% The result has one word list per input line; an empty line gives [].
 words(Lines) ->
     [split_on_space(nocaps(nopunc(Line)), []) || Line <- Lines].
  
 %% Builds the raw index: one {Word, LineNumbers} entry per distinct word.
 %%
 %% LNb      - number assigned to the first line of AllLines.
 %% Lines    - the words still to be visited, one list of words per line.
 %% AllLines - the full word matrix used to look up the lines of each word.
 %%
 %% Words listed in commonWords/0 or at most two characters long are skipped.
 %% Once a word has been indexed it is removed from both Lines and AllLines,
 %% so every word is indexed exactly once.
 %% An empty list of lines (for example an empty file) yields an empty index
 %% instead of failing with function_clause.
 index_words(_LNb, [], _AllLines) ->
     [];
 index_words(LNb, [[] | Lines], AllLines) ->
     index_words(LNb, Lines, AllLines);
 index_words(LNb, [[Word | Line] | Lines], AllLines) ->
     case member(Word, commonWords()) orelse (length(Word) =< 2) of
         true ->
             index_words(LNb, [Line | Lines], AllLines);
         false ->
             Entry = index_word(LNb, Word, AllLines, []),
             Remaining = remove_word_lines(Word, [Line | Lines]),
             RemainingAll = remove_word_lines(Word, AllLines),
             [Entry | index_words(LNb, Remaining, RemainingAll)]
     end.

 %% Collects the numbers of all lines that contain Word.
 %% LNb is the number of the first line in the list and is incremented for
 %% each line visited. Matching numbers are prepended to the accumulator, so
 %% the returned {Word, LineNumbers} lists the highest line number first.
 index_word(_LNb, Word, [], Matched) ->
     {Word, Matched};
 index_word(LNb, Word, [Line | Rest], Matched) ->
     Updated =
         case member(Word, Line) of
             true -> [LNb | Matched];
             false -> Matched
         end,
     index_word(LNb + 1, Word, Rest, Updated).

 %% Removes every occurrence of W from a matrix of words (a list of lines,
 %% each a list of words). The number of lines is unchanged; a line that
 %% only contained W becomes [].
 remove_word_lines(W, Lines) ->
     [remove_word_line(W, Line) || Line <- Lines].

 %% Removes every occurrence of W from a single line (a list of words),
 %% keeping the remaining words in their original order.
 remove_word_line(W, Line) ->
     [Other || Other <- Line, Other =/= W].

 %% EUnit test: every occurrence of the word is dropped from each line while
 %% the line structure is kept, including a line that becomes empty.
 remove_word_lines_test() ->
     Lines = [["A", "B"], ["C","D"], ["A", "A"]],
     Expected = [["B"], ["C","D"], []],
     Expected = remove_word_lines("A", Lines).

 %% Splits a string into words on the space character.
 %% The second argument accumulates the characters of the word being read,
 %% in reverse order; callers start with []. Runs of spaces and leading or
 %% trailing spaces do not produce empty words.
 split_on_space([], []) ->
     [];
 split_on_space([], Acc) ->
     [lists:reverse(Acc)];
 split_on_space([$\s | Rest], []) ->
     %% Space with no word in progress: nothing to emit.
     split_on_space(Rest, []);
 split_on_space([$\s | Rest], Acc) ->
     %% Space ends the word in progress.
     [lists:reverse(Acc) | split_on_space(Rest, [])];
 split_on_space([Char | Rest], Acc) ->
     split_on_space(Rest, [Char | Acc]).


 %% Removes punctuation characters from a string.
 %% Stripped characters: , - . \ ' ( ) [ ] ` " ; : ! ?
 %% The last four are included so that a word followed by one of them
 %% (e.g. "dead;" or "scrooge!") is indexed under the same entry as the
 %% bare word.
 nopunc(Text) ->
     Punctuation = ",-.\\'()[]`\";:!?",
     [C || C <- Text, not lists:member(C, Punctuation)].

 %% Applies nocap/1 to every character of a string.
 nocaps(Text) ->
     lists:map(fun nocap/1, Text).

 %% Maps a character in the range $A..$Z to its lower-case counterpart by
 %% adding 32; every other value is returned unchanged.
 nocap(X) when X >= $A, X =< $Z ->
     X + 32;
 nocap(X) ->
     X.

 %% Returns true when X is exactly equal to some element of the list,
 %% false otherwise (including for the empty list).
 member(_, []) ->
     false;
 member(X, [Head | Tail]) ->
     X =:= Head orelse member(X, Tail).



 % Used to read a file into a list of lines.
 % Example files available in:
 %   gettysburg-address.txt (short)
 %   dickens-christmas.txt  (long)
  

 %% Reads the text file Name and returns its lines, in file order, as a list
 %% of strings. Fails with a badmatch if the file cannot be opened.
 %% The open file is handed to get_all_lines/2, which closes it at eof.
 get_file_contents(Name) ->
     {ok, Device} = file:open(Name, [read]),
     %% get_all_lines/2 accumulates lines in reverse order.
     lists:reverse(get_all_lines(Device, [])).

 %% Auxiliary function for get_file_contents/1. Not exported.
 %% Reads File line by line until eof, pushing each line (without its
 %% trailing newline) onto Partial, so the result is in reverse file order.
 %% The file is closed when eof is reached or when a read error occurs; a
 %% read error is raised as error({read_error, Reason}).
 get_all_lines(File, Partial) ->
     case io:get_line(File, "") of
         eof ->
             file:close(File),
             Partial;
         {error, Reason} ->
             file:close(File),
             error({read_error, Reason});
         Line ->
             get_all_lines(File, [strip_newline(Line) | Partial])
     end.

 %% Drops a single trailing newline if there is one. The last line of a file
 %% may not end in a newline; it is then returned unchanged rather than
 %% losing its final character.
 strip_newline(Line) ->
     case lists:reverse(Line) of
         [$\n | Reversed] -> lists:reverse(Reversed);
         _ -> Line
     end.

 %% Prints each string of the list on its own line and returns ok.
 %% Can be used to check the result of get_file_contents/1.
 show_file_contents(Lines) ->
     lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).
	-module(index).
	-export([get_file_contents/1,
	show_file_contents/1,
	index_file/1,
	index_sample_files/0]).
	-include_lib("eunit/include/eunit.hrl").

	%% Indexes the two sample text files named below, one after the other, and
	%% pretty-prints each resulting index with ~p. Returns ok.
	index_sample_files() ->
	    lists:foreach(
	        fun(FileName) -> io:format("~p~n", [index_file(FileName)]) end,
	        ["gettysburg-address.txt", "dickens-christmas.txt"]
	    ).

	%% Stop words: index_words/3 skips any word that appears in this list.
	commonWords() ->
	    string:lexemes("on the that", " ").

	%% Reads the file Name, indexes its words and returns the index entries
	%% ordered by word. Each entry is a {Word, Ranges} tuple as produced by
	%% collate/1.
	index_file(Name) ->
	    Lines = get_file_contents(Name),
	    Words = words(Lines),
	    %% Line numbering starts at 1; the word matrix is passed twice because
	    %% index_words/3 consumes one copy while scanning the other for matches.
	    RawIndex = index_words(1, Words, Words),
	    lists:keysort(1, collate(RawIndex)).

	%% Converts every raw {Word, LineNumbers} entry into {Word, Ranges}.
	%% The ranges come from collate_lines/3 and are reversed before being
	%% stored. LineNumbers must be non-empty.
	%% (The list-cons operators are written as plain `|`; the escaped form
	%% `\|` is not valid Erlang.)
	collate(Entries) ->
	    lists:map(
	        fun({Word, [N | Ns]}) ->
	            {Word, lists:reverse(collate_lines(N, N, Ns))}
	        end,
	        Entries
	    ).

	%% Groups a list of line numbers into {Current, Begin} pairs, one per run.
	%% Begin is the number that opened the current run and Current the most
	%% recently absorbed one. A number exactly one below Current extends the
	%% run; any other number closes the run and starts a new one.
	%% index_word/4 builds its list by prepending increasing line numbers, so
	%% with that input each pair reads {FirstLine, LastLine}.
	%% (The list-cons operators are written as plain `|`; the escaped form
	%% `\|` is not valid Erlang.)
	collate_lines(Begin, Current, []) ->
	    [{Current, Begin}];
	collate_lines(Begin, Current, [Next | Rest]) ->
	    Gap = Current - Next,
	    if
	        Gap =:= 1 ->
	            collate_lines(Begin, Next, Rest);
	        true ->
	            [{Current, Begin} | collate_lines(Next, Next, Rest)]
	    end.

	%% Normalises every line (punctuation removed via nopunc/1, upper case
	%% folded via nocaps/1) and splits it into a list of words.
	%% The result has one word list per input line; an empty line gives [].
	%% Written as a comprehension, which also replaces the escaped `\|` cons
	%% operators that made the previous clauses invalid Erlang.
	words(Lines) ->
	    [split_on_space(nocaps(nopunc(Line)), []) || Line <- Lines].

	%% Builds the raw index: one {Word, LineNumbers} entry per distinct word.
	%%
	%% LNb      - number assigned to the first line of AllLines.
	%% Lines    - the words still to be visited, one list of words per line.
	%% AllLines - the full word matrix used to look up the lines of each word.
	%%
	%% Words listed in commonWords/0 or at most two characters long are skipped.
	%% Once a word has been indexed it is removed from both Lines and AllLines,
	%% so every word is indexed exactly once.
	%% An empty list of lines (for example an empty file) yields an empty index
	%% instead of failing with function_clause.
	%% (The list-cons operators are written as plain `|`; the escaped form
	%% `\|` is not valid Erlang.)
	index_words(_LNb, [], _AllLines) ->
	    [];
	index_words(LNb, [[] | Lines], AllLines) ->
	    index_words(LNb, Lines, AllLines);
	index_words(LNb, [[Word | Line] | Lines], AllLines) ->
	    case member(Word, commonWords()) orelse (length(Word) =< 2) of
	        true ->
	            index_words(LNb, [Line | Lines], AllLines);
	        false ->
	            Entry = index_word(LNb, Word, AllLines, []),
	            Remaining = remove_word_lines(Word, [Line | Lines]),
	            RemainingAll = remove_word_lines(Word, AllLines),
	            [Entry | index_words(LNb, Remaining, RemainingAll)]
	    end.

	%% Collects the numbers of all lines that contain Word.
	%% LNb is the number of the first line in the list and is incremented for
	%% each line visited. Matching numbers are prepended to the accumulator, so
	%% the returned {Word, LineNumbers} lists the highest line number first.
	%% (The list-cons operators are written as plain `|`; the escaped form
	%% `\|` is not valid Erlang.)
	index_word(_LNb, Word, [], Matched) ->
	    {Word, Matched};
	index_word(LNb, Word, [Line | Rest], Matched) ->
	    Updated =
	        case member(Word, Line) of
	            true -> [LNb | Matched];
	            false -> Matched
	        end,
	    index_word(LNb + 1, Word, Rest, Updated).

	%% Removes every occurrence of W from a matrix of words (a list of lines,
	%% each a list of words). The number of lines is unchanged; a line that
	%% only contained W becomes [].
	%% Written as a comprehension, which also replaces the escaped `\|` cons
	%% operators that made the previous clauses invalid Erlang.
	remove_word_lines(W, Lines) ->
	    [remove_word_line(W, Line) || Line <- Lines].

	%% Removes every occurrence of W from a single line (a list of words),
	%% keeping the remaining words in their original order.
	%% Written as a comprehension, which also replaces the escaped `\|` cons
	%% operators that made the previous clauses invalid Erlang.
	remove_word_line(W, Line) ->
	    [Other || Other <- Line, Other =/= W].

	%% EUnit test: every occurrence of the word is dropped from each line while
	%% the line structure is kept, including a line that becomes empty.
	remove_word_lines_test() ->
	    Lines = [["A", "B"], ["C","D"], ["A", "A"]],
	    Expected = [["B"], ["C","D"], []],
	    Expected = remove_word_lines("A", Lines).

	%% Splits a string into words on the space character.
	%% The second argument accumulates the characters of the word being read,
	%% in reverse order; callers start with []. Runs of spaces and leading or
	%% trailing spaces do not produce empty words.
	%% (The list-cons operators are written as plain `|`; the escaped form
	%% `\|` is not valid Erlang.)
	split_on_space([], []) ->
	    [];
	split_on_space([], Acc) ->
	    [lists:reverse(Acc)];
	split_on_space([$\s | Rest], []) ->
	    %% Space with no word in progress: nothing to emit.
	    split_on_space(Rest, []);
	split_on_space([$\s | Rest], Acc) ->
	    %% Space ends the word in progress.
	    [lists:reverse(Acc) | split_on_space(Rest, [])];
	split_on_space([Char | Rest], Acc) ->
	    split_on_space(Rest, [Char | Acc]).


	%% Removes punctuation characters from a string.
	%% Stripped characters: , - . \ ' ( ) [ ] ` " ; : ! ?
	%% The last four are included so that a word followed by one of them
	%% (e.g. "dead;" or "scrooge!") is indexed under the same entry as the
	%% bare word.
	%% Written as a comprehension, which also replaces the escaped `\|` cons
	%% operators that made the previous clauses invalid Erlang.
	nopunc(Text) ->
	    Punctuation = ",-.\\'()[]`\";:!?",
	    [C || C <- Text, not lists:member(C, Punctuation)].

	%% Applies nocap/1 to every character of a string.
	%% Written with lists:map/2, which also replaces the escaped `\|` cons
	%% operators that made the previous clauses invalid Erlang.
	nocaps(Text) ->
	    lists:map(fun nocap/1, Text).

	%% Maps a character in the range $A..$Z to its lower-case counterpart by
	%% adding 32; every other value is returned unchanged.
	nocap(X) when X >= $A, X =< $Z ->
	    X + 32;
	nocap(X) ->
	    X.

	%% Returns true when X is exactly equal to some element of the list,
	%% false otherwise (including for the empty list).
	%% (The list-cons operator is written as plain `|`; the escaped form
	%% `\|` is not valid Erlang.)
	member(_, []) ->
	    false;
	member(X, [Head | Tail]) ->
	    X =:= Head orelse member(X, Tail).



	% Used to read a file into a list of lines.
	% Example files available in:
	% gettysburg-address.txt (short)
	% dickens-christmas.txt (long)


	%% Reads the text file Name and returns its lines, in file order, as a list
	%% of strings. Fails with a badmatch if the file cannot be opened.
	%% The open file is handed to get_all_lines/2, which closes it at eof.
	get_file_contents(Name) ->
	    {ok, Device} = file:open(Name, [read]),
	    %% get_all_lines/2 accumulates lines in reverse order.
	    lists:reverse(get_all_lines(Device, [])).

	%% Auxiliary function for get_file_contents/1. Not exported.
	%% Reads File line by line until eof, pushing each line (without its
	%% trailing newline) onto Partial, so the result is in reverse file order.
	%% The file is closed when eof is reached or when a read error occurs; a
	%% read error is raised as error({read_error, Reason}).
	%% (The list-cons operators are written as plain `|`; the escaped form
	%% `\|` is not valid Erlang.)
	get_all_lines(File, Partial) ->
	    case io:get_line(File, "") of
	        eof ->
	            file:close(File),
	            Partial;
	        {error, Reason} ->
	            file:close(File),
	            error({read_error, Reason});
	        Line ->
	            get_all_lines(File, [strip_newline(Line) | Partial])
	    end.

	%% Drops a single trailing newline if there is one. The last line of a file
	%% may not end in a newline; it is then returned unchanged rather than
	%% losing its final character.
	strip_newline(Line) ->
	    case lists:reverse(Line) of
	        [$\n | Reversed] -> lists:reverse(Reversed);
	        _ -> Line
	    end.

	%% Prints each string of the list on its own line and returns ok.
	%% Can be used to check the result of get_file_contents/1.
	%% Written with lists:foreach/2, which also replaces the escaped `\|` cons
	%% operator that made the previous clause head invalid Erlang.
	show_file_contents(Lines) ->
	    lists:foreach(fun(Line) -> io:format("~s~n", [Line]) end, Lines).