Last active
March 30, 2016 17:59
-
-
Save pichi/2d10c93242d5057913d026a607f07dd4 to your computer and use it in GitHub Desktop.
Stopwords Benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ erl -pa eministat/ebin
Erlang/OTP 18 [erts-7.3] [source] [64-bit] [smp:4:4] [async-threads:10] [hipe] [kernel-poll:false]
Eshell V7.3 (abort with ^G)
1> {ok, Bin} = file:read_file("/home/hynek/Downloads/words.txt"), L = string:tokens(binary_to_list(Bin), "\s\r\n"), length(L).
113809
2> length(lists:filter(fun stopwords_clause:is_stopword/1, L)).
122
3> length(lists:filter(fun stopwords_map:is_stopword/1, L)).
122
4> Clause = eministat:s("clause", fun() -> lists:filter(fun stopwords_clause:is_stopword/1, L) end, 50).
{dataset,"clause",
[3490,3493,3498,3501,3504,3507,3513,3539,3541,3544,3548,
3549,3551,3552,3554,3557,3559,3560,3562,3564,3570,3571,3581,
3589,3591,3611|...],
181508.0,6.595332e8,50}
5> Map = eministat:s("map", fun() -> lists:filter(fun stopwords_map:is_stopword/1, L) end, 50).
{dataset,"map",
[10950,10965,10971,10978,10982,10983,10988,10993,10998,
11002,11012,11013,11016,11017,11017,11019,11021,11025,11026,
11026,11028,11030,11035,11038,11040,11045|...],
555276.0,6170067514.0,50}
6> eministat:x(95.0, Clause, Map).
x clause | |
+ map | |
+--------------------------------------------------------------------------+ | |
|xxxxx +++++ +| | |
|xxxx ++++ | | |
|xxxx +++ | | |
|xxxx ++ | | |
|xxx ++ | | |
|xxx ++ | | |
|xx ++ | | |
|xx ++ | | |
|xx ++ | | |
|xx + | | |
|xx + | | |
|xx + | | |
|xx + | | |
|xx + | | |
| x + | | |
| x + | | |
| x + | | |
| x + | | |
| x + | | |
| x + | | |
| x + | | |
| x + | | |
| x + | | |
| x + | | |
| x + | | |
| + | | |
| + | | |
| + | | |
| + | | |
| + | | |
| + | | |
| + | | |
| + | | |
| + | | |
||A| | | |
| |_MA_| | | |
+--------------------------------------------------------------------------+ | |
Dataset: x N=50 CI=95.0000
Statistic Value [ Bias] (Bootstrapped LB‥UB)
Min: 3490.00
1st Qu. 3551.00
Median: 3591.00
3rd Qu. 3679.00
Max: 3945.00
Average: 3630.16 [ 0.137534] ( 3602.82 ‥ 3664.56)
Std. Dev: 113.400 [ -1.81311] ( 90.8425 ‥ 141.539)
Outliers: 0/4 = 4 (μ=3630.30, σ=111.587)
Outlier variance: 0.151802 (moderate)
------
Dataset: + N=50 CI=95.0000
Statistic Value [ Bias] (Bootstrapped LB‥UB)
Min: 1.09500e+4
1st Qu. 1.10160e+4
Median: 1.10400e+4
3rd Qu. 1.11270e+4
Max: 1.28270e+4
Average: 1.11055e+4 [ 0.297998] ( 1.10611e+4 ‥ 1.12491e+4)
Std. Dev: 264.914 [ -31.0673] ( 84.7956 ‥ 582.629)
Outliers: 0/2 = 2 (μ=1.11058e+4, σ=233.847)
Outlier variance: 9.45082e-2 (slight)
Difference at 95.0% confidence
7475.36 ± 80.8533
205.924% ± 2.22726%
(Student's t, pooled s = 203.763)
------
ok
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(stopwords_clause).
%% Stop-word membership test implemented as one function clause per word.
%%
%% Each stop word is a string literal (list of character codes) in the head
%% of its own clause; a final catch-all clause answers `false'.

-export([is_stopword/1]).

%% Returns `true' when the argument is exactly one of the lowercase stop-word
%% strings listed below, and `false' for every other term. Matching is by
%% pattern match against string literals, so it is case-sensitive and a
%% binary such as <<"the">> does not match.
-spec is_stopword(term()) -> boolean().
is_stopword("a") -> true;
is_stopword("about") -> true;
is_stopword("above") -> true;
is_stopword("after") -> true;
is_stopword("again") -> true;
is_stopword("against") -> true;
is_stopword("all") -> true;
is_stopword("am") -> true;
is_stopword("an") -> true;
is_stopword("and") -> true;
is_stopword("any") -> true;
is_stopword("are") -> true;
is_stopword("aren't") -> true;
is_stopword("as") -> true;
is_stopword("at") -> true;
is_stopword("be") -> true;
is_stopword("because") -> true;
is_stopword("been") -> true;
is_stopword("before") -> true;
is_stopword("being") -> true;
is_stopword("below") -> true;
is_stopword("between") -> true;
is_stopword("both") -> true;
is_stopword("but") -> true;
is_stopword("by") -> true;
is_stopword("can't") -> true;
is_stopword("cannot") -> true;
is_stopword("could") -> true;
is_stopword("couldn't") -> true;
is_stopword("did") -> true;
is_stopword("didn't") -> true;
is_stopword("do") -> true;
is_stopword("does") -> true;
is_stopword("doesn't") -> true;
is_stopword("doing") -> true;
is_stopword("don't") -> true;
is_stopword("down") -> true;
is_stopword("during") -> true;
is_stopword("each") -> true;
is_stopword("few") -> true;
is_stopword("for") -> true;
is_stopword("from") -> true;
is_stopword("further") -> true;
is_stopword("had") -> true;
is_stopword("hadn't") -> true;
is_stopword("has") -> true;
is_stopword("hasn't") -> true;
is_stopword("have") -> true;
is_stopword("haven't") -> true;
is_stopword("having") -> true;
is_stopword("he") -> true;
is_stopword("he'd") -> true;
is_stopword("he'll") -> true;
is_stopword("he's") -> true;
is_stopword("her") -> true;
is_stopword("here") -> true;
is_stopword("here's") -> true;
is_stopword("hers") -> true;
is_stopword("herself") -> true;
is_stopword("him") -> true;
is_stopword("himself") -> true;
is_stopword("his") -> true;
is_stopword("how") -> true;
is_stopword("how's") -> true;
is_stopword("i") -> true;
is_stopword("i'd") -> true;
is_stopword("i'll") -> true;
is_stopword("i'm") -> true;
is_stopword("i've") -> true;
is_stopword("if") -> true;
is_stopword("in") -> true;
is_stopword("into") -> true;
is_stopword("is") -> true;
is_stopword("isn't") -> true;
is_stopword("it") -> true;
is_stopword("it's") -> true;
is_stopword("its") -> true;
is_stopword("itself") -> true;
is_stopword("let's") -> true;
is_stopword("me") -> true;
is_stopword("more") -> true;
is_stopword("most") -> true;
is_stopword("mustn't") -> true;
is_stopword("my") -> true;
is_stopword("myself") -> true;
is_stopword("no") -> true;
is_stopword("nor") -> true;
is_stopword("not") -> true;
is_stopword("of") -> true;
is_stopword("off") -> true;
is_stopword("on") -> true;
is_stopword("once") -> true;
is_stopword("only") -> true;
is_stopword("or") -> true;
is_stopword("other") -> true;
is_stopword("ought") -> true;
is_stopword("our") -> true;
is_stopword("ours") -> true;
is_stopword("ourselves") -> true;
is_stopword("out") -> true;
is_stopword("over") -> true;
is_stopword("own") -> true;
is_stopword("same") -> true;
is_stopword("shan't") -> true;
is_stopword("she") -> true;
is_stopword("she'd") -> true;
is_stopword("she'll") -> true;
is_stopword("she's") -> true;
is_stopword("should") -> true;
is_stopword("shouldn't") -> true;
is_stopword("so") -> true;
is_stopword("some") -> true;
is_stopword("such") -> true;
is_stopword("than") -> true;
is_stopword("that") -> true;
is_stopword("that's") -> true;
is_stopword("the") -> true;
is_stopword("their") -> true;
is_stopword("theirs") -> true;
is_stopword("them") -> true;
is_stopword("themselves") -> true;
is_stopword("then") -> true;
is_stopword("there") -> true;
is_stopword("there's") -> true;
is_stopword("these") -> true;
is_stopword("they") -> true;
is_stopword("they'd") -> true;
is_stopword("they'll") -> true;
is_stopword("they're") -> true;
is_stopword("they've") -> true;
is_stopword("this") -> true;
is_stopword("those") -> true;
is_stopword("through") -> true;
is_stopword("to") -> true;
is_stopword("too") -> true;
is_stopword("under") -> true;
is_stopword("until") -> true;
is_stopword("up") -> true;
is_stopword("very") -> true;
is_stopword("was") -> true;
is_stopword("wasn't") -> true;
is_stopword("we") -> true;
is_stopword("we'd") -> true;
is_stopword("we'll") -> true;
is_stopword("we're") -> true;
is_stopword("we've") -> true;
is_stopword("were") -> true;
is_stopword("weren't") -> true;
is_stopword("what") -> true;
is_stopword("what's") -> true;
is_stopword("when") -> true;
is_stopword("when's") -> true;
is_stopword("where") -> true;
is_stopword("where's") -> true;
is_stopword("which") -> true;
is_stopword("while") -> true;
is_stopword("who") -> true;
is_stopword("who's") -> true;
is_stopword("whom") -> true;
is_stopword("why") -> true;
is_stopword("why's") -> true;
is_stopword("with") -> true;
is_stopword("won't") -> true;
is_stopword("would") -> true;
is_stopword("wouldn't") -> true;
is_stopword("you") -> true;
is_stopword("you'd") -> true;
is_stopword("you'll") -> true;
is_stopword("you're") -> true;
is_stopword("you've") -> true;
is_stopword("your") -> true;
is_stopword("yours") -> true;
is_stopword("yourself") -> true;
is_stopword("yourselves") -> true;
%% Anything not listed above (including non-string terms) is not a stop word.
is_stopword(_) -> false.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-module(stopwords_map).
%% Stop-word membership test implemented as a lookup in a literal map.
%%
%% The map's keys are the stop words as strings (lists of character codes)
%% and every value is `true'; absent keys fall back to the `false' default
%% passed to maps:get/3.

-export([is_stopword/1]).

%% Returns `true' when `S' is exactly one of the lowercase stop-word strings
%% used as keys below, and `false' for every other term. Keys are compared
%% as whole terms, so the check is case-sensitive and a binary such as
%% <<"the">> is not found.
-spec is_stopword(term()) -> boolean().
is_stopword(S) ->
    maps:get(
      S,
      #{
        "a" => true,
        "about" => true,
        "above" => true,
        "after" => true,
        "again" => true,
        "against" => true,
        "all" => true,
        "am" => true,
        "an" => true,
        "and" => true,
        "any" => true,
        "are" => true,
        "aren't" => true,
        "as" => true,
        "at" => true,
        "be" => true,
        "because" => true,
        "been" => true,
        "before" => true,
        "being" => true,
        "below" => true,
        "between" => true,
        "both" => true,
        "but" => true,
        "by" => true,
        "can't" => true,
        "cannot" => true,
        "could" => true,
        "couldn't" => true,
        "did" => true,
        "didn't" => true,
        "do" => true,
        "does" => true,
        "doesn't" => true,
        "doing" => true,
        "don't" => true,
        "down" => true,
        "during" => true,
        "each" => true,
        "few" => true,
        "for" => true,
        "from" => true,
        "further" => true,
        "had" => true,
        "hadn't" => true,
        "has" => true,
        "hasn't" => true,
        "have" => true,
        "haven't" => true,
        "having" => true,
        "he" => true,
        "he'd" => true,
        "he'll" => true,
        "he's" => true,
        "her" => true,
        "here" => true,
        "here's" => true,
        "hers" => true,
        "herself" => true,
        "him" => true,
        "himself" => true,
        "his" => true,
        "how" => true,
        "how's" => true,
        "i" => true,
        "i'd" => true,
        "i'll" => true,
        "i'm" => true,
        "i've" => true,
        "if" => true,
        "in" => true,
        "into" => true,
        "is" => true,
        "isn't" => true,
        "it" => true,
        "it's" => true,
        "its" => true,
        "itself" => true,
        "let's" => true,
        "me" => true,
        "more" => true,
        "most" => true,
        "mustn't" => true,
        "my" => true,
        "myself" => true,
        "no" => true,
        "nor" => true,
        "not" => true,
        "of" => true,
        "off" => true,
        "on" => true,
        "once" => true,
        "only" => true,
        "or" => true,
        "other" => true,
        "ought" => true,
        "our" => true,
        "ours" => true,
        "ourselves" => true,
        "out" => true,
        "over" => true,
        "own" => true,
        "same" => true,
        "shan't" => true,
        "she" => true,
        "she'd" => true,
        "she'll" => true,
        "she's" => true,
        "should" => true,
        "shouldn't" => true,
        "so" => true,
        "some" => true,
        "such" => true,
        "than" => true,
        "that" => true,
        "that's" => true,
        "the" => true,
        "their" => true,
        "theirs" => true,
        "them" => true,
        "themselves" => true,
        "then" => true,
        "there" => true,
        "there's" => true,
        "these" => true,
        "they" => true,
        "they'd" => true,
        "they'll" => true,
        "they're" => true,
        "they've" => true,
        "this" => true,
        "those" => true,
        "through" => true,
        "to" => true,
        "too" => true,
        "under" => true,
        "until" => true,
        "up" => true,
        "very" => true,
        "was" => true,
        "wasn't" => true,
        "we" => true,
        "we'd" => true,
        "we'll" => true,
        "we're" => true,
        "we've" => true,
        "were" => true,
        "weren't" => true,
        "what" => true,
        "what's" => true,
        "when" => true,
        "when's" => true,
        "where" => true,
        "where's" => true,
        "which" => true,
        "while" => true,
        "who" => true,
        "who's" => true,
        "whom" => true,
        "why" => true,
        "why's" => true,
        "with" => true,
        "won't" => true,
        "would" => true,
        "wouldn't" => true,
        "you" => true,
        "you'd" => true,
        "you'll" => true,
        "you're" => true,
        "you've" => true,
        "your" => true,
        "yours" => true,
        "yourself" => true,
        "yourselves" => true
       },
      %% Default returned by maps:get/3 when S is not a key of the map.
      false).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment