Created
February 4, 2022 15:01
-
-
Save eggsyntax/3bfa928b80382169815ed2bdddc029c6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Namespace for a letter-frequency exploration over five-letter English words.
;; clojure.string is aliased as `s` and used throughout the REPL session below.
(ns common100k
  "Version 2, using 100k words instead of 10k"
  (:require [clojure.string :as s]))
;; REPL scratch session: nothing in this form is evaluated when the namespace
;; loads. Evaluate the forms one at a time; the `;; =>` notes record the results
;; observed when the session was originally run.
(comment

  ;; NOTE(review): the URL below is the 10k word list; this version reads
  ;; ./wiki-100k.txt instead, so the attribution is probably stale -- confirm
  ;; where wiki-100k.txt actually came from.
  ;; words from https://github.com/first20hours/google-10000-english/blob/master/google-10000-english-usa.txt

  ;; One word per line; lines starting with "#" are treated as comments and dropped.
  (def words (->> (slurp "./wiki-100k.txt")
                  (s/split-lines)
                  ;; remove comments
                  (remove #(s/starts-with? % "#"))))

  (count words) ;; => 98913

  (take 10 words) ;; => ("the" "of" "and" "to" "a" "in" "that" "I" "was" "he")

  ;; Keep only the entries that are exactly five characters long.
  (def words-5 (filter #(= 5 (count %)) words))

  (count words-5) ;; => 13736

  (take 5 words-5) ;; => ("which" "their" "would" "there" "could")

  ;; The 26 letters a-z as one-character strings, so they compare equal to the
  ;; strings produced by s/lower-case below.
  (def lowercase-letters (set (map str (seq "abcdefghijklmnopqrstuvwxyz"))))

  ;; Concatenate every five-character word, lower-case each character
  ;; individually (yielding one-character strings), and keep only a-z.
  (def letters (->> (apply str words-5)
                    (map s/lower-case)
                    ;; ditch apostrophes & other symbols
                    (filter #(contains? lowercase-letters %))))

  (count letters) ;; => 67437

  ;; Map of letter -> number of occurrences across all five-character words.
  (def letter-freqs (frequencies letters)) ;; => #'common100k/letter-freqs

  ;; Letters ordered from most to least frequent, joined with spaces.
  (->> (sort-by val > letter-freqs)
       (map first)
       (s/join " ")) ;; => "e a s r o t i n l d u c h m g p b y f k w v j z x q"

  )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment