Created
January 18, 2021 21:33
-
-
Save meekg33k/c67c3b6f0959592ba58659c633ec09f9 to your computer and use it in GitHub Desktop.
Algorithm to return most frequently used words
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Sample input: words that must never be reported as "most frequently used".
const wordsToExclude = ['and', 'he', 'the', 'to', 'is', 'Jack', 'Jill'];
// Sample input: the text whose most frequently used words are printed at the end of the script.
const literatureText = 'Jack and Jill went to the market to buy bread and cheese. Cheese is Jack\'s and Jill\'s favorite food';
/**
 * Splits a piece of text into tokens on runs of non-word characters
 * (anything other than letters, digits and underscore).
 *
 * Note that an apostrophe is a separator, so "Jack's" yields "Jack" and "s".
 *
 * @param {string} word - The text to tokenize.
 * @returns {string[]} The non-empty tokens in order of appearance; an empty
 *   array when the input is empty, null or undefined.
 */
const getTokenizedWords = (word) => {
  if (!word) return [];
  // split() produces '' at the edges when the text starts or ends with a
  // separator (e.g. a trailing full stop); those are not words, so drop them.
  return word.split(/\W+/).filter((token) => token !== '');
};
/**
 * Lower-cases the given tokens and removes every token that appears in the
 * exclusion list. The comparison is case-insensitive.
 *
 * @param {string[]} words - Tokens to filter.
 * @param {string[]} excludedWords - Words to leave out; may be empty, null or
 *   undefined, in which case nothing is excluded.
 * @returns {string[]} The remaining tokens, lower-cased, in their original
 *   order; an empty array when there are no tokens.
 */
const getValidTokenizedWords = (words, excludedWords) => {
  if (!words || words.length === 0) return [];

  // Normalise the exclusion list once so each lookup is a constant-time,
  // case-insensitive Set membership test.
  const excluded = new Set((excludedWords || []).map((excludedWord) => excludedWord.toLowerCase()));

  const validWords = [];
  for (const word of words) {
    // Always lower-case, even when nothing is excluded, so callers get the
    // same normalised output on every path.
    const normalizedWord = word.toLowerCase();
    if (!excluded.has(normalizedWord)) {
      validWords.push(normalizedWord);
    }
  }
  return validWords;
};
/**
 * Finds the word or words that occur most often in a list of tokens.
 *
 * @param {string[]} words - Tokens to count (compared exactly as given).
 * @returns {string[]} Every word tied for the highest number of occurrences,
 *   in the order in which each one reached that count; an empty array when
 *   there are no tokens.
 */
const getMostFrequentlyUsedWords = (words) => {
  if (!words) return [];

  // A Map rather than a plain object: tokens such as "constructor" or
  // "toString" would otherwise read inherited Object.prototype members and
  // corrupt the count.
  const wordCounts = new Map();
  let highestCount = 0;
  let mostFrequentWords = [];

  for (const word of words) {
    const count = (wordCounts.get(word) || 0) + 1;
    wordCounts.set(word, count);

    if (count > highestCount) {
      // This word has just set a new record, so it is the only leader.
      highestCount = count;
      mostFrequentWords = [word];
    } else if (count === highestCount) {
      // This word has just tied the current record.
      mostFrequentWords.push(word);
    }
  }

  return mostFrequentWords;
};
/**
 * Returns the most frequently used words in a text, ignoring excluded words.
 *
 * Pipeline: tokenize the text, drop the excluded words, then pick the
 * word(s) with the highest number of occurrences.
 *
 * @param {string} literatureText - The text to analyse.
 * @param {string[]} wordsToExclude - Words that must not be reported.
 * @returns {string[]} The most frequent remaining word(s); an empty array
 *   when the text is empty or no words remain after exclusion.
 */
const returnMostFrequentlyUsedWords = (literatureText, wordsToExclude) => {
  if (!literatureText) {
    return [];
  }

  const tokenizedWords = getTokenizedWords(literatureText);
  const validTokenizedWords = getValidTokenizedWords(tokenizedWords, wordsToExclude);

  // Guarantee an array even if nothing survived the exclusion step.
  return getMostFrequentlyUsedWords(validTokenizedWords) || [];
};
// Demo: print the most frequently used words of the sample text.
console.log(returnMostFrequentlyUsedWords(literatureText, wordsToExclude));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment