Last active
August 7, 2022 18:01
-
-
Save amakukha/8d1ac179c42c13bff3d007b56a48dcd4 to your computer and use it in GitHub Desktop.
Find k most frequent words in a file (fast implementation)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Solution of the Jon Bentley's k most frequent words problem using a prefix tree and | |
* a heap of size k. Worst case time complexity is O(N log k), where N is the total | |
* number of words. | |
* | |
* The problem is formulated in the Communications of the ACM 29,5 (May 1986), 364-369: | |
* "Given a text file and an integer k, you are to print the k | |
* most common words in the file (and the number of their | |
* occurrences) in decreasing frequency." | |
* | |
* Donald Knuth famously solved the problem by using hash tries in a an 8-pages-long | |
* WEB program. Doug McIlroy, in response, solved it in six lines of shell script | |
* reusing standard Unix utilities (even though his solution is suboptimal). | |
*/ | |
// Original: https://www.geeksforgeeks.org/find-the-k-most-frequent-words-from-a-file/ | |
// Licence: Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) | |
// Modifications by Andriy Makukha: | |
// - increased max. word size | |
// - scanning input for alphabetic words only, ignoring the rest | |
// - taking arguments from command line | |
// - sorting output | |
#include <stdio.h> | |
#include <string> | |
#include <algorithm> | |
# define MAX_CHARS 26 | |
# define BUFFER_SIZE 10000 // both max. word size and max. distance between words (+1) | |
// A Trie node | |
struct TrieNode | |
{ | |
bool isEnd; // indicates end of word | |
unsigned frequency; // the number of occurrences of a word | |
int indexMinHeap; // the index of the word in minHeap | |
TrieNode* child[MAX_CHARS]; // represents 26 slots each for 'a' to 'z'. | |
}; | |
// A Min Heap node | |
struct MinHeapNode | |
{ | |
TrieNode* root; // indicates the leaf node of TRIE | |
unsigned frequency; // number of occurrences | |
char* word; // the actual word stored | |
}; | |
// A Min Heap | |
struct MinHeap | |
{ | |
unsigned capacity; // the total size a min heap | |
int count; // indicates the number of slots filled. | |
MinHeapNode* array; // represents the collection of minHeapNodes | |
}; | |
bool compareMinHeapNodes(MinHeapNode n1, MinHeapNode n2) { | |
return n1.frequency > n2.frequency; | |
}; | |
// A routine to create a new Trie node | |
TrieNode* newTrieNode() | |
{ | |
// Allocate memory for Trie Node | |
TrieNode* trieNode = new TrieNode; | |
// Initialize values for new node | |
trieNode->isEnd = 0; | |
trieNode->frequency = 0; | |
trieNode->indexMinHeap = -1; | |
for( int i = 0; i < MAX_CHARS; ++i ) | |
trieNode->child[i] = NULL; | |
return trieNode; | |
} | |
// A routine to create a Min Heap of given capacity | |
MinHeap* createMinHeap( int capacity ) | |
{ | |
MinHeap* minHeap = new MinHeap; | |
minHeap->capacity = capacity; | |
minHeap->count = 0; | |
// Allocate memory for array of min heap nodes | |
minHeap->array = new MinHeapNode [ minHeap->capacity ]; | |
return minHeap; | |
} | |
// A routine to swap two min heap nodes. This function | |
// is needed in minHeapify | |
void swapMinHeapNodes ( MinHeapNode* a, MinHeapNode* b ) | |
{ | |
MinHeapNode temp = *a; | |
*a = *b; | |
*b = temp; | |
} | |
// This is the standard minHeapify function. It does one thing extra. | |
// It updates the minHeapIndex in Trie when two nodes are swapped in | |
// in min heap | |
void minHeapify( MinHeap* minHeap, int idx ) | |
{ | |
int left, right, smallest; | |
left = 2 * idx + 1; | |
right = 2 * idx + 2; | |
smallest = idx; | |
if ( left < minHeap->count && | |
minHeap->array[ left ]. frequency < | |
minHeap->array[ smallest ]. frequency | |
) | |
smallest = left; | |
if ( right < minHeap->count && | |
minHeap->array[ right ]. frequency < | |
minHeap->array[ smallest ]. frequency | |
) | |
smallest = right; | |
if( smallest != idx ) | |
{ | |
// Update the corresponding index in Trie node. | |
minHeap->array[ smallest ]. root->indexMinHeap = idx; | |
minHeap->array[ idx ]. root->indexMinHeap = smallest; | |
// Swap nodes in min heap | |
swapMinHeapNodes (&minHeap->array[ smallest ], &minHeap->array[ idx ]); | |
minHeapify( minHeap, smallest ); | |
} | |
} | |
// A standard function to build a heap | |
void buildMinHeap( MinHeap* minHeap ) | |
{ | |
int n, i; | |
n = minHeap->count - 1; | |
for( i = ( n - 1 ) / 2; i >= 0; --i ) | |
minHeapify( minHeap, i ); | |
} | |
// Inserts a word to heap, the function handles the 3 cases explained above | |
void insertInMinHeap( MinHeap* minHeap, TrieNode** root, const char* word ) | |
{ | |
// Case 1: the word is already present in minHeap | |
if( (*root)->indexMinHeap != -1 ) | |
{ | |
++( minHeap->array[ (*root)->indexMinHeap ]. frequency ); | |
// percolate down | |
minHeapify( minHeap, (*root)->indexMinHeap ); | |
} | |
// Case 2: Word is not present and heap is not full | |
else if( minHeap->count < minHeap->capacity ) | |
{ | |
int count = minHeap->count; | |
minHeap->array[ count ]. frequency = (*root)->frequency; | |
minHeap->array[ count ]. word = strdup( word ); | |
minHeap->array[ count ]. root = *root; | |
(*root)->indexMinHeap = minHeap->count; | |
++( minHeap->count ); | |
buildMinHeap( minHeap ); | |
} | |
// Case 3: Word is not present and heap is full. And frequency of word | |
// is more than root. The root is the least frequent word in heap, | |
// replace root with new word | |
else if ( (*root)->frequency > minHeap->array[0]. frequency ) | |
{ | |
minHeap->array[ 0 ]. root->indexMinHeap = -1; | |
minHeap->array[ 0 ]. root = *root; | |
minHeap->array[ 0 ]. root->indexMinHeap = 0; | |
minHeap->array[ 0 ]. frequency = (*root)->frequency; | |
// delete previously allocated memory and | |
free( minHeap->array[ 0 ]. word ); | |
minHeap->array[ 0 ]. word = strdup( word ); | |
minHeapify ( minHeap, 0 ); | |
} | |
} | |
// Inserts a new word to both Trie and Heap | |
void insertUtil ( TrieNode** root, MinHeap* minHeap, | |
const char* word, const char* dupWord ) | |
{ | |
// Base Case | |
if ( *root == NULL ) | |
*root = newTrieNode(); | |
// There are still more characters in word | |
if ( *word != '\0' ) | |
insertUtil ( &((*root)->child[ tolower( *word ) - 97 ]), | |
minHeap, word + 1, dupWord ); | |
else // The complete word is processed | |
{ | |
// word is already present, increase the frequency | |
if ( (*root)->isEnd ) | |
++( (*root)->frequency ); | |
else | |
{ | |
(*root)->isEnd = 1; | |
(*root)->frequency = 1; | |
} | |
// Insert in min heap also | |
insertInMinHeap( minHeap, root, dupWord ); | |
} | |
} | |
// add a word to Trie & min heap. A wrapper over the insertUtil | |
void insertTrieAndHeap(const char *word, TrieNode** root, MinHeap* minHeap) | |
{ | |
insertUtil( root, minHeap, word, word ); | |
} | |
// A routine to show results, The min heap | |
// contains k most frequent words so far, at any time | |
void displayMinHeap( MinHeap* minHeap ) | |
{ | |
// Sort in decreasing order of frequency | |
std::sort(minHeap->array, minHeap->array + minHeap->count, compareMinHeapNodes); | |
// print top K word with frequency | |
for(int i = 0; i < minHeap->count; ++i ) { | |
// convert the stored word to lowercase for consistency | |
for(int j = 0; minHeap->array[i].word[j]; j++) | |
minHeap->array[i].word[j] = tolower( minHeap->array[i].word[j] ); | |
// display word and frequency | |
printf("%d %s\n", | |
minHeap->array[i].frequency, | |
minHeap->array[i].word | |
); | |
} | |
} | |
// The main function that takes a file as input, add words to heap | |
// and Trie, finally shows result from heap | |
void printKMostFreq( FILE* fp, int k ) | |
{ | |
// Create a Min Heap of Size k | |
MinHeap* minHeap = createMinHeap( k ); | |
// Create an empty Trie | |
TrieNode* root = NULL; | |
// A buffer to store one word at a time (as well as junk inbetween words) | |
char buffer[BUFFER_SIZE]; | |
// Prepare format strings for input scanning | |
char format[20]; | |
sprintf(format, "%%%d[a-zA-Z]", (int)sizeof(buffer)-1); | |
char junkformat[20]; | |
sprintf(junkformat, "%%%d[^a-zA-Z]", (int)sizeof(buffer)-1); | |
// Read words one by one from file. Insert the word in Trie and Min Heap | |
fscanf( fp, junkformat, buffer ); | |
while( fscanf( fp, format, buffer ) != EOF ) { | |
insertTrieAndHeap(buffer, &root, minHeap); | |
fscanf( fp, junkformat, buffer ); | |
} | |
// The Min Heap will have the k most frequent words, so print Min Heap nodes | |
displayMinHeap( minHeap ); | |
} | |
int main(int argc, char* argv[]) | |
{ | |
if (argc < 3) { | |
printf("Usage: %s k filename\n", argv[0]); | |
printf(" k - number of most frequent words to display\n"); | |
printf(" fn - text file name\n"); | |
return 1; | |
} | |
int k = std::stoi(argv[1]); | |
FILE *fp = fopen (argv[2], "r"); | |
if (fp == NULL) | |
printf ("file \"%s\" doesn't exist\n", argv[2]); | |
else | |
printKMostFreq (fp, k); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment