amakukha · August 7, 2022 18:01
diff --git a/mostfreq.cpp b/mostfreq.cpp
 /* 
 * Solution of the Jon Bentley's k most frequent words problem using a prefix tree and
 * a heap of size k. Worst case time complexity is O(N log k), where N is the total 
 * number of words.
 *
 * The problem is formulated in the Communications of the ACM 29,5 (May 1986), 364-369:
 *     "Given a text file and an integer k, you are to print the k
 *     most common words in the file (and the number of their
 *     occurrences) in decreasing frequency."
 * 
 * Donald Knuth famously solved the problem by using hash tries in a an 8-pages-long
 * WEB program. Doug McIlroy, in response, solved it in six lines of shell script 
 * reusing standard Unix utilities (even though his solution is suboptimal).
 */

 // Original: https://www.geeksforgeeks.org/find-the-k-most-frequent-words-from-a-file/
 // Licence: Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
 // Modifications by Andriy Makukha:
 // - increased max. word size
 // - scanning input for alphabetic words only, ignoring the rest
 // - taking arguments from command line
 // - sorting output

 #include <stdio.h>
 #include <string> 
 #include <algorithm>

 # define MAX_CHARS 26 
 # define BUFFER_SIZE 10000   // both max. word size and max. distance between words (+1)

 // A Trie node 
 struct TrieNode 
 { 
        bool isEnd; // indicates end of word 
        unsigned frequency; // the number of occurrences of a word 
        int indexMinHeap; // the index of the word in minHeap 
        TrieNode* child[MAX_CHARS]; // represents 26 slots each for 'a' to 'z'. 
 }; 

 // A Min Heap node 
 struct MinHeapNode 
 { 
        TrieNode* root; // indicates the leaf node of TRIE 
        unsigned frequency; // number of occurrences 
        char* word; // the actual word stored 
 }; 

 // A Min Heap 
 struct MinHeap 
 { 
        unsigned capacity; // the total size a min heap 
        int count; // indicates the number of slots filled. 
        MinHeapNode* array; // represents the collection of minHeapNodes 
 }; 

 bool compareMinHeapNodes(MinHeapNode n1, MinHeapNode n2) {
        return n1.frequency > n2.frequency;
 };

 // A routine to create a new Trie node 
 TrieNode* newTrieNode() 
 { 
        // Allocate memory for Trie Node 
        TrieNode* trieNode = new TrieNode; 

        // Initialize values for new node 
        trieNode->isEnd = 0; 
        trieNode->frequency = 0; 
        trieNode->indexMinHeap = -1; 
        for( int i = 0; i < MAX_CHARS; ++i ) 
                trieNode->child[i] = NULL; 

        return trieNode; 
 } 

 // A routine to create a Min Heap of given capacity 
 MinHeap* createMinHeap( int capacity ) 
 { 
        MinHeap* minHeap = new MinHeap; 

        minHeap->capacity = capacity; 
        minHeap->count = 0; 

        // Allocate memory for array of min heap nodes 
        minHeap->array = new MinHeapNode [ minHeap->capacity ]; 

        return minHeap; 
 } 

 // A routine to swap two min heap nodes. This function 
 // is needed in minHeapify 
 void swapMinHeapNodes ( MinHeapNode* a, MinHeapNode* b ) 
 { 
        MinHeapNode temp = *a; 
        *a = *b; 
        *b = temp; 
 } 

 // This is the standard minHeapify function. It does one thing extra. 
 // It updates the minHeapIndex in Trie when two nodes are swapped in 
 // in min heap 
 void minHeapify( MinHeap* minHeap, int idx ) 
 { 
        int left, right, smallest; 

        left = 2 * idx + 1; 
        right = 2 * idx + 2; 
        smallest = idx; 
        if ( left < minHeap->count && 
                minHeap->array[ left ]. frequency < 
                minHeap->array[ smallest ]. frequency 
        ) 
                smallest = left; 

        if ( right < minHeap->count && 
                minHeap->array[ right ]. frequency < 
                minHeap->array[ smallest ]. frequency 
        ) 
                smallest = right; 

        if( smallest != idx ) 
        { 
                // Update the corresponding index in Trie node. 
                minHeap->array[ smallest ]. root->indexMinHeap = idx; 
                minHeap->array[ idx ]. root->indexMinHeap = smallest; 

                // Swap nodes in min heap 
                swapMinHeapNodes (&minHeap->array[ smallest ], &minHeap->array[ idx ]); 

                minHeapify( minHeap, smallest ); 
        } 
 } 

 // A standard function to build a heap 
 void buildMinHeap( MinHeap* minHeap ) 
 { 
        int n, i; 
        n = minHeap->count - 1; 

        for( i = ( n - 1 ) / 2; i >= 0; --i ) 
                minHeapify( minHeap, i ); 
 } 

 // Inserts a word to heap, the function handles the 3 cases explained above 
 void insertInMinHeap( MinHeap* minHeap, TrieNode** root, const char* word ) 
 { 
        // Case 1: the word is already present in minHeap 
        if( (*root)->indexMinHeap != -1 ) 
        { 
                ++( minHeap->array[ (*root)->indexMinHeap ]. frequency ); 

                // percolate down 
                minHeapify( minHeap, (*root)->indexMinHeap ); 
        } 

        // Case 2: Word is not present and heap is not full 
        else if( minHeap->count < minHeap->capacity ) 
        { 
                int count = minHeap->count; 
                minHeap->array[ count ]. frequency = (*root)->frequency; 
                minHeap->array[ count ]. word = strdup( word );
                minHeap->array[ count ]. root = *root; 
                (*root)->indexMinHeap = minHeap->count; 

                ++( minHeap->count ); 
                buildMinHeap( minHeap ); 
        } 

        // Case 3: Word is not present and heap is full. And frequency of word 
        // is more than root. The root is the least frequent word in heap, 
        // replace root with new word 
        else if ( (*root)->frequency > minHeap->array[0]. frequency ) 
        { 

                minHeap->array[ 0 ]. root->indexMinHeap = -1; 
                minHeap->array[ 0 ]. root = *root; 
                minHeap->array[ 0 ]. root->indexMinHeap = 0; 
                minHeap->array[ 0 ]. frequency = (*root)->frequency; 

                // delete previously allocated memory and 
                free( minHeap->array[ 0 ]. word ); 
                minHeap->array[ 0 ]. word = strdup( word );

                minHeapify ( minHeap, 0 ); 
        } 
 } 

 // Inserts a new word to both Trie and Heap 
 void insertUtil ( TrieNode** root, MinHeap* minHeap, 
                                                const char* word, const char* dupWord ) 
 { 
        // Base Case 
        if ( *root == NULL ) 
                *root = newTrieNode(); 

        // There are still more characters in word 
        if ( *word != '\0' ) 
                insertUtil ( &((*root)->child[ tolower( *word ) - 97 ]), 
                                                minHeap, word + 1, dupWord ); 
        else // The complete word is processed 
        { 
                // word is already present, increase the frequency 
                if ( (*root)->isEnd ) 
                        ++( (*root)->frequency ); 
                else
                { 
                        (*root)->isEnd = 1; 
                        (*root)->frequency = 1; 
                } 

                // Insert in min heap also 
                insertInMinHeap( minHeap, root, dupWord ); 
        } 
 } 


 // add a word to Trie & min heap. A wrapper over the insertUtil 
 void insertTrieAndHeap(const char *word, TrieNode** root, MinHeap* minHeap) 
 { 
        insertUtil( root, minHeap, word, word ); 
 } 

 // A routine to show results, The min heap 
 // contains k most frequent words so far, at any time 
 void displayMinHeap( MinHeap* minHeap ) 
 { 
        // Sort in decreasing order of frequency
        std::sort(minHeap->array, minHeap->array + minHeap->count, compareMinHeapNodes);

        // print top K word with frequency 
        for(int i = 0; i < minHeap->count; ++i ) {

                // convert the stored word to lowercase for consistency
                for(int j = 0; minHeap->array[i].word[j]; j++)
                        minHeap->array[i].word[j] = tolower( minHeap->array[i].word[j] );

                // display word and frequency
                printf("%d %s\n", 
                                    minHeap->array[i].frequency,
                                    minHeap->array[i].word
                ); 
        }
 } 

 // The main function that takes a file as input, add words to heap 
 // and Trie, finally shows result from heap 
 void printKMostFreq( FILE* fp, int k ) 
 { 
        // Create a Min Heap of Size k 
        MinHeap* minHeap = createMinHeap( k ); 
        
        // Create an empty Trie 
        TrieNode* root = NULL; 

        // A buffer to store one word at a time (as well as junk inbetween words)
        char buffer[BUFFER_SIZE]; 

        // Prepare format strings for input scanning
        char format[20];
        sprintf(format, "%%%d[a-zA-Z]", (int)sizeof(buffer)-1);
        char junkformat[20];
        sprintf(junkformat, "%%%d[^a-zA-Z]", (int)sizeof(buffer)-1);

        // Read words one by one from file. Insert the word in Trie and Min Heap 
        fscanf( fp, junkformat, buffer );
        while( fscanf( fp, format, buffer ) != EOF )  {
                insertTrieAndHeap(buffer, &root, minHeap); 
                fscanf( fp, junkformat, buffer );
        }

        // The Min Heap will have the k most frequent words, so print Min Heap nodes 
        displayMinHeap( minHeap ); 
 } 

 int main(int argc, char* argv[]) 
 { 
        if (argc < 3) {
                printf("Usage: %s k filename\n", argv[0]);
                printf("    k  - number of most frequent words to display\n");
                printf("    fn - text file name\n");
                return 1;
        }
        int k = std::stoi(argv[1]); 
        FILE *fp = fopen (argv[2], "r"); 
        if (fp == NULL) 
                printf ("file \"%s\" doesn't exist\n", argv[2]); 
        else
                printKMostFreq (fp, k); 
        return 0; 
 }
	/*
	* Solution of the Jon Bentley's k most frequent words problem using a prefix tree and
	* a heap of size k. Worst case time complexity is O(N log k), where N is the total
	* number of words.
	*
	* The problem is formulated in the Communications of the ACM 29,5 (May 1986), 364-369:
	* "Given a text file and an integer k, you are to print the k
	* most common words in the file (and the number of their
	* occurrences) in decreasing frequency."
	*
	* Donald Knuth famously solved the problem by using hash tries in a an 8-pages-long
	* WEB program. Doug McIlroy, in response, solved it in six lines of shell script
	* reusing standard Unix utilities (even though his solution is suboptimal).
	*/

	// Original: https://www.geeksforgeeks.org/find-the-k-most-frequent-words-from-a-file/
	// Licence: Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)
	// Modifications by Andriy Makukha:
	// - increased max. word size
	// - scanning input for alphabetic words only, ignoring the rest
	// - taking arguments from command line
	// - sorting output

	#include <stdio.h>
	#include <string>
	#include <algorithm>

	# define MAX_CHARS 26
	# define BUFFER_SIZE 10000 // both max. word size and max. distance between words (+1)

	// A Trie node
	struct TrieNode
	{
	bool isEnd; // indicates end of word
	unsigned frequency; // the number of occurrences of a word
	int indexMinHeap; // the index of the word in minHeap
	TrieNode* child[MAX_CHARS]; // represents 26 slots each for 'a' to 'z'.
	};

	// A Min Heap node
	struct MinHeapNode
	{
	TrieNode* root; // indicates the leaf node of TRIE
	unsigned frequency; // number of occurrences
	char* word; // the actual word stored
	};

	// A Min Heap
	struct MinHeap
	{
	unsigned capacity; // the total size a min heap
	int count; // indicates the number of slots filled.
	MinHeapNode* array; // represents the collection of minHeapNodes
	};

	bool compareMinHeapNodes(MinHeapNode n1, MinHeapNode n2) {
	return n1.frequency > n2.frequency;
	};

	// A routine to create a new Trie node
	TrieNode* newTrieNode()
	{
	// Allocate memory for Trie Node
	TrieNode* trieNode = new TrieNode;

	// Initialize values for new node
	trieNode->isEnd = 0;
	trieNode->frequency = 0;
	trieNode->indexMinHeap = -1;
	for( int i = 0; i < MAX_CHARS; ++i )
	trieNode->child[i] = NULL;

	return trieNode;
	}

	// A routine to create a Min Heap of given capacity
	MinHeap* createMinHeap( int capacity )
	{
	MinHeap* minHeap = new MinHeap;

	minHeap->capacity = capacity;
	minHeap->count = 0;

	// Allocate memory for array of min heap nodes
	minHeap->array = new MinHeapNode [ minHeap->capacity ];

	return minHeap;
	}

	// A routine to swap two min heap nodes. This function
	// is needed in minHeapify
	void swapMinHeapNodes ( MinHeapNode* a, MinHeapNode* b )
	{
	MinHeapNode temp = *a;
	a = b;
	*b = temp;
	}

	// This is the standard minHeapify function. It does one thing extra.
	// It updates the minHeapIndex in Trie when two nodes are swapped in
	// in min heap
	void minHeapify( MinHeap* minHeap, int idx )
	{
	int left, right, smallest;

	left = 2 * idx + 1;
	right = 2 * idx + 2;
	smallest = idx;
	if ( left < minHeap->count &&
	minHeap->array[ left ]. frequency <
	minHeap->array[ smallest ]. frequency
	)
	smallest = left;

	if ( right < minHeap->count &&
	minHeap->array[ right ]. frequency <
	minHeap->array[ smallest ]. frequency
	)
	smallest = right;

	if( smallest != idx )
	{
	// Update the corresponding index in Trie node.
	minHeap->array[ smallest ]. root->indexMinHeap = idx;
	minHeap->array[ idx ]. root->indexMinHeap = smallest;

	// Swap nodes in min heap
	swapMinHeapNodes (&minHeap->array[ smallest ], &minHeap->array[ idx ]);

	minHeapify( minHeap, smallest );
	}
	}

	// A standard function to build a heap
	void buildMinHeap( MinHeap* minHeap )
	{
	int n, i;
	n = minHeap->count - 1;

	for( i = ( n - 1 ) / 2; i >= 0; --i )
	minHeapify( minHeap, i );
	}

	// Inserts a word to heap, the function handles the 3 cases explained above
	void insertInMinHeap( MinHeap* minHeap, TrieNode** root, const char* word )
	{
	// Case 1: the word is already present in minHeap
	if( (*root)->indexMinHeap != -1 )
	{
	++( minHeap->array[ (*root)->indexMinHeap ]. frequency );

	// percolate down
	minHeapify( minHeap, (*root)->indexMinHeap );
	}

	// Case 2: Word is not present and heap is not full
	else if( minHeap->count < minHeap->capacity )
	{
	int count = minHeap->count;
	minHeap->array[ count ]. frequency = (*root)->frequency;
	minHeap->array[ count ]. word = strdup( word );
	minHeap->array[ count ]. root = *root;
	(*root)->indexMinHeap = minHeap->count;

	++( minHeap->count );
	buildMinHeap( minHeap );
	}

	// Case 3: Word is not present and heap is full. And frequency of word
	// is more than root. The root is the least frequent word in heap,
	// replace root with new word
	else if ( (*root)->frequency > minHeap->array[0]. frequency )
	{

	minHeap->array[ 0 ]. root->indexMinHeap = -1;
	minHeap->array[ 0 ]. root = *root;
	minHeap->array[ 0 ]. root->indexMinHeap = 0;
	minHeap->array[ 0 ]. frequency = (*root)->frequency;

	// delete previously allocated memory and
	free( minHeap->array[ 0 ]. word );
	minHeap->array[ 0 ]. word = strdup( word );

	minHeapify ( minHeap, 0 );
	}
	}

	// Inserts a new word to both Trie and Heap
	void insertUtil ( TrieNode** root, MinHeap* minHeap,
	const char* word, const char* dupWord )
	{
	// Base Case
	if ( *root == NULL )
	*root = newTrieNode();

	// There are still more characters in word
	if ( *word != '\0' )
	insertUtil ( &((root)->child[ tolower( word ) - 97 ]),
	minHeap, word + 1, dupWord );
	else // The complete word is processed
	{
	// word is already present, increase the frequency
	if ( (*root)->isEnd )
	++( (*root)->frequency );
	else
	{
	(*root)->isEnd = 1;
	(*root)->frequency = 1;
	}

	// Insert in min heap also
	insertInMinHeap( minHeap, root, dupWord );
	}
	}


	// add a word to Trie & min heap. A wrapper over the insertUtil
	void insertTrieAndHeap(const char word, TrieNode* root, MinHeap* minHeap)
	{
	insertUtil( root, minHeap, word, word );
	}

	// A routine to show results, The min heap
	// contains k most frequent words so far, at any time
	void displayMinHeap( MinHeap* minHeap )
	{
	// Sort in decreasing order of frequency
	std::sort(minHeap->array, minHeap->array + minHeap->count, compareMinHeapNodes);

	// print top K word with frequency
	for(int i = 0; i < minHeap->count; ++i ) {

	// convert the stored word to lowercase for consistency
	for(int j = 0; minHeap->array[i].word[j]; j++)
	minHeap->array[i].word[j] = tolower( minHeap->array[i].word[j] );

	// display word and frequency
	printf("%d %s\n",
	minHeap->array[i].frequency,
	minHeap->array[i].word
	);
	}
	}

	// The main function that takes a file as input, add words to heap
	// and Trie, finally shows result from heap
	void printKMostFreq( FILE* fp, int k )
	{
	// Create a Min Heap of Size k
	MinHeap* minHeap = createMinHeap( k );

	// Create an empty Trie
	TrieNode* root = NULL;

	// A buffer to store one word at a time (as well as junk inbetween words)
	char buffer[BUFFER_SIZE];

	// Prepare format strings for input scanning
	char format[20];
	sprintf(format, "%%%d[a-zA-Z]", (int)sizeof(buffer)-1);
	char junkformat[20];
	sprintf(junkformat, "%%%d[^a-zA-Z]", (int)sizeof(buffer)-1);

	// Read words one by one from file. Insert the word in Trie and Min Heap
	fscanf( fp, junkformat, buffer );
	while( fscanf( fp, format, buffer ) != EOF ) {
	insertTrieAndHeap(buffer, &root, minHeap);
	fscanf( fp, junkformat, buffer );
	}

	// The Min Heap will have the k most frequent words, so print Min Heap nodes
	displayMinHeap( minHeap );
	}

	int main(int argc, char* argv[])
	{
	if (argc < 3) {
	printf("Usage: %s k filename\n", argv[0]);
	printf(" k - number of most frequent words to display\n");
	printf(" fn - text file name\n");
	return 1;
	}
	int k = std::stoi(argv[1]);
	FILE *fp = fopen (argv[2], "r");
	if (fp == NULL)
	printf ("file \"%s\" doesn't exist\n", argv[2]);
	else
	printKMostFreq (fp, k);
	return 0;
	}