Created
March 4, 2013 22:23
Revisions
-
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,147 @@ #include<iostream> #include<unistd.h> #include<algorithm> #include<map> #include<vector> #include<string> #include<cassert> #include<sstream> #include<fstream> using namespace std; char fix_character(char c) { if(c >= 'a' && c <= 'z') return c; if(c >= 'A' && c <= 'Z') return c + 32; return -1; } string fix_characters(string s) { string new_string; for(size_t i = 0; i < s.size(); i++) { char c = fix_character(s[i]); if(c != -1) new_string += c; } return new_string; } #define TRIE_SIZE 26 struct Trie { vector<Trie*> children; bool endpoint; Trie() { this->children.assign(TRIE_SIZE, NULL); this->endpoint = false; } void add(string word) { word = fix_characters(word); add(word, 0); } void find_and_add_to_map(string word, map<string, unsigned int> &hits) { find_and_add_to_map(word, hits, 0); } private: void add(string &word, size_t index) { if(index == word.size()) { this->endpoint = true; } else { size_t character = word[index] - 'a'; if(character > TRIE_SIZE) { cerr << "TURBO: Refusing to add " << word << " to search trie." << endl; return; } if(this->children[character] == NULL) this->children[character] = new Trie; this->children[character]->add(word, index + 1); } } void find_and_add_to_map(string &word, map<string, unsigned int> &hits, size_t index) { if(this->endpoint) hits[word.substr(0, index)]++; if(index == word.size() + 1) return; size_t character = word[index] - 'a'; if(character > TRIE_SIZE) // error return; if(this->children[character] == NULL) return; this->children[character]->find_and_add_to_map(word, hits, index + 1); } }; Trie build_search_trie() { Trie root; ifstream search("search.csv"); string line; while(getline(search, line)) { stringstream ss(line); getline(ss, line, ';'); root.add(line); } cerr << "TURBO: Done building search trie." << endl; return root; } map<string, unsigned int> corpus_search(Trie &root) { ifstream corpus("corpus.csv"); string line; map<string, unsigned int> hits; while(getline(corpus, line)) { stringstream ss(line); for(size_t i = 0; i <= 3 && getline(ss, line, '|'); i++) { if(i == 0) continue; line = fix_characters(line); for(size_t k = 0; k < line.size(); k++) root.find_and_add_to_map(line.substr(k, line.size() - k), hits); } } return hits; } int main() { Trie root = build_search_trie(); auto hits = corpus_search(root); // cout << "term;count" << endl; // for(auto hit : hits) // cout << hit.first << ";" << hit.second << endl; return 0; }