Skip to content

Instantly share code, notes, and snippets.

Created March 4, 2013 22:23

Revisions

  1. @invalid-email-address Anonymous created this gist Mar 4, 2013.
    147 changes: 147 additions & 0 deletions turbo.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,147 @@
    #include<iostream>
    #include<unistd.h>
    #include<algorithm>
    #include<map>
    #include<vector>
    #include<string>
    #include<cassert>
    #include<sstream>
    #include<fstream>
    using namespace std;

    char fix_character(char c)
    {
    if(c >= 'a' && c <= 'z')
    return c;

    if(c >= 'A' && c <= 'Z')
    return c + 32;

    return -1;
    }

    string fix_characters(string s)
    {
    string new_string;

    for(size_t i = 0; i < s.size(); i++) {
    char c = fix_character(s[i]);
    if(c != -1) new_string += c;
    }

    return new_string;
    }

    #define TRIE_SIZE 26

    struct Trie {
    vector<Trie*> children;
    bool endpoint;

    Trie()
    {
    this->children.assign(TRIE_SIZE, NULL);
    this->endpoint = false;
    }

    void add(string word)
    {
    word = fix_characters(word);
    add(word, 0);
    }

    void find_and_add_to_map(string word, map<string, unsigned int> &hits)
    {
    find_and_add_to_map(word, hits, 0);
    }

    private:
    void add(string &word, size_t index)
    {
    if(index == word.size()) {
    this->endpoint = true;
    } else {
    size_t character = word[index] - 'a';

    if(character > TRIE_SIZE) {
    cerr << "TURBO: Refusing to add " << word << " to search trie." << endl;
    return;
    }

    if(this->children[character] == NULL)
    this->children[character] = new Trie;

    this->children[character]->add(word, index + 1);
    }
    }

    void find_and_add_to_map(string &word, map<string, unsigned int> &hits, size_t index)
    {
    if(this->endpoint)
    hits[word.substr(0, index)]++;

    if(index == word.size() + 1)
    return;

    size_t character = word[index] - 'a';

    if(character > TRIE_SIZE) // error
    return;

    if(this->children[character] == NULL)
    return;

    this->children[character]->find_and_add_to_map(word, hits, index + 1);
    }
    };

    Trie build_search_trie()
    {
    Trie root;
    ifstream search("search.csv");
    string line;

    while(getline(search, line)) {
    stringstream ss(line);
    getline(ss, line, ';');

    root.add(line);
    }

    cerr << "TURBO: Done building search trie." << endl;

    return root;
    }

    map<string, unsigned int> corpus_search(Trie &root)
    {
    ifstream corpus("corpus.csv");
    string line;
    map<string, unsigned int> hits;

    while(getline(corpus, line)) {
    stringstream ss(line);

    for(size_t i = 0; i <= 3 && getline(ss, line, '|'); i++) {
    if(i == 0) continue;
    line = fix_characters(line);

    for(size_t k = 0; k < line.size(); k++)
    root.find_and_add_to_map(line.substr(k, line.size() - k), hits);
    }
    }

    return hits;
    }

    int main()
    {
    Trie root = build_search_trie();
    auto hits = corpus_search(root);

    // cout << "term;count" << endl;
    // for(auto hit : hits)
    // cout << hit.first << ";" << hit.second << endl;

    return 0;
    }