Skip to content

Instantly share code, notes, and snippets.

@tty
Created February 8, 2010 14:24

Revisions

  1. tty revised this gist Feb 8, 2010. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion gistfile1.rb
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,5 @@
    # basic text search with relevancy for MongoDB. See http://blog.tty.nl/2010/02/08/simple-ranked-text-search-for-mongodb/
    # Basic text search with relevancy for MongoDB.
    # See http://blog.tty.nl/2010/02/08/simple-ranked-text-search-for-mongodb/
    # Copythingie 2010 - Ward Bekker - [email protected]

    #create (or empty) a docs collection
  2. tty revised this gist Feb 8, 2010. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions gistfile1.rb
    Original file line number Diff line number Diff line change
    @@ -1,3 +1,6 @@
    # basic text search with relevancy for MongoDB. See http://blog.tty.nl/2010/02/08/simple-ranked-text-search-for-mongodb/
    # Copythingie 2010 - Ward Bekker - [email protected]

    #create (or empty) a docs collection
    doc_col = MongoMapper.connection.db('example_db').collection('docs')
    doc_col.remove({})
  3. tty revised this gist Feb 8, 2010. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions gistfile1.rb
    Original file line number Diff line number Diff line change
    @@ -2,6 +2,7 @@
    doc_col = MongoMapper.connection.db('example_db').collection('docs')
    doc_col.remove({})

    #add some sample data
    doc_col.insert({ "txt" => "it is what it is"})
    doc_col.insert({ "txt" => "what is it"})
    doc_col.insert({ "txt" => "it is a banana"})
  4. tty revised this gist Feb 8, 2010. 1 changed file with 5 additions and 0 deletions.
    5 changes: 5 additions & 0 deletions gistfile1.rb
    Original file line number Diff line number Diff line change
    @@ -1,10 +1,12 @@
    #create (or empty) a docs collection
    doc_col = MongoMapper.connection.db('example_db').collection('docs')
    doc_col.remove({})

    doc_col.insert({ "txt" => "it is what it is"})
    doc_col.insert({ "txt" => "what is it"})
    doc_col.insert({ "txt" => "it is a banana"})

    # The inverted index ("invix") creation map function. Splits the text into individual words
    map_index =<<JS
    function() {
    var words = this.txt.split(' ');
    @@ -15,6 +17,7 @@
    }
    JS

    # Groups the doc ids for every unique word
    reduce_index =<<JS
    function(key, values) {
    var docs = [];
    @@ -25,6 +28,7 @@
    }
    JS

    # Every document counts as one
    map_relevance =<<JS
    function() {
    for ( var i=0; i< this.value.docs.length; i++ ) {
    @@ -33,6 +37,7 @@
    }
    JS

    # And calculate the number of occurrences for every unique document id
    reduce_relevance=<<JS
    function(key, values) {
    var sum = 0;
  5. tty created this gist Feb 8, 2010.
    55 changes: 55 additions & 0 deletions gistfile1.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,55 @@
    # Ranked text search for MongoDB built from two map/reduce passes:
    #   1. build an inverted index (word -> ids of the documents containing it)
    #   2. for the query words, count how often each document id appears
    # The resulting count is used as the relevance rank of a document.

    # Create (or empty) the docs collection.
    doc_col = MongoMapper.connection.db('example_db').collection('docs')
    doc_col.remove({})

    # Add some sample data.
    doc_col.insert({ "txt" => "it is what it is" })
    doc_col.insert({ "txt" => "what is it" })
    doc_col.insert({ "txt" => "it is a banana" })

    # Inverted index, map step: split the text on single spaces and emit the
    # document id once per word occurrence (a repeated word emits the id again).
    # `<<-JS` is used so the closing JS marker may be indented.
    map_index = <<-JS
      function() {
        var words = this.txt.split(' ');
        for (var i = 0; i < words.length; i++) {
          emit(words[i], { docs: [this._id] });
        }
      }
    JS

    # Inverted index, reduce step: concatenate the document id lists emitted
    # for the same word into a single list.
    reduce_index = <<-JS
      function(key, values) {
        var docs = [];
        values.forEach(function(val) { docs = docs.concat(val.docs); });
        return { docs: docs };
      }
    JS

    # Relevance, map step: every document id listed for a word counts as one.
    map_relevance = <<-JS
      function() {
        for (var i = 0; i < this.value.docs.length; i++) {
          emit(this.value.docs[i], { count: 1 });
        }
      }
    JS

    # Relevance, reduce step: sum the counts for every unique document id.
    reduce_relevance = <<-JS
      function(key, values) {
        var sum = 0;
        values.forEach(function(val) { sum += val.count; });
        return { count: sum };
      }
    JS

    # Calculate the inverted index.
    invix_col = doc_col.map_reduce(map_index, reduce_index)

    # Calculate the number of occurrences of the search terms per document.
    # The :query option restricts the pass to index entries whose _id (the word)
    # is one of the search terms.
    query = ["what", "is", "it"]
    ranked_result = invix_col.map_reduce(
      map_relevance,
      reduce_relevance,
      { :query => { "_id" => { "$in" => query } } }
    )

    # Output the results, most relevant on top. The count lives under "value"
    # in each result document (see result["value"]["count"] below), so the sort
    # key must be "value.count"; sorting on a top-level "count" would not order
    # the results by rank.
    ranked_result.find.sort("value.count", :desc).each do |result|
      doc_id = result["_id"]
      rank = result["value"]["count"]
      doc = doc_col.find_one("_id" => doc_id)
      puts "document with id #{doc_id} has rank #{rank} : #{doc.inspect}"
    end