Skip to content

Instantly share code, notes, and snippets.

@bgreenlee
Created October 28, 2011 00:01

Revisions

  1. bgreenlee revised this gist Oct 28, 2011. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions autocorrrect.py
    Original file line number Diff line number Diff line change
    @@ -2,6 +2,8 @@
    import collections
    from operator import itemgetter

    WORDFILE = '/usr/share/dict/words'

    class Autocorrect(object):
    """
    Very simplistic implementation of autocorrect using ngrams.
    @@ -10,8 +12,7 @@ def __init__(self, ngram_size=3, len_variance=1):
    self.ngram_size = ngram_size
    self.len_variance = len_variance

    wordfile = os.path.join(os.path.dirname(__file__), "words")
    self.words = set(open(wordfile).read().splitlines())
    self.words = set([w.lower() for w in open(WORDFILE).read().splitlines()])

    # create dictionary of ngrams and the words that contain them
    self.ngram_words = collections.defaultdict(set)
    @@ -57,4 +58,3 @@ def suggested_words(self, target_word, results=5):
    else:
    suggestions = autocorrect.suggested_words(word)
    print "Maybe you meant: %s" % ", ".join(suggestions)

  2. bgreenlee created this gist Oct 28, 2011.
    60 changes: 60 additions & 0 deletions autocorrrect.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,60 @@
    import os.path
    import collections
    from operator import itemgetter

    class Autocorrect(object):
    """
    Very simplistic implementation of autocorrect using ngrams.
    """
    def __init__(self, ngram_size=3, len_variance=1):
    self.ngram_size = ngram_size
    self.len_variance = len_variance

    wordfile = os.path.join(os.path.dirname(__file__), "words")
    self.words = set(open(wordfile).read().splitlines())

    # create dictionary of ngrams and the words that contain them
    self.ngram_words = collections.defaultdict(set)
    for word in self.words:
    for ngram in self.ngrams(word):
    self.ngram_words[ngram].add(word)
    print "Generated %d ngrams from %d words" % (len(self.ngram_words), len(self.words))

    def lookup(self, word):
    "Return True if the word exists in the dictionary."
    return word in self.words

    def ngrams(self, word):
    "Given a word, return the set of unique ngrams in that word."
    all_ngrams = set()
    for i in range(0, len(word) - self.ngram_size + 1):
    all_ngrams.add(word[i:i + self.ngram_size])
    return all_ngrams

    def suggested_words(self, target_word, results=5):
    "Given a word, return a list of possible corrections."
    word_ranking = collections.defaultdict(int)
    possible_words = set()
    for ngram in self.ngrams(target_word):
    words = self.ngram_words[ngram]
    for word in words:
    # only use words that are within +-LEN_VARIANCE characters in
    # length of the target word
    if len(word) >= len(target_word) - self.len_variance and \
    len(word) <= len(target_word) + self.len_variance:
    word_ranking[word] += 1
    # sort by descending frequency
    ranked_word_pairs = sorted(word_ranking.iteritems(), key=itemgetter(1), reverse=True)
    return [word_pair[0] for word_pair in ranked_word_pairs[0:results]]


    if __name__ == '__main__':
    autocorrect = Autocorrect()
    while True:
    word = raw_input("Enter a word: ").lower()
    if autocorrect.lookup(word):
    print "Looks good to me!"
    else:
    suggestions = autocorrect.suggested_words(word)
    print "Maybe you meant: %s" % ", ".join(suggestions)