dwickwire · January 28, 2014 05:39
diff --git a/spelling.py b/spelling.py
 # Probability of a spelling correction, c =
 #   Probability(c is a word) * 
 #       Probability(original is a typo for c)
 # Best correction =
 #   one with highest probability
 # Probability(c is a word) =
 #   estimated by counting
 # Probability(original is a typo for c) = 
 #   proportional to number of changes
 
 # correction("thew") = ?
 # Compare:
 # P("the" is a word) * P("thew" typo "the")
 # P("thaw" is a word) * P("thew" typo "thaw")
 # ... 
 
 import string, collections


 def train(filename):
    P = collections.defaultdict(lamba: 1)
    for line in file(filename):
        word, count = line.split()
        P[word] = int(count)
    return P


 P = train('en100k.txt')


 def edits1(word):
    n = len(word)
    return set([word[0:i]+word[i+1:] for i in range(n)] + # deletion
            [word[0:i]+word[i]+word[i+2:] for i in range(n-1) + # transposition
            [word[0:i]+c+word[i+1:] for i in range(n) for c in string.lowercase] + # alteration
            [word[0:i]+c+word[i:] for i in range(n+1) for c in string.lowercase]) # insertion
            

 def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in P)
    

 def known(words):
    return set(w for w in words if w in P)
    

 def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    return argmax(candidates, P)
	# Probability of a spelling correction, c =
	# Probability(c is a word) *
	# Probability(original is a typo for c)
	# Best correction =
	# one with highest probability
	# Probability(c is a word) =
	# estimated by counting
	# Probability(original is a typo for c) =
	# proportional to number of changes

	# correction("thew") = ?
	# Compare:
	# P("the" is a word) * P("thew" typo "the")
	# P("thaw" is a word) * P("thew" typo "thaw")
	# ...

	import string, collections


	def train(filename):
	P = collections.defaultdict(lamba: 1)
	for line in file(filename):
	word, count = line.split()
	P[word] = int(count)
	return P


	P = train('en100k.txt')


	def edits1(word):
	n = len(word)
	return set([word[0:i]+word[i+1:] for i in range(n)] + # deletion
	[word[0:i]+word[i]+word[i+2:] for i in range(n-1) + # transposition
	[word[0:i]+c+word[i+1:] for i in range(n) for c in string.lowercase] + # alteration
	[word[0:i]+c+word[i:] for i in range(n+1) for c in string.lowercase]) # insertion


	def known_edits2(word):
	return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in P)


	def known(words):
	return set(w for w in words if w in P)


	def correct(word):
	candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
	return argmax(candidates, P)