Created
April 10, 2017 11:42
-
-
Save sergeytimoshin/ae2b7152ac425a8de1a1d2b47b0b27ce to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Translation of [Peter Norvig's spell checker](http://norvig.com/spell-correct.html) into Swift 3. | |
/// Sample input corpus [here](http://norvig.com/big.txt) | |
/// Swift 2 version: https://gist.github.com/airspeedswift/6d8c9d95f0d812f58be3 | |
import Foundation // purely for IO, most things done with Swift.String | |
/// Given a word, produce a set of possible alternatives with | |
/// letters transposed, deleted, replaced or rogue characters inserted | |
func edits(word: String) -> Set<String> { | |
if word.isEmpty { return [] } | |
let splits = word.characters.indices.map { | |
(word[word.startIndex..<$0], word[$0..<word.endIndex]) | |
} | |
let deletes = splits.map { $0.0 + String($0.1.characters.dropFirst()) } | |
let transposes: [String] = splits.map { left, right in | |
if let fst = right.characters.first { | |
let drop1 = String(right.characters.dropFirst()) | |
if let snd = drop1.characters.first { | |
let drop2 = String(drop1.characters.dropFirst()) | |
return "\(left)\(snd)\(fst)\(drop2)" | |
} | |
} | |
return "" | |
}.filter { !$0.isEmpty } | |
let alphabet = "abcdefghijklmnopqrstuvwxyz" | |
let replaces = splits.flatMap { left, right in | |
alphabet.characters.map { "\(left)\($0)\(String(right.characters.dropFirst()))" } | |
} | |
let inserts = splits.flatMap { left, right in | |
alphabet.characters.map { "\(left)\($0)\(right)" } | |
} | |
return Set(deletes + transposes + replaces + inserts) | |
} | |
struct SpellChecker { | |
var knownWords: [String:Int] = [:] | |
mutating func train(word: String) { | |
if let idx = knownWords[word] { | |
knownWords[word] = idx + 1 | |
} | |
else { | |
knownWords[word] = 1 | |
} | |
} | |
init?(contentsOfFile file: String) { | |
do { | |
let text = try String(contentsOfFile: file, encoding: .utf8).lowercased() | |
let words = text.unicodeScalars.split(whereSeparator: { !("a"..."z").contains($0) }).map { String($0) } | |
for word in words { self.train(word: word) } | |
} | |
catch { | |
return nil | |
} | |
} | |
func knownEdits2(word: String) -> Set<String>? { | |
var known_edits: Set<String> = [] | |
for edit in edits(word: word) { | |
if let k = known(words: edits(word: edit)) { | |
known_edits.formUnion(k) | |
} | |
} | |
return known_edits.isEmpty ? nil : known_edits | |
} | |
func known<S: Sequence>(words: S) -> Set<String>? where S.Iterator.Element == String { | |
let s = Set(words.filter { self.knownWords.index(forKey: $0) != nil }) | |
return s.isEmpty ? nil : s | |
} | |
func candidates(word: String) -> Set<String> { | |
guard let result = known(words: [word]) ?? known(words: edits(word: word)) ?? knownEdits2(word: word) else { | |
return Set<String>() | |
} | |
return result | |
} | |
func correct(word: String) -> String { | |
return candidates(word: word).reduce(word) { | |
(knownWords[$0] ?? 1) < (knownWords[$1] ?? 1) ? $1 : $0 | |
} | |
} | |
} | |
// main() | |
if CommandLine.argc == 2, let checker = SpellChecker(contentsOfFile: CommandLine.arguments[1]) { | |
print("Type word to check and hit enter") | |
while let word = String(data: FileHandle.standardInput.availableData, encoding: .utf8), word.characters.last == "\n" | |
{ | |
let word = String(word.characters.dropLast()) | |
let checked = checker.correct(word: word) | |
let candidates = checker.candidates(word: word) | |
if word == checked { | |
print("\(word) unchanged") | |
} | |
else { | |
print("Correct:\t\(word) -> \(checked)") | |
print("Candidates:\t\(word) -> \(candidates)") | |
} | |
} | |
} | |
else { | |
print("Usage: (Process.arguments[0]) <corpus_filename>") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for this! However, spell check fails for words like
youre
(checked isyour
instead ofyou're
), etc.