Last active
November 25, 2024 08:57
-
-
Save wotjd/61738b60ed6afd825774db77071aa5ca to your computer and use it in GitHub Desktop.
poor man's hangul fuzzy search implementation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Foundation | |
struct FuzzySearch { | |
struct Match { | |
let weight: Double | |
let indexes: [String.Index] | |
let string: String | |
} | |
struct Option: OptionSet { | |
let rawValue: Int | |
static let caseInsensitive = Option(rawValue: 1 << 0) | |
// ignore diacritic difference (for example, "naïve" and "naive"). | |
// static let diacriticInsensitive = Option(rawValue: 1 << 1) | |
// compatibility mapping for example, ® and r | |
// static let compatibilityMapping | |
// static let partialHangulMatch = Option(rawValue: 1 << 7) | |
static let hangulChosungMatch = Option(rawValue: 1 << 8) | |
} | |
enum Scale { | |
case byMaxDistance | |
case byTotalDistance | |
} | |
private let chosungDict = "ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ".enumerated().reduce(into: [Character: Character]()) { dict, enumerated in | |
let pair = Character(Unicode.Scalar(0x1100 + enumerated.offset)!) | |
dict[enumerated.element] = pair | |
dict[pair] = enumerated.element | |
} | |
var matchOption: Option = [.caseInsensitive, .hangulChosungMatch] | |
var scale = Scale.byMaxDistance | |
var countsFirstMatchDistance = false | |
private func calculateWeight( | |
stringCount: Int, | |
firstMatchDistance: Int, | |
matchCount: Int, | |
totalDistance: Int | |
) -> Double { | |
// Double((stringCount - firstMatchDistance) * stringCount + matchCount) / Double(stringCount * stringCount) + 1 / Double(totalDistance + 2) | |
1 / Double(totalDistance + 2) | |
} | |
func callAsFunction( | |
keyword: String, | |
from string: String | |
) -> Match? { | |
guard keyword.count <= string.count && !keyword.isEmpty else { return nil } | |
var (keyword, string) = if matchOption.contains(.caseInsensitive) { | |
(keyword.lowercased(), string.lowercased()) | |
} else { | |
(keyword, string) | |
} | |
var indexes = [String.Index]() | |
var distance = 0 | |
func findIndex(of character: Character, from index: String.Index) -> String.Index? { | |
string[index..<string.endIndex].firstIndex { | |
if $0 == character { | |
true | |
} else if | |
matchOption.contains(.hangulChosungMatch), | |
let anotherChosung = chosungDict[character] | |
{ | |
String($0).decomposedStringWithCanonicalMapping.contains { | |
[anotherChosung, character] | |
.compactMap(\.unicodeScalars.first) | |
.contains($0.unicodeScalars.first) | |
} | |
} else { | |
false | |
} | |
} | |
} | |
for character in keyword { | |
let lastIndex = 0 < indexes.count ? indexes[indexes.count - 1] : nil | |
let cursorIndex = lastIndex.map(string.index(after:)) ?? string.startIndex | |
guard | |
cursorIndex < string.endIndex, | |
let index = findIndex(of: character, from: cursorIndex) | |
else { return nil } | |
indexes.append(index) | |
guard lastIndex != nil || countsFirstMatchDistance else { continue } | |
let currentDistance = string.distance(from: cursorIndex, to: index) | |
distance = if scale == .byMaxDistance { | |
max(distance, currentDistance) | |
} else { | |
distance + currentDistance | |
} | |
} | |
return Match( | |
weight: | |
calculateWeight( | |
stringCount: string.count, | |
firstMatchDistance: string.distance(from: string.startIndex, to: indexes[0]), | |
matchCount: indexes.count, | |
totalDistance: distance | |
), | |
indexes: indexes, | |
string: string | |
) | |
} | |
func callAsFunction( | |
keyword: String, | |
from strings: [String] | |
) -> [Match] { | |
strings.compactMap { callAsFunction(keyword: keyword, from: $0) } | |
} | |
} | |
// usage | |
FuzzySearch()(keyword: "ㄷㄹㅈ", from: "다람쥐 쳇바퀴 어쩌구") | |
FuzzySearch()(keyword: "도군", from: ["강원도 고성군", "경기도 용인시", "충청남도 금산군", "서울특별시"]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment