Last active
November 5, 2018 10:02
-
-
Save dwickwire/8662660 to your computer and use it in GitHub Desktop.
Word segmentation from Peter Norvig.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Probability of a segmentation = | |
# Probability(first word) * Probability(rest) | |
# Best segmentation = | |
# one with highest probability | |
# Probability(word) | |
# estimated by counting | |
# Eg. Best segmentation("nowisthetime...") | |
# Pf("n") * Pr("owisthetime...") = .003% * 10^-30% = 10^-34% | |
# Pf("no") * Pr("wisthetime...") = .26% * 10^-26% = 10^-29% | |
# Pf("now") * Pr("isthetime...") = .23% * 10^-21% = 10^-24% | |
# Pf("nowi") * Pr("sthetime...") = 10^-7% * 10^-21% = 10^-30% | |
# ... | |
from utils import Pw, product, memo | |
def splits(characters, longest=12): | |
"All ways to split chars into a first word and remainder" | |
return [(characters[:i], characters[i:]) | |
for i in range(1, 1+min(longest, len(characters)))] | |
def Pwords(words): return product(words, key=Pw) | |
@memo | |
def segment(text): | |
"Best segmentation of text into words, by probability." | |
return [] if (text =="") else ( | |
max([[first]+segment(rest) for first, rest in splits(text)], | |
key=Pwords)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment