Created
June 26, 2021 08:32
-
-
Save macleginn/8e1961c69e0cd8e4db8a8b188e11494e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collect every Unicode punctuation character, i.e. every code point whose
# general category starts with "P".
import sys
from unicodedata import category

# One-shot generator over all code points; it is consumed by the set build below.
chrs = (chr(i) for i in range(sys.maxunicode + 1))
punctuation = {c for c in chrs if category(c).startswith("P")}
# The hyphen can occur inside words, so it is not treated as punctuation.
punctuation.remove('-')
def tokenize(s, lower_case=False):
    """Split text into word tokens, discarding punctuation.

    Every character found in the module-level ``punctuation`` set acts as a
    token separator and is never part of a token. The hyphen ``-`` is not in
    that set, so hyphenated words stay in one piece; a token made up solely
    of hyphens (a hyphen used as a dash) is dropped.

    Args:
        s: The text to tokenize.
        lower_case: If true, lower-case the text before tokenizing.

    Returns:
        A list of tokens in their original order.
    """
    if lower_case:
        s = s.lower()
    # Blank out punctuation in a single pass. Padding each punctuation mark
    # with spaces and then filtering the resulting one-character tokens gives
    # exactly the same tokens as replacing the mark with a space.
    spaced = ''.join(' ' if ch in punctuation else ch for ch in s)
    # Hyphens survive the step above, so a free-standing hyphen ("a - b",
    # "- word") would come out as a token of its own; keep only tokens that
    # contain something besides hyphens.
    return [token for token in spaced.split() if token.strip('-')]
def get_n_grams(tokens, size):
    """Return all contiguous n-grams of ``size`` items from ``tokens``.

    Args:
        tokens: A sliceable sequence (for example a list of tokens).
        size: Number of items per n-gram; must be at least 1.

    Returns:
        A list of slices of ``tokens``, one per starting position, in order.
        The list is empty when ``tokens`` has fewer than ``size`` items.

    Raises:
        ValueError: If ``size`` is less than 1.
    """
    if size < 1:
        # A non-positive size has no meaningful n-grams; slicing with it
        # would silently yield empty or truncated windows.
        raise ValueError(f"size must be at least 1, got {size!r}")
    # range() is empty when size exceeds len(tokens), giving an empty result.
    return [tokens[start:start + size]
            for start in range(len(tokens) - size + 1)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment