blacklist.txt
LINES = 5
́
̈
̂
⃗
̆
// The problem statement (in Russian): https://docs.google.com/forms/d/e/1FAIpQLSc-T7mmLsI4skRKbm2px3ssBmtv1cbiIp1oBpvYXC_62qb9tw/viewform
// Mirror: https://www.evernote.com/shard/s171/sh/57bba412-db24-4496-8aee-fbc1ef18db88/5162aeea37a68042ba7f162356edb83c
//
//
// Key points:
// * no UTF-8 code point can start in the middle of another code point
//   (UTF-8 is self-synchronizing)
// * we need to support Russian, including a combining breve over `и'
// * it is enough to map `й' -> `и', `Ё' -> `е', etc.
// * according to the test on the problem page, the order is lexicographical
//
//
// Decision:
// * std::map to normalize case and map separators to spaces
// * std::set to drop accents and selected combining modifiers
// * use a good source of knowledge to build these tables (Python + unicodedata)
//
//
// References and links:
// [1] Russian code page [http://www.utf8-chartable.de/unicode-utf8-table.pl?start=1024]
// [2] Russian characters are 2 bytes each: lead byte in [0xd0, 0xd3],
//     continuation byte in [0x80, 0xbf].
// [3] https://habrahabr.ru/post/262679/
//
//
// WARNING!
// ========
// I strip all accents: for some languages this is unacceptable. You can relax
// this by applying the blacklist rejection only when the codepoint is
// Russian/English, which is easy to do (see the (**) comments in
// 'UTF8Tokenizer::eat()' and the hedged helper sketch after the includes).
// Russian codepoints have lead bytes 0xd0 .. 0xd3.
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <algorithm>
#include <vector>
#include <string>
#include <map>
#include <set>
#include <iostream>
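// ----------------------------------------------------------------------------
// Hedged helper sketch (my addition, see the WARNING above): predicates that
// could be used to apply the blacklist rejection only to Russian/English
// codepoints. Per reference [2], Russian letters are two-byte sequences with
// a lead byte in 0xd0..0xd3 and a continuation byte in 0x80..0xbf.
// ----------------------------------------------------------------------------
static bool is_russian_lead_byte(unsigned char b) { return b >= 0xd0 && b <= 0xd3; }
static bool is_english_letter(unsigned char b)    { return isalpha(b) != 0; }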
// ----------------------------------------------------------------------------
// An instance of this class can be extended with a 'replacement dictionary'
// and a blacklist. Usage:
//   UTF8Tokenizer u;
//   u.load_mapping("mappings.txt");
//   u.load_blacklist("blacklist.txt");
//
//   const char* res1 = u.eat(byte);
//   const char* res2 = u.eat(byte);
//   ...
// You feed bytes one by one into 'u.eat()'; it returns 'nullptr' while a token
// is not yet assembled (or has been rejected), otherwise a NUL-terminated
// C string holding the UTF-8 codepoint. A minimal self-check follows the
// class definition.
// ----------------------------------------------------------------------------
class UTF8Tokenizer {
    std::map<std::string, std::string> simplify;   // Й -> и, ё -> е, ...
    std::set<std::string> blacklist;               // Stress marks, breves, accents.
    int pos;
    int chunk_size;
    char chunk[5];
public:
    UTF8Tokenizer() : pos(-1), chunk_size(0)
    {}
    void load_mapping(const char* filename)
    {
        // Added manually to simplify parsing of the replacement data file.
        simplify["'"] = " ";
        simplify["\""] = " ";
        simplify["\n"] = " ";
        FILE *fp = fopen(filename, "rb");
        assert(fp && "Please run `unicode_lookup.py` to generate the maps.");
        int numlines;
        fscanf(fp, "LINES = %d ", &numlines);
        char from[5], to[5];
        for (int i = 0 ; i < numlines ; ++i) {
            // Width limits keep a malformed file from overflowing the buffers.
            fscanf(fp, "'%4[^']' -> '%4[^']'\n", from, to);
            simplify[std::string(from)] = std::string(to);
        }
        fclose(fp);
    }
    void load_blacklist(const char* filename)
    {
        FILE *fp = fopen(filename, "rb");
        assert(fp && "Please run `unicode_lookup.py` to generate the maps.");
        int numlines;
        fscanf(fp, "LINES = %d\n", &numlines);
        char token[5];
        for (int i = 0 ; i < numlines ; ++i) {
            fscanf(fp, "%4[^\n]\n", token);
            blacklist.insert(std::string(token));
        }
        fclose(fp);
    }
    int utf8size(int byte)
    {
        // TODO: 3-bit lookup table? 8 elements, seems ok (see the sketch below).
        if (byte < 128) {
            return 1;
        }
        int count = 0;
        while (byte & 128) {
            byte <<= 1;
            count++;
        }
        return count;
    }
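    // Hedged sketch of the TODO above (my addition, not in the original gist):
    // a lookup table indexed by the top bits of the lead byte. Three bits
    // (8 entries) cannot tell a 3-byte lead (1110xxxx) from a 4-byte lead
    // (11110xxx), so this table uses the top four bits; continuation bytes
    // (10xxxxxx) map to 0.
    int utf8size_lut(int byte)
    {
        static const int size_by_top4[16] =
            {1, 1, 1, 1,  1, 1, 1, 1,  0, 0, 0, 0,  2, 2, 3, 4};
        return size_by_top4[(byte >> 4) & 15];
    }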
    // Eat bytes one by one and return either a valid UTF-8 codepoint or
    // `nullptr` if the token is not assembled yet. See the full comment above
    // the class.
    const char* eat(int byte)
    {
        if (pos < 0) {
            chunk_size = utf8size(byte);
            if (chunk_size < 1 || chunk_size > 4) {
                // Invalid lead byte (e.g. a stray continuation byte): skip it
                // instead of overrunning 'chunk'.
                return nullptr;
            }
            // Set or drop a 'Russian or English' flag here and use it below (**)
            pos = 0;
        }
        chunk[pos++] = byte;
        if (pos == chunk_size) {
            chunk[pos] = 0;
            pos = -1;
            // Below you can optionally cancel the rejection for non-Russian languages.
            auto drop_pos = blacklist.find(chunk);
            if (drop_pos != blacklist.end())        // (**)
                return nullptr;
            auto simplify_pos = simplify.find(chunk);
            if (simplify_pos != simplify.end()) {
                return simplify_pos->second.c_str();
            }
            return chunk;
        }
        return nullptr;
    }
    ~UTF8Tokenizer ()
    {
        assert(pos < 0 && "There is an unfinished token waiting in the buffer.");
    }
};
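// ----------------------------------------------------------------------------
// Hedged self-check (my addition, not part of the original solution): with an
// empty mapping and blacklist, a two-byte Russian codepoint must come back
// unchanged once both of its bytes have been eaten.
// ----------------------------------------------------------------------------
static void check_tokenizer_assembly()
{
    UTF8Tokenizer t;                    // nothing loaded: no mapping, no blacklist
    assert(t.eat(0xd0) == nullptr);     // first byte of "ж": token not ready yet
    const char *token = t.eat(0xb6);    // second byte completes the codepoint
    assert(token && strcmp(token, "\xd0\xb6") == 0);
}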
// ----------------------------------------------------------------------------
// Simple wrapper for a pair "word + count".
// Provides the operators needed to interface with std::sort().
// Note that counts are sorted in descending order while words are sorted
// normally (a small self-check follows the struct):
//
//   +-- [ https://en.wikipedia.org/wiki/UTF-8#Advantages ]
//   | Sorting a set of UTF-8 encoded strings as strings of unsigned bytes
//   | yields the same order as sorting the corresponding Unicode strings
//   | lexicographically by codepoint.
// ----------------------------------------------------------------------------
struct Pair {
    int count;
    std::string word;
    Pair(const char *word) : count(1), word(word)
    {
    }
    // Used by 'std::sort' (the compiler-generated assignment would also do).
    Pair& operator= (const Pair& other)
    {
        // Protect against invalid self-assignment.
        if (this != &other)
        {
            count = other.count;
            word = other.word;
        }
        // By convention, always return *this.
        return *this;
    }
    // Required by 'std::sort'.
    bool operator<(const Pair& b) const
    {
        // ::count in decreasing, ::word in lexicographically increasing order.
        if (count > b.count)
            return true;
        if (count < b.count)
            return false;
        return strcmp(word.c_str(), b.word.c_str()) < 0;
    }
};
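// ----------------------------------------------------------------------------
// Hedged self-check of the ordering (my addition, not part of the original
// solution): counts descending, ties broken by byte-wise word order.
// ----------------------------------------------------------------------------
static void check_pair_ordering()
{
    std::vector<Pair> v;
    v.push_back(Pair("b")); v.back().count = 3;
    v.push_back(Pair("a")); v.back().count = 3;
    v.push_back(Pair("z")); v.back().count = 5;
    std::sort(v.begin(), v.end());
    assert(v[0].word == "z");                       // highest count first
    assert(v[1].word == "a" && v[2].word == "b");   // equal counts: lexicographic
}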
// ----------------------------------------------------------------------------
// Container for incoming words.
// Search is linear; a simple map<word, count> would fix that if necessary
// (a hedged sketch of that variant follows this class).
// ----------------------------------------------------------------------------
class FreqCounter {
    std::vector<Pair> dict;
public:
    void insert(const char *word)
    {
        for (size_t i = 0 ; i < dict.size() ; ++i) {
            if (dict[i].word == word) {
                dict[i].count++;
                return;
            }
        }
        dict.push_back(Pair(word));
    }
    void dump_sorted_into_file(const char* filename)
    {
        FILE *fp = fopen(filename, "wb");
        assert(fp && "Cannot open output file for writing.");
        std::sort(dict.begin(), dict.end());
        for (size_t i = 0 ; i < dict.size() ; ++i)
            fprintf(fp, "%d %s\n", dict[i].count, dict[i].word.c_str());
        fclose(fp);
    }
};
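// ----------------------------------------------------------------------------
// Hedged sketch of the map-based variant mentioned above (my addition, not
// part of the original solution): std::map<word, count> gives O(log n)
// insertion; the pairs are copied out once before sorting and dumping.
// ----------------------------------------------------------------------------
class FreqCounterMap {
    std::map<std::string, int> counts;
public:
    void insert(const char *word) { ++counts[word]; }
    void dump_sorted_into_file(const char* filename)
    {
        std::vector<Pair> dict;
        for (std::map<std::string, int>::const_iterator it = counts.begin();
             it != counts.end(); ++it) {
            Pair p(it->first.c_str());
            p.count = it->second;
            dict.push_back(p);
        }
        FILE *fp = fopen(filename, "wb");
        assert(fp && "Cannot open output file for writing.");
        std::sort(dict.begin(), dict.end());
        for (size_t i = 0 ; i < dict.size() ; ++i)
            fprintf(fp, "%d %s\n", dict[i].count, dict[i].word.c_str());
        fclose(fp);
    }
};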
// ----------------------------------------------------------------------------
// Structure:
//   prepare instances of the tokenizer and the counter
//   feed the incoming text byte by byte to the UTF-8 assembler
//   merge the outgoing UTF-8 codepoints into words with a simple finite
//     automaton
//   dump the result into a file.
// ----------------------------------------------------------------------------
int main(int argc, char **argv)
{
    assert(argc == 3 && "Usage: ./a.out in.txt out.txt");
    FILE *fin = fopen(argv[1], "rb");
    assert(fin && "Cannot open input file for reading.");
    FreqCounter fq;
    UTF8Tokenizer utf8tokenizer;
    utf8tokenizer.load_mapping("mappings.txt");
    utf8tokenizer.load_blacklist("blacklist.txt");
    // Trying automata-based programming [Shalyto].
    enum {EATING_SPACES, EATING_CHARS};
    int state = EATING_SPACES;
    int c;
    std::string word = "";
    while ((c = fgetc(fin)) != EOF) {
        const char *token = utf8tokenizer.eat(c);
        if (!token) {
            continue;
        }
        switch (state) {
        case EATING_SPACES:
            if (token[0] != ' ') {
                state = EATING_CHARS;
                word = token;
            }
            break;
        case EATING_CHARS:
            if (token[0] == ' ') {
                state = EATING_SPACES;
                fq.insert(word.c_str());
                word = "";
            } else {
                word += token;
            }
            break;
        };
    }
    // Flush the last word if the input does not end with a separator.
    if (state == EATING_CHARS)
        fq.insert(word.c_str());
    fclose(fin);
    fq.dump_sorted_into_file(argv[2]);
    return 0;
}
mappings.txt
LINES = 160
'Э' -> 'э'
'V' -> 'v'
'А' -> 'а'
'Д' -> 'д'
'X' -> 'x'
'И' -> 'и'
'М' -> 'м'
'Р' -> 'р'
'Ф' -> 'ф'
'г' -> 'г'
'Ш' -> 'ш'
',' -> ' '
'\' -> ' '
'0' -> ' '
'n' -> 'n'
'4' -> ' '
'и' -> 'и'
'м' -> 'м'
'р' -> 'р'
'ф' -> 'ф'
'ш' -> 'ш'
'ь' -> 'ь'
'9' -> ' '
'b' -> 'b'
'T' -> 't'
'2' -> ' '
';' -> ' '
'`' -> ' '
'd' -> 'd'
'=' -> ' '
'l' -> 'l'
'p' -> 'p'
'?' -> ' '
'h' -> 'h'
'x' -> 'x'
'|' -> ' '
'A' -> 'a'
'C' -> 'c'
'Г' -> 'г'
'З' -> 'з'
'E' -> 'e'
'Л' -> 'л'
'5' -> ' '
'П' -> 'п'
'У' -> 'у'
'G' -> 'g'
'Ч' -> 'ч'
'Ы' -> 'ы'
'Я' -> 'я'
'I' -> 'i'
'3' -> ' '
'з' -> 'з'
'л' -> 'л'
'K' -> 'k'
'п' -> 'п'
't' -> 't'
'у' -> 'у'
'ч' -> 'ч'
' ' -> ' '
'M' -> 'm'
'ы' -> 'ы'
'я' -> 'я'
'S' -> 's'
'"' -> ' '
'O' -> 'o'
'W' -> 'w'
'[' -> ' '
'_' -> ' '
'$' -> ' '
'Q' -> 'q'
'c' -> 'c'
'z' -> 'z'
'g' -> 'g'
'k' -> 'k'
'&' -> ' '
'o' -> 'o'
'7' -> ' '
's' -> 's'
'w' -> 'w'
'(' -> ' '
'{' -> ' '
'6' -> ' '
'Ъ' -> 'ъ'
'Ь' -> 'ь'
'В' -> 'в'
'Ж' -> 'ж'
'К' -> 'к'
'>' -> ' '
'О' -> 'о'
'Т' -> 'т'
'Ц' -> 'ц'
'*' -> ' '
'Ю' -> 'ю'
'в' -> 'в'
'ж' -> 'ж'
'к' -> 'к'
'д' -> 'д'
'т' -> 'т'
'ц' -> 'ц'
'ъ' -> 'ъ'
'D' -> 'd'
'ю' -> 'ю'
'R' -> 'r'
'8' -> ' '
'Z' -> 'z'
'^' -> ' '
':' -> ' '
'f' -> 'f'
'j' -> 'j'
'<' -> ' '
'r' -> 'r'
'v' -> 'v'
'о' -> 'о'
'~' -> ' '
'Ё' -> 'е'
'@' -> ' '
'Б' -> 'б'
'B' -> 'b'
'Е' -> 'е'
'Й' -> 'и'
'Н' -> 'н'
'/' -> ' '
'С' -> 'с'
'Х' -> 'х'
'#' -> ' '
'Щ' -> 'щ'
'F' -> 'f'
'-' -> ' '
'б' -> 'б'
'.' -> ' '
'е' -> 'е'
'H' -> 'h'
'й' -> 'и'
'н' -> 'н'
'с' -> 'с'
'!' -> ' '
'J' -> 'j'
'х' -> 'х'
'щ' -> 'щ'
'э' -> 'э'
'а' -> 'а'
'L' -> 'l'
'ё' -> 'е'
'U' -> 'u'
'Y' -> 'y'
'%' -> ' '
'N' -> 'n'
']' -> ' '
'a' -> 'a'
'e' -> 'e'
'P' -> 'p'
'i' -> 'i'
'm' -> 'm'
'q' -> 'q'
')' -> ' '
'1' -> ' '
'u' -> 'u'
'y' -> 'y'
'}' -> ' '
'+' -> ' '
unicode_lookup.py
# -*- coding: UTF-8 -*-
# NB: Python 2 script (u-literals, dict.iteritems, print statement).
import unicodedata
import codecs

russian = u"йцукеёнгшщзхъфывапролджэячсмитьбю"


def make():
    """
    Our goal is to prepare the lookup tables for the UTF-8 Russian text tokenizer.
    We treat `й' as `и' and `ё' as `е'.
    """
    replacing = {
        u'ё': u'е',
        u'й': u'и',
    }
    # You can add any number of s⃗pêciál symbols here.
    removing = set(u'\u0301 \u20d7 \u0302'.split())

    def extract_modifiers(u):
        # NFKD decomposition splits a precomposed letter into the base letter
        # plus its combining marks; everything after the base is blacklisted.
        u = unicodedata.normalize(u"NFKD", u)
        map(removing.add, u[1:])

    for c in russian:
        replacing[c] = replacing.get(c, c)
        replacing[c.upper()] = replacing.get(c, c)
        extract_modifiers(c)
        extract_modifiers(c.upper())
    for c in map(chr, range(32, 127)):
        if c.isalpha():
            replacing[c] = c.lower()
        elif c not in "\n'":  # These are added manually on the C++ side.
            replacing[c] = ' '
    with codecs.open('mappings.txt', 'w', 'utf-8') as fp:
        fp.write(u"LINES = %d\n" % len(replacing))
        for item in replacing.iteritems():
            fp.write(u"'%s' -> '%s'\n" % item)
    with codecs.open('blacklist.txt', 'w', 'utf-8') as fp:
        fp.write(u"LINES = %d\n" % len(removing))
        for item in removing:
            fp.write(u"%s\n" % item)
    print u"\n".join([u"'%s' -> '%s'" % i for i in replacing.iteritems()])
    print u"\n".join([u"'%s' = 'a%s'" % (repr(i), i) for i in removing])


make()
PostMortem
There is a bug in the code, and at UNIGINE they found it instantly :-). ^M (carriage return), which should act as a separator, was treated as part of a word and counted.
The cause: instead of a locale and its traditional isalpha for deciding "letter / not a letter", a dictionary was used that maps every separator to ' ', lowercases everything, normalizes 'Й' -> 'и', strips stress marks and so on (isalpha and tolower in a single pass).
That dictionary is built by the Python script (unicode_lookup.py), where line 33, i.e.
for c in map(chr, range(32, 127)):
needs to be changed to
for c in map(chr, range(1, 127)):
Then all tabs, ^M and the like get entered into the dictionary and are handled as intended (their ASCII codes are below 32). A short illustration of the symptom follows.
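A minimal, hedged illustration (my addition, assuming the shipped mappings.txt and blacklist.txt): '\r' is neither mapped to a space nor blacklisted, so the tokenizer hands it back verbatim and the word automaton in main() glues it onto the current word.
    UTF8Tokenizer t;
    t.load_mapping("mappings.txt");
    t.load_blacklist("blacklist.txt");
    const char *tok = t.eat('\r');
    assert(tok && strcmp(tok, "\r") == 0);   // expected " ", got "\r"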
Overall the verdict was that the solution is essentially overengineered.
For comparison there is Ruslan Abdikeev's variant, which is very interesting to read:
the new and the old versions.
The discussion thread is here.
Waiting for the second round ;-)
It works and ignores stress marks, special symbols, breves, accents and so on.