blacklist.txt
LINES = 5
́
̈
̂
⃗
̆
// The problem statement (in Russian): https://docs.google.com/forms/d/e/1FAIpQLSc-T7mmLsI4skRKbm2px3ssBmtv1cbiIp1oBpvYXC_62qb9tw/viewform
// Mirror: https://www.evernote.com/shard/s171/sh/57bba412-db24-4496-8aee-fbc1ef18db88/5162aeea37a68042ba7f162356edb83c
//
//
// Key points:
// * no UTF-8 code point can start in the middle of another code point
//   (UTF-8 is self-synchronizing)
// * we need to support Russian, including a combining breve over `и'
// * it is enough to map `й' -> `и', `Ё' -> `е', etc.
// * according to the test on the problem page, the order is lexicographical
//
//
// Decision:
// * std::map to normalize case and map separators to spaces
// * std::set to drop accents and selected combining modifiers
// * use a good source of knowledge to build these tables (Python + unicodedata)
//
//
// References and links:
// [1] Russian code page [http://www.utf8-chartable.de/unicode-utf8-table.pl?start=1024]
// [2] Russian characters are 2 bytes each: lead byte in [0xd0, 0xd3],
//     continuation byte in [0x80, 0xbf].
// [3] https://habrahabr.ru/post/262679/
//
//
// WARNING!
// ========
// I strip all accents: for some languages this is unacceptable. You can relax
// this by applying the blacklist rejection only when the codepoint is
// Russian/English, which is easy to do (see the (**) comments in
// 'UTF8Tokenizer::eat()' and the hedged helper sketch after the includes).
// Russian codepoints have lead bytes 0xd0 .. 0xd3.
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <algorithm>
#include <vector>
#include <string>
#include <map>
#include <set>
#include <iostream>
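// ----------------------------------------------------------------------------
// Hedged helper sketch (my addition, see the WARNING above): predicates that
// could be used to apply the blacklist rejection only to Russian/English
// codepoints. Per reference [2], Russian letters are two-byte sequences with
// a lead byte in 0xd0..0xd3 and a continuation byte in 0x80..0xbf.
// ----------------------------------------------------------------------------
static bool is_russian_lead_byte(unsigned char b) { return b >= 0xd0 && b <= 0xd3; }
static bool is_english_letter(unsigned char b)    { return isalpha(b) != 0; }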
// ----------------------------------------------------------------------------
// An instance of this class can be extended with a 'replacement dictionary'
// and a blacklist. Usage:
//   UTF8Tokenizer u;
//   u.load_mapping("mappings.txt");
//   u.load_blacklist("blacklist.txt");
//
//   const char* res1 = u.eat(byte);
//   const char* res2 = u.eat(byte);
//   ...
// You feed bytes one by one into 'u.eat()'; it returns 'nullptr' while a token
// is not yet assembled (or has been rejected), otherwise a NUL-terminated
// C string holding the UTF-8 codepoint. A minimal self-check follows the
// class definition.
// ----------------------------------------------------------------------------
class UTF8Tokenizer {
    std::map<std::string, std::string> simplify;   // Й -> и, ё -> е, ...
    std::set<std::string> blacklist;               // Stress marks, breves, accents.
    int pos;
    int chunk_size;
    char chunk[5];
public:
    UTF8Tokenizer() : pos(-1), chunk_size(0)
    {}
    void load_mapping(const char* filename)
    {
        // Added manually to simplify parsing of the replacement data file.
        simplify["'"] = " ";
        simplify["\""] = " ";
        simplify["\n"] = " ";
        FILE *fp = fopen(filename, "rb");
        assert(fp && "Please run `unicode_lookup.py` to generate the maps.");
        int numlines;
        fscanf(fp, "LINES = %d ", &numlines);
        char from[5], to[5];
        for (int i = 0 ; i < numlines ; ++i) {
            // Width limits keep a malformed file from overflowing the buffers.
            fscanf(fp, "'%4[^']' -> '%4[^']'\n", from, to);
            simplify[std::string(from)] = std::string(to);
        }
        fclose(fp);
    }
    void load_blacklist(const char* filename)
    {
        FILE *fp = fopen(filename, "rb");
        assert(fp && "Please run `unicode_lookup.py` to generate the maps.");
        int numlines;
        fscanf(fp, "LINES = %d\n", &numlines);
        char token[5];
        for (int i = 0 ; i < numlines ; ++i) {
            fscanf(fp, "%4[^\n]\n", token);
            blacklist.insert(std::string(token));
        }
        fclose(fp);
    }
    int utf8size(int byte)
    {
        // TODO: 3-bit lookup table? 8 elements, seems ok (see the sketch below).
        if (byte < 128) {
            return 1;
        }
        int count = 0;
        while (byte & 128) {
            byte <<= 1;
            count++;
        }
        return count;
    }
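    // Hedged sketch of the TODO above (my addition, not in the original gist):
    // a lookup table indexed by the top bits of the lead byte. Three bits
    // (8 entries) cannot tell a 3-byte lead (1110xxxx) from a 4-byte lead
    // (11110xxx), so this table uses the top four bits; continuation bytes
    // (10xxxxxx) map to 0.
    int utf8size_lut(int byte)
    {
        static const int size_by_top4[16] =
            {1, 1, 1, 1,  1, 1, 1, 1,  0, 0, 0, 0,  2, 2, 3, 4};
        return size_by_top4[(byte >> 4) & 15];
    }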
    // Eat bytes one by one and return either a valid UTF-8 codepoint or
    // `nullptr` if the token is not assembled yet. See the full comment above
    // the class.
    const char* eat(int byte)
    {
        if (pos < 0) {
            chunk_size = utf8size(byte);
            if (chunk_size < 1 || chunk_size > 4) {
                // Invalid lead byte (e.g. a stray continuation byte): skip it
                // instead of overrunning 'chunk'.
                return nullptr;
            }
            // Set or drop a 'Russian or English' flag here and use it below (**)
            pos = 0;
        }
        chunk[pos++] = byte;
        if (pos == chunk_size) {
            chunk[pos] = 0;
            pos = -1;
            // Below you can optionally cancel the rejection for non-Russian languages.
            auto drop_pos = blacklist.find(chunk);
            if (drop_pos != blacklist.end())        // (**)
                return nullptr;
            auto simplify_pos = simplify.find(chunk);
            if (simplify_pos != simplify.end()) {
                return simplify_pos->second.c_str();
            }
            return chunk;
        }
        return nullptr;
    }
    ~UTF8Tokenizer ()
    {
        assert(pos < 0 && "There is an unfinished token waiting in the buffer.");
    }
};
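// ----------------------------------------------------------------------------
// Hedged self-check (my addition, not part of the original solution): with an
// empty mapping and blacklist, a two-byte Russian codepoint must come back
// unchanged once both of its bytes have been eaten.
// ----------------------------------------------------------------------------
static void check_tokenizer_assembly()
{
    UTF8Tokenizer t;                    // nothing loaded: no mapping, no blacklist
    assert(t.eat(0xd0) == nullptr);     // first byte of "ж": token not ready yet
    const char *token = t.eat(0xb6);    // second byte completes the codepoint
    assert(token && strcmp(token, "\xd0\xb6") == 0);
}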
// ----------------------------------------------------------------------------
// Simple wrapper for a pair "word + count".
// Provides the operators needed to interface with std::sort().
// Note that counts are sorted in descending order while words are sorted
// normally (a small self-check follows the struct):
//
//   +-- [ https://en.wikipedia.org/wiki/UTF-8#Advantages ]
//   | Sorting a set of UTF-8 encoded strings as strings of unsigned bytes
//   | yields the same order as sorting the corresponding Unicode strings
//   | lexicographically by codepoint.
// ----------------------------------------------------------------------------
struct Pair {
    int count;
    std::string word;
    Pair(const char *word) : count(1), word(word)
    {
    }
    // Used by 'std::sort' (the compiler-generated assignment would also do).
    Pair& operator= (const Pair& other)
    {
        // Protect against invalid self-assignment.
        if (this != &other)
        {
            count = other.count;
            word = other.word;
        }
        // By convention, always return *this.
        return *this;
    }
    // Required by 'std::sort'.
    bool operator<(const Pair& b) const
    {
        // ::count in decreasing, ::word in lexicographically increasing order.
        if (count > b.count)
            return true;
        if (count < b.count)
            return false;
        return strcmp(word.c_str(), b.word.c_str()) < 0;
    }
};
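// ----------------------------------------------------------------------------
// Hedged self-check of the ordering (my addition, not part of the original
// solution): counts descending, ties broken by byte-wise word order.
// ----------------------------------------------------------------------------
static void check_pair_ordering()
{
    std::vector<Pair> v;
    v.push_back(Pair("b")); v.back().count = 3;
    v.push_back(Pair("a")); v.back().count = 3;
    v.push_back(Pair("z")); v.back().count = 5;
    std::sort(v.begin(), v.end());
    assert(v[0].word == "z");                       // highest count first
    assert(v[1].word == "a" && v[2].word == "b");   // equal counts: lexicographic
}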
// ----------------------------------------------------------------------------
// Container for incoming words.
// Search is linear; a simple map<word, count> would fix that if necessary
// (a hedged sketch of that variant follows this class).
// ----------------------------------------------------------------------------
class FreqCounter {
    std::vector<Pair> dict;
public:
    void insert(const char *word)
    {
        for (size_t i = 0 ; i < dict.size() ; ++i) {
            if (dict[i].word == word) {
                dict[i].count++;
                return;
            }
        }
        dict.push_back(Pair(word));
    }
    void dump_sorted_into_file(const char* filename)
    {
        FILE *fp = fopen(filename, "wb");
        assert(fp && "Cannot open output file for writing.");
        std::sort(dict.begin(), dict.end());
        for (size_t i = 0 ; i < dict.size() ; ++i)
            fprintf(fp, "%d %s\n", dict[i].count, dict[i].word.c_str());
        fclose(fp);
    }
};
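// ----------------------------------------------------------------------------
// Hedged sketch of the map-based variant mentioned above (my addition, not
// part of the original solution): std::map<word, count> gives O(log n)
// insertion; the pairs are copied out once before sorting and dumping.
// ----------------------------------------------------------------------------
class FreqCounterMap {
    std::map<std::string, int> counts;
public:
    void insert(const char *word) { ++counts[word]; }
    void dump_sorted_into_file(const char* filename)
    {
        std::vector<Pair> dict;
        for (std::map<std::string, int>::const_iterator it = counts.begin();
             it != counts.end(); ++it) {
            Pair p(it->first.c_str());
            p.count = it->second;
            dict.push_back(p);
        }
        FILE *fp = fopen(filename, "wb");
        assert(fp && "Cannot open output file for writing.");
        std::sort(dict.begin(), dict.end());
        for (size_t i = 0 ; i < dict.size() ; ++i)
            fprintf(fp, "%d %s\n", dict[i].count, dict[i].word.c_str());
        fclose(fp);
    }
};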
// ----------------------------------------------------------------------------
// Structure:
//   prepare instances of the tokenizer and the counter
//   feed the incoming text byte by byte to the UTF-8 assembler
//   merge the outgoing UTF-8 codepoints into words with a simple finite
//     automaton
//   dump the result into a file.
// ----------------------------------------------------------------------------
int main(int argc, char **argv)
{
    assert(argc == 3 && "Usage: ./a.out in.txt out.txt");
    FILE *fin = fopen(argv[1], "rb");
    assert(fin && "Cannot open input file for reading.");
    FreqCounter fq;
    UTF8Tokenizer utf8tokenizer;
    utf8tokenizer.load_mapping("mappings.txt");
    utf8tokenizer.load_blacklist("blacklist.txt");
    // Trying automata-based programming [Shalyto].
    enum {EATING_SPACES, EATING_CHARS};
    int state = EATING_SPACES;
    int c;
    std::string word = "";
    while ((c = fgetc(fin)) != EOF) {
        const char *token = utf8tokenizer.eat(c);
        if (!token) {
            continue;
        }
        switch (state) {
        case EATING_SPACES:
            if (token[0] != ' ') {
                state = EATING_CHARS;
                word = token;
            }
            break;
        case EATING_CHARS:
            if (token[0] == ' ') {
                state = EATING_SPACES;
                fq.insert(word.c_str());
                word = "";
            } else {
                word += token;
            }
            break;
        };
    }
    // Flush the last word if the input does not end with a separator.
    if (state == EATING_CHARS)
        fq.insert(word.c_str());
    fclose(fin);
    fq.dump_sorted_into_file(argv[2]);
    return 0;
}
mappings.txt
LINES = 160
'Э' -> 'э'
'V' -> 'v'
'А' -> 'а'
'Д' -> 'д'
'X' -> 'x'
'И' -> 'и'
'М' -> 'м'
'Р' -> 'р'
'Ф' -> 'ф'
'г' -> 'г'
'Ш' -> 'ш'
',' -> ' '
'\' -> ' '
'0' -> ' '
'n' -> 'n'
'4' -> ' '
'и' -> 'и'
'м' -> 'м'
'р' -> 'р'
'ф' -> 'ф'
'ш' -> 'ш'
'ь' -> 'ь'
'9' -> ' '
'b' -> 'b'
'T' -> 't'
'2' -> ' '
';' -> ' '
'`' -> ' '
'd' -> 'd'
'=' -> ' '
'l' -> 'l'
'p' -> 'p'
'?' -> ' '
'h' -> 'h'
'x' -> 'x'
'|' -> ' '
'A' -> 'a'
'C' -> 'c'
'Г' -> 'г'
'З' -> 'з'
'E' -> 'e'
'Л' -> 'л'
'5' -> ' '
'П' -> 'п'
'У' -> 'у'
'G' -> 'g'
'Ч' -> 'ч'
'Ы' -> 'ы'
'Я' -> 'я'
'I' -> 'i'
'3' -> ' '
'з' -> 'з'
'л' -> 'л'
'K' -> 'k'
'п' -> 'п'
't' -> 't'
'у' -> 'у'
'ч' -> 'ч'
' ' -> ' '
'M' -> 'm'
'ы' -> 'ы'
'я' -> 'я'
'S' -> 's'
'"' -> ' '
'O' -> 'o'
'W' -> 'w'
'[' -> ' '
'_' -> ' '
'$' -> ' '
'Q' -> 'q'
'c' -> 'c'
'z' -> 'z'
'g' -> 'g'
'k' -> 'k'
'&' -> ' '
'o' -> 'o'
'7' -> ' '
's' -> 's'
'w' -> 'w'
'(' -> ' '
'{' -> ' '
'6' -> ' '
'Ъ' -> 'ъ'
'Ь' -> 'ь'
'В' -> 'в'
'Ж' -> 'ж'
'К' -> 'к'
'>' -> ' '
'О' -> 'о'
'Т' -> 'т'
'Ц' -> 'ц'
'*' -> ' '
'Ю' -> 'ю'
'в' -> 'в'
'ж' -> 'ж'
'к' -> 'к'
'д' -> 'д'
'т' -> 'т'
'ц' -> 'ц'
'ъ' -> 'ъ'
'D' -> 'd'
'ю' -> 'ю'
'R' -> 'r'
'8' -> ' '
'Z' -> 'z'
'^' -> ' '
':' -> ' '
'f' -> 'f'
'j' -> 'j'
'<' -> ' '
'r' -> 'r'
'v' -> 'v'
'о' -> 'о'
'~' -> ' '
'Ё' -> 'е'
'@' -> ' '
'Б' -> 'б'
'B' -> 'b'
'Е' -> 'е'
'Й' -> 'и'
'Н' -> 'н'
'/' -> ' '
'С' -> 'с'
'Х' -> 'х'
'#' -> ' '
'Щ' -> 'щ'
'F' -> 'f'
'-' -> ' '
'б' -> 'б'
'.' -> ' '
'е' -> 'е'
'H' -> 'h'
'й' -> 'и'
'н' -> 'н'
'с' -> 'с'
'!' -> ' '
'J' -> 'j'
'х' -> 'х'
'щ' -> 'щ'
'э' -> 'э'
'а' -> 'а'
'L' -> 'l'
'ё' -> 'е'
'U' -> 'u'
'Y' -> 'y'
'%' -> ' '
'N' -> 'n'
']' -> ' '
'a' -> 'a'
'e' -> 'e'
'P' -> 'p'
'i' -> 'i'
'm' -> 'm'
'q' -> 'q'
')' -> ' '
'1' -> ' '
'u' -> 'u'
'y' -> 'y'
'}' -> ' '
'+' -> ' '
unicode_lookup.py
# -*- coding: UTF-8 -*-
# NB: Python 2 script (u-literals, dict.iteritems, print statement).
import unicodedata
import codecs

russian = u"йцукеёнгшщзхъфывапролджэячсмитьбю"


def make():
    """
    Our goal is to prepare the lookup tables for the UTF-8 Russian text tokenizer.
    We treat `й' as `и' and `ё' as `е'.
    """
    replacing = {
        u'ё': u'е',
        u'й': u'и',
    }
    # You can add any number of s⃗pêciál symbols here.
    removing = set(u'\u0301 \u20d7 \u0302'.split())

    def extract_modifiers(u):
        # NFKD decomposition splits a precomposed letter into the base letter
        # plus its combining marks; everything after the base is blacklisted.
        u = unicodedata.normalize(u"NFKD", u)
        map(removing.add, u[1:])

    for c in russian:
        replacing[c] = replacing.get(c, c)
        replacing[c.upper()] = replacing.get(c, c)
        extract_modifiers(c)
        extract_modifiers(c.upper())
    for c in map(chr, range(32, 127)):
        if c.isalpha():
            replacing[c] = c.lower()
        elif c not in "\n'":  # These are added manually on the C++ side.
            replacing[c] = ' '
    with codecs.open('mappings.txt', 'w', 'utf-8') as fp:
        fp.write(u"LINES = %d\n" % len(replacing))
        for item in replacing.iteritems():
            fp.write(u"'%s' -> '%s'\n" % item)
    with codecs.open('blacklist.txt', 'w', 'utf-8') as fp:
        fp.write(u"LINES = %d\n" % len(removing))
        for item in removing:
            fp.write(u"%s\n" % item)
    print u"\n".join([u"'%s' -> '%s'" % i for i in replacing.iteritems()])
    print u"\n".join([u"'%s' = 'a%s'" % (repr(i), i) for i in removing])


make()
PostMortem
There is a bug in the code, and at UNIGINE they found it instantly :-). ^M (carriage return), which should act as a separator, was treated as part of a word and counted.
The cause: instead of a locale and its traditional isalpha for deciding "letter / not a letter", a dictionary was used that maps every separator to ' ', lowercases everything, normalizes 'Й' -> 'и', strips stress marks and so on (isalpha and tolower in a single pass).
That dictionary is built by the Python script (unicode_lookup.py), where line 33, i.e.
for c in map(chr, range(32, 127)):
needs to be changed to
for c in map(chr, range(1, 127)):
Then all tabs, ^M and the like get entered into the dictionary and are handled as intended (their ASCII codes are below 32). A short illustration of the symptom follows.
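A minimal, hedged illustration (my addition, assuming the shipped mappings.txt and blacklist.txt): '\r' is neither mapped to a space nor blacklisted, so the tokenizer hands it back verbatim and the word automaton in main() glues it onto the current word.
    UTF8Tokenizer t;
    t.load_mapping("mappings.txt");
    t.load_blacklist("blacklist.txt");
    const char *tok = t.eat('\r');
    assert(tok && strcmp(tok, "\r") == 0);   // expected " ", got "\r"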
Overall the verdict was that the solution is essentially overengineered.
For comparison there is Ruslan Abdikeev's variant, which is very interesting to read:
the new and the old versions.
The discussion thread is here.
Waiting for the second round ;-)
It works and ignores stress marks, special symbols, breves, accents and so on.