Created
August 23, 2012 08:59
-
-
Save voldmar/3434423 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
voldmar@work ~/temp % python rus.py
TEXT_RU
9.3005130291
6.33021712303
0.310887098312
0.146034002304
0.112301826477
TEXT_EN
7.99509000778
4.40287303925
1.02460694313
22.2835221291
8.09571194649
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
from timeit import timeit | |
import re | |
import itertools as it | |
import operator as op | |
# Every letter of the Russian alphabet, upper case first and then lower case.
# Kept in a single literal so the regex and the set below cannot drift apart.
_RUSSIAN_LETTERS = (
    u'ЙЦУКЕНГШЩЗХЪЭЖДЛОРПАВЫФЯЧСМИТЬБЮЁ'
    u'йцукенгшщзхъэждлорпавыфячсмитьбюё'
)

# Regex character class matching any single Russian letter.  A plain u''
# literal replaces the former ur"" one: the pattern has no backslashes, so the
# value is unchanged, and the ur prefix is not accepted by Python 3.
RUSSIAN_ALPHA = u'[' + _RUSSIAN_LETTERS + u']'

# The same letters as a frozenset, for hash-based membership tests.
RUSSIAN_ALPHA_SET = frozenset(_RUSSIAN_LETTERS)
def is_russian(s):
    """Return True if ``s`` contains at least one Russian letter.

    Variant 1: keep only the characters that are in RUSSIAN_ALPHA_SET and
    check whether anything is left.  This relies on Python 2's ``filter``
    returning a (possibly empty) string when given a string.
    """
    russian_chars = filter(RUSSIAN_ALPHA_SET.__contains__, s)
    return bool(russian_chars)
def is_russian2(s):
    """Return True if ``s`` contains at least one Russian letter.

    Variant 2: build a set from the characters of ``s`` and intersect it
    with RUSSIAN_ALPHA_SET; a non-empty intersection means a match.
    """
    common = RUSSIAN_ALPHA_SET & set(s)
    return bool(common)
def is_russian3(s):
    """Return True if ``s`` contains at least one Russian letter.

    Variant 3: search ``s`` with the RUSSIAN_ALPHA regex character class.
    """
    match = re.search(RUSSIAN_ALPHA, s)
    return match is not None
def is_russian4(s):
    """Return True if ``s`` contains at least one Russian letter.

    Variant 4: ``itertools.dropwhile`` skips characters for as long as they
    are not in RUSSIAN_ALPHA_SET; if the resulting iterator yields anything,
    a Russian letter was found.
    """
    tail = it.dropwhile(lambda ch: ch not in RUSSIAN_ALPHA_SET, s)
    try:
        # Only the first item matters; the rest of the text is never read.
        next(tail)
    except StopIteration:
        return False
    return True
def is_russian5(s):
    """Return True if ``s`` contains at least one Russian letter.

    Variant 5: ``itertools.ifilter`` lazily yields the characters that are in
    RUSSIAN_ALPHA_SET; if it yields anything, a Russian letter was found.
    """
    matches = it.ifilter(RUSSIAN_ALPHA_SET.__contains__, s)
    try:
        # Only the first item matters; the rest of the text is never read.
        next(matches)
    except StopIteration:
        return False
    return True
# Benchmark sample written in Russian; its very first character is already a
# Russian letter.
TEXT_RU = u'''Особенность рекламы масштабирует охват аудитории, не считаясь с
затратами. Позиционирование на рынке концентрирует поведенческий таргетинг,
отвоевывая рыночный сегмент. Несмотря на сложности, департамент маркетинга и
продаж допускает выставочный стенд, не считаясь с затратами.
Маркетингово-ориентированное издание, пренебрегая деталями, конструктивно.
Соц-дем характеристика аудитории, пренебрегая деталями, концентрирует
креативный пресс-клиппинг, осознавая социальную ответственность бизнеса.'''

# Benchmark sample with no Russian letters at all, so every variant has to
# look at the whole text before answering False.
TEXT_EN = u'''Lorem Ipsum is simply dummy text of the printing and typesetting
industry. Lorem Ipsum has been the industry's standard dummy text ever since
the 1500s, when an unknown printer took a galley of type and scrambled it to
make a type specimen book. It has survived not only five centuries, but also
the leap into electronic typesetting, remaining essentially unchanged. It was
popularised in the 1960s with the release of Letraset sheets containing Lorem
Ipsum passages, and more recently with desktop publishing software like Aldus
PageMaker including versions of Lorem Ipsum.'''

# Trim both samples to the length of the shorter one so the two benchmark
# runs work on inputs of equal size.  This used to take max(), which made
# both slices below no-ops and left the texts with different lengths.
LEN = min(len(TEXT_RU), len(TEXT_EN))
TEXT_RU, TEXT_EN = TEXT_RU[:LEN], TEXT_EN[:LEN]
# Time every detection variant on each sample text.  For each text the name
# is printed first, followed by one timing per variant (100000 calls each),
# in the order is_russian, is_russian2, ..., is_russian5.
_VARIANT_NAMES = (
    'is_russian',
    'is_russian2',
    'is_russian3',
    'is_russian4',
    'is_russian5',
)
for text_name in ('TEXT_RU', 'TEXT_EN'):
    print(text_name)
    for func_name in _VARIANT_NAMES:
        statement = '%s(%s)' % (func_name, text_name)
        # timeit runs in its own namespace, so the function and the text are
        # imported from the running script.
        setup = 'from __main__ import %s, %s' % (func_name, text_name)
        print(timeit(statement, setup, number=100000))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment