Created
August 9, 2014 22:42
-
-
Save leth/1f2ba2d6b961fabe5d60 to your computer and use it in GitHub Desktop.
_clean_data benchmarking
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
import cStringIO
import os
import re
import StringIO
# Compiled once at import time: matches the start of a lookup plugin call
# ("lookup", optional whitespace, then an opening parenthesis) anywhere
# in the data being cleaned.
LOOKUP_REGEX = re.compile(r'lookup\s*\(')
def _clean_data(orig_data, from_remote=False, from_inventory=False):
    ''' remove jinja2 template tags from a string

    Baseline implementation built on ``StringIO.StringIO``.  It is kept
    operation-for-operation (one seek and one write per character) because
    it is the reference being timed by the benchmark in this file.

    Matched ``{% ... %}`` pairs are always rewritten to ``{# ... #}``.
    Matched ``{{ ... }}`` pairs are rewritten only when ``from_remote`` is
    true, or when ``from_inventory`` is true and the data contains both
    ``{{`` and a lookup call (see LOOKUP_REGEX).

    :arg orig_data: the value to clean; anything that is not a
        ``basestring`` is returned unchanged
    :kwarg from_remote: the data came from a remote host
    :kwarg from_inventory: the data came from inventory
    :returns: the cleaned string, the same length as the input
    '''

    if not isinstance(orig_data, basestring):
        return orig_data

    data = StringIO.StringIO("")

    # when the data is marked as having come from a remote, we always
    # replace any print blocks (ie. {{var}}), however when marked as coming
    # from inventory we only replace print blocks that contain a call to
    # a lookup plugin (ie. {{lookup('foo','bar'))}})
    replace_prints = from_remote or (from_inventory and '{{' in orig_data and LOOKUP_REGEX.search(orig_data) is not None)

    # these variables keep track of opening block locations, as we only
    # want to replace matched pairs of print/block tags
    print_openings = []
    block_openings = []

    for idx, c in enumerate(orig_data):
        # if the current character is an opening brace, check to
        # see if this is a jinja2 token. Otherwise, if the current
        # character is a closing brace, we back up one character to
        # see if we have a closing token.
        if c == '{' and idx < len(orig_data) - 1:
            token = orig_data[idx:idx+2]
            # if so, and we want to replace this block, push
            # this token's location onto the appropriate array
            if token == '{{' and replace_prints:
                print_openings.append(idx)
            elif token == '{%':
                block_openings.append(idx)
            # finally we write the current character to the buffer
            data.seek(0, os.SEEK_END)
            data.write(c)
        elif c == '}' and idx > 0:
            token = orig_data[idx-1:idx+1]
            prev_idx = -1
            if token == '%}' and len(block_openings) > 0:
                prev_idx = block_openings.pop()
            elif token == '}}' and len(print_openings) > 0:
                prev_idx = print_openings.pop()
            # if we have a closing token, and we have previously found
            # the opening to the same kind of block represented by this
            # token, replace both occurrences, otherwise we just write
            # the current character to the buffer
            if prev_idx != -1:
                # replace the opening; the replacement is the same width
                # as the original token, so later offsets stay valid
                data.seek(prev_idx, os.SEEK_SET)
                data.write('{#')
                # replace the closing: step back over the token's first
                # character, which is already in the buffer
                data.seek(-1, os.SEEK_END)
                data.write('#}')
            else:
                data.seek(0, os.SEEK_END)
                data.write(c)
        else:
            # not a jinja2 token, so we just write the current char
            # to the output buffer
            data.seek(0, os.SEEK_END)
            data.write(c)

    return_data = data.getvalue()
    data.close()
    return return_data
def _clean_data_cstringio(orig_data, from_remote=False, from_inventory=False):
    ''' remove jinja2 template tags from a string

    Same character-by-character algorithm as the StringIO baseline, with
    the output buffer swapped for ``cStringIO.StringIO`` so the benchmark
    can isolate the cost of the buffer implementation.  The per-character
    seek/write sequence is intentionally left as is.

    Matched ``{% ... %}`` pairs are always rewritten to ``{# ... #}``.
    Matched ``{{ ... }}`` pairs are rewritten only when ``from_remote`` is
    true, or when ``from_inventory`` is true and the data contains both
    ``{{`` and a lookup call (see LOOKUP_REGEX).

    :arg orig_data: the value to clean; anything that is not a
        ``basestring`` is returned unchanged
    :kwarg from_remote: the data came from a remote host
    :kwarg from_inventory: the data came from inventory
    :returns: the cleaned string, the same length as the input
    '''

    if not isinstance(orig_data, basestring):
        return orig_data

    data = cStringIO.StringIO()

    # when the data is marked as having come from a remote, we always
    # replace any print blocks (ie. {{var}}), however when marked as coming
    # from inventory we only replace print blocks that contain a call to
    # a lookup plugin (ie. {{lookup('foo','bar'))}})
    replace_prints = from_remote or (from_inventory and '{{' in orig_data and LOOKUP_REGEX.search(orig_data) is not None)

    # these variables keep track of opening block locations, as we only
    # want to replace matched pairs of print/block tags
    print_openings = []
    block_openings = []

    for idx, c in enumerate(orig_data):
        # if the current character is an opening brace, check to
        # see if this is a jinja2 token. Otherwise, if the current
        # character is a closing brace, we back up one character to
        # see if we have a closing token.
        if c == '{' and idx < len(orig_data) - 1:
            token = orig_data[idx:idx+2]
            # if so, and we want to replace this block, push
            # this token's location onto the appropriate array
            if token == '{{' and replace_prints:
                print_openings.append(idx)
            elif token == '{%':
                block_openings.append(idx)
            # finally we write the current character to the buffer
            data.seek(0, os.SEEK_END)
            data.write(c)
        elif c == '}' and idx > 0:
            token = orig_data[idx-1:idx+1]
            prev_idx = -1
            if token == '%}' and len(block_openings) > 0:
                prev_idx = block_openings.pop()
            elif token == '}}' and len(print_openings) > 0:
                prev_idx = print_openings.pop()
            # if we have a closing token, and we have previously found
            # the opening to the same kind of block represented by this
            # token, replace both occurrences, otherwise we just write
            # the current character to the buffer
            if prev_idx != -1:
                # replace the opening; the replacement is the same width
                # as the original token, so later offsets stay valid
                data.seek(prev_idx, os.SEEK_SET)
                data.write('{#')
                # replace the closing: step back over the token's first
                # character, which is already in the buffer
                data.seek(-1, os.SEEK_END)
                data.write('#}')
            else:
                data.seek(0, os.SEEK_END)
                data.write(c)
        else:
            # not a jinja2 token, so we just write the current char
            # to the output buffer
            data.seek(0, os.SEEK_END)
            data.write(c)

    return_data = data.getvalue()
    data.close()
    return return_data
# Matches every two-character jinja2 delimiter that opens or closes a
# print block or a statement block: "{{", "{%", "%}" and "}}".
PRINT_CODE_REGEX = re.compile(r'(?:{[{%]|[%}]})')
# Matches only the statement block delimiters "{%" and "%}".
ONLY_CODE_REGEX = re.compile(r'(?:{%|%})')
def _regex(orig_data, from_remote=False, from_inventory=False):
    ''' remove jinja2 template tags from a string

    Regex-driven variant: instead of visiting every character, it jumps
    from one delimiter match to the next and copies the text in between
    to the output buffer in slices.

    Matched ``{% ... %}`` pairs are always rewritten to ``{# ... #}``.
    Matched ``{{ ... }}`` pairs are rewritten only when ``from_remote`` is
    true, or when ``from_inventory`` is true and the data contains both
    ``{{`` and a lookup call (see LOOKUP_REGEX).

    :arg orig_data: the value to clean; anything that is not a
        ``basestring`` is returned unchanged
    :kwarg from_remote: the data came from a remote host
    :kwarg from_inventory: the data came from inventory
    :returns: the cleaned string
    '''

    if not isinstance(orig_data, basestring):
        return orig_data

    # when the data is marked as having come from a remote, we always
    # replace any print blocks (ie. {{var}}), however when marked as coming
    # from inventory we only replace print blocks that contain a call to
    # a lookup plugin (ie. {{lookup('foo','bar'))}})
    replace_prints = from_remote or (from_inventory and '{{' in orig_data and LOOKUP_REGEX.search(orig_data) is not None)

    # when print blocks are left alone the scan only needs to stop at
    # statement block delimiters
    regex = PRINT_CODE_REGEX if replace_prints else ONLY_CODE_REGEX

    with contextlib.closing(cStringIO.StringIO()) as data:
        # these variables keep track of opening block locations, as we only
        # want to replace matched pairs of print/block tags
        last_pos = 0
        print_openings = []
        block_openings = []

        for mo in regex.finditer(orig_data):
            token = mo.group(0)
            token_start = mo.start(0)
            token_end = mo.end(0)

            if token[0] == '{':
                # opening delimiter: remember where it sits, then copy
                # everything up to and including it
                if token == '{%':
                    block_openings.append(token_start)
                elif token == '{{':
                    print_openings.append(token_start)
                data.write(orig_data[last_pos:token_end])
            elif token[1] == '}':
                # closing delimiter: pair it with the most recent opening
                # of the same kind, if there is one
                prev_idx = None
                if token == '%}' and block_openings:
                    prev_idx = block_openings.pop()
                elif token == '}}' and print_openings:
                    prev_idx = print_openings.pop()

                # copy the text preceding the closing delimiter
                data.write(orig_data[last_pos:token_start])

                if prev_idx is not None:
                    # replace the opening; same width as the original
                    # token, so recorded offsets stay valid
                    data.seek(prev_idx, os.SEEK_SET)
                    data.write('{#')
                    # replace the closing
                    data.seek(0, os.SEEK_END)
                    data.write('#}')
                else:
                    data.write(token)
            else:
                # both patterns only produce tokens handled above; raise
                # explicitly so the guard is not stripped under -O
                raise AssertionError('Unhandled regex match')

            last_pos = token_end

        # copy whatever follows the final delimiter
        data.write(orig_data[last_pos:])
        # must be read before the buffer is closed on leaving the block
        return data.getvalue()
def generate_test_strings():
    ''' yield (label, string) pairs used as benchmark input

    Each string is built from a 20-character pattern made of four
    5-character slots, every slot being either the print block ``{{ }}``
    or plain spaces.  Five patterns are produced, going from four print
    blocks down to none, and each one is repeated 1, 10, 100 and 1000
    times.

    The label is ``i / 4.0`` where ``i`` is the pattern's position in that
    sequence, i.e. the fraction of slots that are whitespace: 0.0 means
    every slot is a print block and 1.0 means the string is all spaces.
    '''

    template = '{{ }}'
    whitespace = ' ' * len(template)

    # ordered from all print blocks to all whitespace
    templates = (
        template * 4,
        (template * 3) + whitespace,
        (template + whitespace) * 2,
        template + (whitespace * 3),
        whitespace * 4,
    )

    for tens in range(0, 4):
        length = 10 ** tens
        for i, pattern in enumerate(templates):
            yield i / 4.0, pattern * length
import timeit | |
to_bench = { | |
'orig': _clean_data, | |
'orig cs': _clean_data_cstringio, | |
'regex': _regex, | |
} | |
tests = { | |
'none ': dict(from_remote=False, from_inventory=False), | |
'remote ': dict(from_remote=False, from_inventory=False), | |
'inventory': dict(from_remote=False, from_inventory=False), | |
'both ': dict(from_remote=False, from_inventory=False) | |
} | |
name_padding = max(len(s) for s in to_bench.keys()) | |
for t in generate_test_strings(): | |
test_string = intern(t[1]) | |
print 'length:', len(t[1]), 'density:', t[0] | |
for test, kwargs in tests.iteritems(): | |
for name, fn in to_bench.iteritems(): | |
fn(t, **kwargs) | |
print name.ljust(name_padding), test, '{:.8f}'.format(min(timeit.repeat( | |
stmt=lambda: fn(test_string, **kwargs), | |
number=1000, | |
repeat=20))) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
length: 20 density: 0.0 | |
regex none 0.00233603 | |
orig cs none 0.01116395 | |
orig none 0.03076696 | |
regex inventory 0.00235701 | |
orig cs inventory 0.01127720 | |
orig inventory 0.03086901 | |
regex both 0.00232005 | |
orig cs both 0.01120687 | |
orig both 0.03084397 | |
regex remote 0.00232410 | |
orig cs remote 0.01129794 | |
orig remote 0.03086996 | |
length: 20 density: 0.25 | |
regex none 0.00231910 | |
orig cs none 0.01046991 | |
orig none 0.03019309 | |
regex inventory 0.00233412 | |
orig cs inventory 0.01054406 | |
orig inventory 0.03022504 | |
regex both 0.00233984 | |
orig cs both 0.01054120 | |
orig both 0.03017092 | |
regex remote 0.00230885 | |
orig cs remote 0.01054597 | |
orig remote 0.03020096 | |
length: 20 density: 0.5 | |
regex none 0.00226808 | |
orig cs none 0.00974989 | |
orig none 0.02947187 | |
regex inventory 0.00223994 | |
orig cs inventory 0.00973392 | |
orig inventory 0.02951097 | |
regex both 0.00225091 | |
orig cs both 0.00973988 | |
orig both 0.02943206 | |
regex remote 0.00227809 | |
orig cs remote 0.00979209 | |
orig remote 0.02976108 | |
length: 20 density: 0.75 | |
regex none 0.00228810 | |
orig cs none 0.00916791 | |
orig none 0.02876782 | |
regex inventory 0.00223589 | |
orig cs inventory 0.00901389 | |
orig inventory 0.02870798 | |
regex both 0.00226593 | |
orig cs both 0.00899220 | |
orig both 0.02879286 | |
regex remote 0.00224280 | |
orig cs remote 0.00903201 | |
orig remote 0.02879405 | |
length: 20 density: 1.0 | |
regex none 0.00212598 | |
orig cs none 0.00824499 | |
orig none 0.02791786 | |
regex inventory 0.00211096 | |
orig cs inventory 0.00822306 | |
orig inventory 0.02799916 | |
regex both 0.00214314 | |
orig cs both 0.00822902 | |
orig both 0.02803493 | |
regex remote 0.00210404 | |
orig cs remote 0.00870681 | |
orig remote 0.02804685 | |
length: 200 density: 0.0 | |
regex none 0.00437117 | |
orig cs none 0.10380411 | |
orig none 0.29348207 | |
regex inventory 0.00436783 | |
orig cs inventory 0.10435104 | |
orig inventory 0.29324698 | |
regex both 0.00440097 | |
orig cs both 0.10405898 | |
orig both 0.29357386 | |
regex remote 0.00437903 | |
orig cs remote 0.10408211 | |
orig remote 0.29358387 | |
length: 200 density: 0.25 | |
regex none 0.00419116 | |
orig cs none 0.09632802 | |
orig none 0.28626513 | |
regex inventory 0.00419402 | |
orig cs inventory 0.09633589 | |
orig inventory 0.28691602 | |
regex both 0.00421095 | |
orig cs both 0.09631491 | |
orig both 0.28645802 | |
regex remote 0.00419688 | |
orig cs remote 0.09635997 | |
orig remote 0.28712702 | |
length: 200 density: 0.5 | |
regex none 0.00368094 | |
orig cs none 0.08877921 | |
orig none 0.27943301 | |
regex inventory 0.00369215 | |
orig cs inventory 0.08880091 | |
orig inventory 0.27882290 | |
regex both 0.00374818 | |
orig cs both 0.08862686 | |
orig both 0.30531192 | |
regex remote 0.00372386 | |
orig cs remote 0.10351205 | |
orig remote 0.32544017 | |
length: 200 density: 0.75 | |
regex none 0.00362611 | |
orig cs none 0.09516382 | |
orig none 0.32432103 | |
regex inventory 0.00345397 | |
orig cs inventory 0.09414315 | |
orig inventory 0.27665305 | |
regex both 0.00341392 | |
orig cs both 0.08206987 | |
orig both 0.27330804 | |
regex remote 0.00338101 | |
orig cs remote 0.08216286 | |
orig remote 0.27378011 | |
length: 200 density: 1.0 | |
regex none 0.00282884 | |
orig cs none 0.07388496 | |
orig none 0.26605392 | |
regex inventory 0.00289392 | |
orig cs inventory 0.07474899 | |
orig inventory 0.26591420 | |
regex both 0.00286889 | |
orig cs both 0.07398915 | |
orig both 0.26416993 | |
regex remote 0.00287604 | |
orig cs remote 0.07397103 | |
orig remote 0.26444697 | |
length: 2000 density: 0.0 | |
regex none 0.02651596 | |
orig cs none 1.04446483 | |
orig none 3.16794777 | |
regex inventory 0.02648711 | |
orig cs inventory 1.04291296 | |
orig inventory 3.17617607 | |
regex both 0.02652907 | |
orig cs both 1.04543400 | |
orig both 3.17354608 | |
regex remote 0.02650285 | |
orig cs remote 1.04383111 | |
orig remote 3.17507601 | |
length: 2000 density: 0.25 | |
regex none 0.02319193 | |
orig cs none 0.97022009 | |
orig none 3.08056402 | |
regex inventory 0.02326107 | |
orig cs inventory 0.97225213 | |
orig inventory 3.09071088 | |
regex both 0.02322698 | |
orig cs both 0.97343111 | |
orig both 3.08821392 | |
regex remote 0.02319694 | |
orig cs remote 0.97419095 | |
orig remote 3.09205699 | |
length: 2000 density: 0.5 | |
regex none 0.01878405 | |
orig cs none 0.89367700 | |
orig none 3.02064180 | |
regex inventory 0.01838994 | |
orig cs inventory 0.89543295 | |
orig inventory 3.01873088 | |
regex both 0.01818085 | |
orig cs both 0.89289713 | |
orig both 3.01739812 | |
regex remote 0.01843214 | |
orig cs remote 0.89358997 | |
orig remote 3.01982999 | |
length: 2000 density: 0.75 | |
regex none 0.01398683 | |
orig cs none 0.81412983 | |
orig none 2.92859387 | |
regex inventory 0.01400113 | |
orig cs inventory 0.81458783 | |
orig inventory 2.93060899 | |
regex both 0.01399398 | |
orig cs both 0.81417394 | |
orig both 2.92725301 | |
regex remote 0.01393986 | |
orig cs remote 0.81492209 | |
orig remote 2.92646503 | |
length: 2000 density: 1.0 | |
regex none 0.00927281 | |
orig cs none 0.73526096 | |
orig none 2.84585190 | |
regex inventory 0.00925303 | |
orig cs inventory 0.73610306 | |
orig inventory 2.85367703 | |
regex both 0.00924015 | |
orig cs both 0.73535395 | |
orig both 2.84879208 | |
regex remote 0.00921488 | |
orig cs remote 0.73591018 | |
orig remote 2.84457397 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment