Skip to content

Instantly share code, notes, and snippets.

@maciejgryka
Last active January 22, 2016 16:26
Show Gist options
  • Save maciejgryka/b7cb7b526aa054347e11 to your computer and use it in GitHub Desktop.
Save maciejgryka/b7cb7b526aa054347e11 to your computer and use it in GitHub Desktop.
from __future__ import print_function

import random
import time
import timeit

from sklearn.feature_extraction import DictVectorizer
def get_sample():
    """Build one synthetic sample for the benchmark.

    Returns a dict with the four keys 'a', 'b', 'c' and 'd', each mapped to
    its own ``random.random()`` draw.
    """
    # Keys are visited left to right, so the draws happen in the order
    # a, b, c, d.
    feature_names = ('a', 'b', 'c', 'd')
    return {name: random.random() for name in feature_names}
def get_x(n_samples=100000):
    """Generate the benchmark input: a list of ``n_samples`` sample dicts.

    Every element comes from a separate ``get_sample()`` call.
    """
    samples = []
    for _ in range(n_samples):
        samples.append(get_sample())
    return samples
def run_benchmark(n_samples=(10, 100, 1000, 10000, 100000, 500000)):
    """Time ``DictVectorizer().fit`` on inputs of increasing size.

    For every size in ``n_samples`` a fresh input is generated with
    ``get_x`` (outside the timed region), a new ``DictVectorizer`` is fitted
    on it, and the elapsed time is recorded.  Once all sizes have run, one
    line per size is printed: the size right-aligned to six characters,
    a colon, and the elapsed seconds with three decimals.

    Args:
        n_samples: iterable of input sizes to benchmark.  Defaults to the
            sizes that used to be hard-coded in this function.
    """
    # timeit.default_timer is the clock the timeit module itself uses for
    # benchmarking (time.perf_counter on Python 3).  Unlike time.time() it
    # is not the adjustable wall clock, so short fits are measured reliably.
    clock = timeit.default_timer

    results = []
    for ns in n_samples:
        X = get_x(ns)  # data generation is deliberately not timed
        start = clock()
        DictVectorizer().fit(X)
        elapsed = clock() - start
        results.append((ns, elapsed))

    # Print after all runs so console output does not interleave with timing.
    for ns, elapsed in results:
        print('%s: %.3f' % (str(ns).rjust(6, ' '), elapsed))
if __name__ == "__main__":
    # Run the benchmark only when executed as a script, not on import.
    run_benchmark()
@maciejgryka
Copy link
Author

Results without _validate_dictvectorizer_input (number of samples: seconds to fit):

    10: 0.000
   100: 0.000
  1000: 0.001
 10000: 0.009
100000: 0.096
500000: 0.481

and with _validate_dictvectorizer_input:

    10: 0.000
   100: 0.001
  1000: 0.006
 10000: 0.058
100000: 0.603
500000: 2.958

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment