Last active
November 3, 2020 09:19
-
-
Save faustomorales/65f5d523aa6212766ff2fd2a4ade563d to your computer and use it in GitHub Desktop.
Compare PDQ Hash performance using pure Python versus bindings to C++ implementation. Results show 262ms for pure Python and 7ms for the bindings.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Before running this script, you must clone the ThreatExchange
# repo (containing the pure Python implementation)
# and install the pdqhash package (containing the C bindings).
# The script also imports numpy and cv2 (OpenCV), so those must be
# installed as well.
# You can get the first two by executing the following at the command line:
#   git clone https://github.com/facebook/ThreatExchange.git
#   pip install pdqhash
import urllib.request
import timeit
import sys
import os

# Make sure Python can find the pure Python implementation.
# The path is resolved against the current working directory, so run this
# script from the directory into which ThreatExchange was cloned.
sys.path.insert(0, os.path.abspath('ThreatExchange/hashing/pdq/python'))

# Import the bindings package as well as the
# pure Python implementation module. The pdqhashing import has to stay
# below the sys.path change above, otherwise the module cannot be found.
import pdqhashing.hasher.pdq_hasher as pdqpython
import pdqhash as pdqcython
import numpy as np
import cv2
# Download an example image.
# Wikimedia asks clients to identify themselves with a descriptive
# User-Agent and may reject urllib's default one, so the header is set
# explicitly. The timeout keeps the script from hanging on a stalled
# connection.
example_request = urllib.request.Request(
    "https://commons.wikimedia.org/w/thumb.php?f=Actinoscyphia_aurelia_1.jpg&w=256",
    headers={"User-Agent": "pdqhash-benchmark-script/1.0 (Python urllib)"},
)
with urllib.request.urlopen(example_request, timeout=30) as response:
    example_bytes = response.read()
# Write only after the whole body was received, so a failed download
# never leaves a truncated example.jpg behind.
with open("example.jpg", "wb") as example_file:
    example_file.write(example_bytes)
# We want an apples to apples comparison so
# we include I/O as part of the test for both
# approaches.
# The pure Python hasher is created once here and reused by every call,
# so constructing it is not part of the timed statement.
pyhasher = pdqpython.PDQHasher()
def pdq_using_cython_bindings(filepath):
    """Hash the image at *filepath* using the pdqhash bindings.

    The file is read with OpenCV, converted from BGR to RGB channel order,
    and handed to ``pdqcython.compute``.

    Args:
        filepath: Path of the image file to read and hash.

    Returns:
        Whatever ``pdqcython.compute`` returns; this script unpacks it as
        ``(hash_vector, quality)``.

    Raises:
        OSError: If OpenCV could not read an image from *filepath*.
    """
    image = cv2.imread(filepath)
    if image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # stop here with a clear message instead of failing inside cvtColor.
        raise OSError(f'Could not read an image from {filepath!r}')
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return pdqcython.compute(image)
def pdq_using_pure_python(filepath):
    """Hash the image at *filepath* using the pure Python PDQ hasher.

    Args:
        filepath: Path of the image file to hash.

    Returns:
        The result of ``pyhasher.fromFile(filepath)``; this script reads the
        hash and the quality from it via ``getHash()`` and ``getQuality()``.
    """
    return pyhasher.fromFile(filepath)
# Number of timed calls per implementation. timeit returns the total time
# for all calls in seconds, so dividing by n_runs gives the average
# seconds per call.
n_runs = 100

print('Running Python benchmark.')
time_python = timeit.timeit(
    stmt="pdq_using_pure_python('example.jpg')",
    globals=globals(),
    number=n_runs,
) / n_runs

print('Running Cython bindings benchmark.')
time_cython = timeit.timeit(
    stmt="pdq_using_cython_bindings('example.jpg')",
    globals=globals(),
    number=n_runs,
) / n_runs
# Here we make sure the output is identical for both the bindings and the
# pure Python implementation.
# The reshape operation is used to put the bits in the same order as
# that shown using:
#   ./pdq-photo-hasher example.jpg | \
#   cut -d',' -f1 | \
#   ./hashtool256 bits | tr ' ' ','
hqpython = pdq_using_pure_python('example.jpg')
vector_cython, quality_cython = pdq_using_cython_bindings('example.jpg')

# Parse the space-separated bit string into integers, view it as 16 rows
# of 16 bits, reverse the row order and flatten it back into one vector.
bits_python = np.array(hqpython.getHash().dumpBitsAcross().split(' ')).astype('int')
np.testing.assert_equal(
    vector_cython.astype('int'),
    bits_python.reshape(16, 16)[::-1].flatten(),
)

# A NumPy assertion is used instead of a bare `assert` so that the check
# is still performed when Python runs with -O.
np.testing.assert_equal(quality_cython, hqpython.getQuality())
# The averages are in seconds; multiply by 1000 to report whole milliseconds.
print(f'Running with pure Python implementation took {round(1000*time_python)}ms, on average.')
print(f'Running with Cython bindings took {round(1000*time_cython)}ms, on average.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment