Last active
September 28, 2020 16:31
-
-
Save rrr3try/489d7a0faf63b0449de4f8cacd99a04a to your computer and use it in GitHub Desktop.
fuzzywuzzy example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install fuzzywuzzy[speedup]
# pip install tqdm
from fuzzywuzzy import fuzz | |
# Similarity score a pair of texts must exceed to be treated as duplicates.
THRESHOLD = 90

# NOTE(review): `data` is not defined in the visible part of this file; it is
# presumably a collection of strings prepared earlier — confirm.
# Remember the input size so it can be reported after deduplication.
length = len(data)

# Drop exact duplicates while keeping first-seen order. A plain set() would
# hand the items back in arbitrary order (and, for strings, a different order
# on each run), which makes the later fuzzy pass hard to reproduce.
data = list(dict.fromkeys(data))
def compare_fuzzy(compare_text, data_full):
    """Return the index of the first entry in ``data_full`` similar to ``compare_text``.

    Two texts count as similar when ``fuzz.ratio`` of their lowercased first
    256 characters is strictly greater than ``THRESHOLD``.

    Args:
        compare_text: Text to look up.
        data_full: Iterable of candidate texts, scanned in order.

    Returns:
        The position in ``data_full`` of the first similar entry. Returning
        the lowest matching index (instead of an arbitrary element popped
        from a set of matches) makes the chosen representative deterministic
        and lets the scan stop at the first hit.

    Raises:
        KeyError: If no entry in ``data_full`` is similar to ``compare_text``.
    """
    # Only the lowercased prefix takes part in the comparison.
    needle = compare_text[:256].lower()
    for index, text in enumerate(data_full):
        if fuzz.ratio(text[:256].lower(), needle) > THRESHOLD:
            return index
    # Same exception type the original raised via pop() on an empty set.
    raise KeyError(f"no entry similar to {compare_text[:40]!r}")
# Map every text to the index compare_fuzzy reports for it; gathering the
# results in a set collapses texts that were assigned the same index.
# tqdm_notebook (expected to be in scope already) only wraps the iteration.
real_indexes = {
    compare_fuzzy(candidate, data) for candidate in tqdm_notebook(data)
}

# Keep only the texts that were chosen as representatives.
real_data = [data[position] for position in real_indexes]

# Report how many texts there were before and after deduplication.
print(f"before {length}|\n after {len(real_data)}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment