Last active
January 27, 2021 16:46
-
-
Save dp-quant/9d4898cecb9448a209b440d35abdf4fa to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Count anagram duplicates among the words of a command-line string.

Usage: pass the text to analyse as the first command-line argument.
The text is split on single spaces, every token is stripped of non-word
characters, and the script prints how many distinct words are an anagram
of another distinct word already seen (group size minus one, summed over
all anagram groups).
"""
import sys
import re
from collections import defaultdict

# Compiled once instead of on every loop iteration; raw string so that
# ``\W`` is a regex class rather than an invalid string escape.
_NON_WORD = re.compile(r'\W+')


def count_anagrams(tokens):
    """Return the number of surplus anagrams found in *tokens*.

    Each token is cleaned of non-word characters. Tokens whose cleaned
    form has at most one distinct character (empty, one letter, or a
    repeated letter such as ``"aaa"``) are ignored, and a cleaned word is
    counted only the first time it appears. Words are grouped by their
    sorted characters; every group contributes ``size - 1`` to the total.
    Comparison is case-sensitive.

    Args:
        tokens: iterable of strings (raw words, punctuation allowed).

    Returns:
        int: sum over all anagram groups of ``group size - 1``.
    """
    seq_counter = defaultdict(int)
    unique_tokens = set()

    for raw in tokens:
        token = _NON_WORD.sub('', raw)
        # Skip tokens that cannot form a meaningful anagram.
        if len(set(token)) <= 1:
            continue
        # An exact repeat of a word is not a new anagram; count it once.
        if token in unique_tokens:
            continue
        unique_tokens.add(token)
        # Sorted characters are the canonical key shared by all anagrams.
        seq_counter[tuple(sorted(token))] += 1

    return sum(count - 1 for count in seq_counter.values())


def main(argv=None):
    """Run the command-line tool and return a process exit status.

    Args:
        argv: argument vector; defaults to ``sys.argv``. ``argv[1]`` is
            the string to analyse.

    Returns:
        int: ``0`` on success, ``1`` when the input string is missing.
    """
    argv = sys.argv if argv is None else argv
    try:
        testing_string = argv[1]
    except IndexError:
        # Stdout stays empty as before; the hint goes to stderr only.
        print('usage: pass the string to analyse as the first argument',
              file=sys.stderr)
        return 1

    print('Initial String: ', testing_string)

    # Tokenize on single spaces (consecutive spaces yield empty tokens,
    # which count_anagrams ignores).
    tokens = testing_string.split(' ')
    print('Received tokens: ', tokens)

    print('Result:', count_anagrams(tokens))
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment