Created
October 18, 2020 14:34
-
-
Save meme-lord/8f8ba6797691818f71df4e452a810f72 to your computer and use it in GitHub Desktop.
Script to search one file for a large number of terms listed in another file. Much faster than grep -f or sift -f.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to run a lot of search terms / regexes against a file,
# or to turn a list of terms into a single regex.
# This is faster than grep's or sift's -f option as of October 2020.
# Usage: python3 <this_script> input.txt searchfile.txt
import re | |
def add_to_dict(word, the_dict):
    """Insert ``word`` into the character trie ``the_dict`` in place.

    Every character of ``word`` becomes one level of nested dicts keyed by
    that character. The node reached after the last character receives the
    marker entry ``'END': True``, so a term that is a prefix of a longer
    term is still recorded as a complete term.

    Args:
        word: The term to insert. An empty term is ignored.
        the_dict: The trie (nested dicts) to update; it is mutated.

    Returns:
        None.
    """
    if not word:
        return
    node = the_dict
    # Walk the trie iteratively instead of recursing on word[1:]: one
    # recursion level per character hits the interpreter's recursion limit
    # on long terms, and re-slicing the word at each level is quadratic.
    for char in word:
        node = node.setdefault(char, {})
    node['END'] = True
def collapser(the_dict):
    """Collapse a character trie into a regular-expression source string.

    The trie is the nested-dict structure in which each key is a single
    character and the key ``'END'`` marks that a term finishes at that
    node. Sibling characters become ``|`` alternatives, and a node carrying
    ``'END'`` makes everything after it optional with ``(...)?``.

    Characters are copied into the pattern verbatim; nothing is escaped.
    The function recurses once per trie level, so the recursion depth
    equals the length of the longest stored term.

    Args:
        the_dict: Trie node to collapse. It is only read, never modified,
            so the same trie can be collapsed more than once.

    Returns:
        The regex source for this node, or ``''`` when it has no children.
    """
    # Skip the 'END' marker rather than popping it: removing it would change
    # the caller's trie and make a second call return a different pattern.
    children = [key for key in the_dict if key != 'END']
    if not children:
        return ''
    out = '|'.join(key + collapser(the_dict[key]) for key in children)
    if 'END' in the_dict:
        # A term ends here, so whatever follows is optional.
        return f'({out})?'
    if len(children) == 1:
        # A single continuation needs no grouping.
        return out
    return f'({out})'
def list2regex(input_list):
    """Build one regex source string from an iterable of terms.

    Each term is inserted into a fresh trie with ``add_to_dict`` and the
    finished trie is turned into a pattern string by ``collapser``.

    Args:
        input_list: Iterable of terms to combine.

    Returns:
        Whatever ``collapser`` returns for the trie built from the terms.
    """
    trie = {}
    for term in input_list:
        add_to_dict(term, trie)
    regex_source = collapser(trie)
    return regex_source
if __name__ == "__main__":
    # Command-line entry point: read one term per line from the first file,
    # merge the terms into a single regex, and print every line of the
    # second file that the regex matches.
    import sys

    if len(sys.argv) != 3:
        # sys.exit rather than the bare ``exit`` helper, which is only
        # installed by the ``site`` module and is meant for interactive use.
        sys.exit(f"Usage: python3 {sys.argv[0]} input.txt filetosearch.txt")
    terms_file = sys.argv[1]
    search_file = sys.argv[2]

    pattern_dict = {}
    with open(terms_file) as f:
        for line in f:
            # Trailing whitespace (including the newline) is not part of the
            # term; lines that end up empty are ignored by add_to_dict.
            add_to_dict(line.rstrip(), pattern_dict)

    pattern_string = collapser(pattern_dict)
    if not pattern_string:
        # An empty pattern would match every line of the searched file, so
        # stop instead of echoing the whole file back.
        sys.exit(f"No search terms found in {terms_file}")
    pattern = re.compile(pattern_string)

    with open(search_file) as f:
        for line in f:
            if pattern.search(line):
                print(line.rstrip())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment