Skip to content

Instantly share code, notes, and snippets.

@meme-lord
Created October 18, 2020 14:34
Show Gist options
  • Save meme-lord/8f8ba6797691818f71df4e452a810f72 to your computer and use it in GitHub Desktop.
Save meme-lord/8f8ba6797691818f71df4e452a810f72 to your computer and use it in GitHub Desktop.
Script to search one file for a large list of terms read from another file. Much faster than grep -f or sift -f.
# script to run a lot of search terms / regexes against a file
# or turn a list of terms into a regex
# this is faster than grep or sift's -f option as of October 2020
# usage: python3 <this_script>.py input.txt searchfile.txt
import re
def add_to_dict(word, the_dict):
    """Insert *word* into the character trie *the_dict*, in place.

    Each character of the word becomes a key one level deeper in the
    nested dicts; the node reached after the last character gets the
    marker ``'END': True`` so that shorter terms sharing a prefix with
    longer ones are still recorded.

    An empty *word* leaves the trie untouched.  Returns ``None``.
    """
    if not word:
        return
    # Walk down iteratively: a recursive descent would hit the interpreter
    # recursion limit on very long terms and re-slice the word at each step.
    node = the_dict
    for char in word:
        node = node.setdefault(char, {})
    node['END'] = True
def collapser(the_dict):
    """Render a character trie as a regular-expression string.

    *the_dict* maps single characters to child tries; a node carrying the
    key ``'END'`` marks the end of a complete term.  Characters are emitted
    verbatim (no escaping), so regex metacharacters in terms keep their
    regex meaning.

    The trie is only read, never modified, so the same trie can be
    collapsed repeatedly with identical results.

    Returns:
        ``''`` for a node with no children; the single branch unwrapped when
        there is exactly one child and no term ends here; ``(a|b)`` for
        several children; ``(...)?`` when a term ends at this node, making
        the continuation optional.
    """
    # Every key except the end-of-term marker is a child branch.
    branches = [
        char + collapser(child)
        for char, child in the_dict.items()
        if char != 'END'
    ]
    if not branches:
        return ''
    out = '|'.join(branches)
    if 'END' in the_dict:
        # A term stops here, so whatever follows must be optional.
        return f'({out})?'
    if len(branches) == 1:
        # A lone branch needs no grouping.
        return out
    return f'({out})'
def list2regex(input_list):
    """Turn an iterable of terms into one regex string.

    Every term is inserted into a fresh trie via ``add_to_dict`` and the
    trie is then rendered with ``collapser``.
    """
    trie = {}
    for term in input_list:
        add_to_dict(term, trie)
    regex = collapser(trie)
    return regex
if __name__ == "__main__":
    # Command-line entry point:
    #   argv[1] - file with one search term per line
    #   argv[2] - file whose lines are tested against the combined pattern
    # Matching lines are printed to stdout with trailing whitespace removed.
    import sys

    if len(sys.argv) != 3:
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module for interactive use and may be missing (python -S).
        sys.exit(f"Usage: python3 {sys.argv[0]} input.txt filetosearch.txt")
    terms_file = sys.argv[1]
    search_file = sys.argv[2]
    with open(terms_file) as f:
        # Trailing whitespace (including the newline) is stripped from each
        # term; lines that end up empty are ignored by add_to_dict.
        pattern_string = list2regex(line.rstrip() for line in f)
    # Compile once, outside the search loop.
    pattern = re.compile(pattern_string)
    with open(search_file) as f:
        for line in f:
            if pattern.search(line):
                print(line.rstrip())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment