-
-
Save drjwbaker/5b7e97705f5b6051f561 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Copy the lines of stuff.tsv that contain a listed word into output.tsv.

The words are read from mylist.txt (one per line).  A line of stuff.tsv is
kept when it contains at least one of the words as a whole word, compared
case-insensitively.  Kept lines are written to output.tsv in their original
order, each terminated by a newline.
"""
import re


def load_words(path):
    """Return the words listed in the file at *path*.

    Each line is stripped of surrounding whitespace and lower-cased.  Blank
    lines are skipped: an empty word would compile to the pattern ``\\b\\b``,
    which matches at every word boundary and would therefore select almost
    every line.  Duplicates are dropped (first occurrence wins) because only
    the first matching word is ever reported for a line.
    """
    words = []
    seen = set()
    with open(path) as word_list:
        for raw in word_list:
            word = raw.strip().lower()
            if word and word not in seen:
                seen.add(word)
                words.append(word)
    return words


def compile_patterns(words):
    """Return a list of ``(word, compiled_regex)`` pairs, one per word.

    Every word is escaped with ``re.escape`` so that it is matched literally;
    without this, characters such as ``.``, ``+`` or ``(`` in the word list
    would be interpreted as regex syntax and either match the wrong text or
    fail to compile.  The ``\\b`` anchors restrict matches to whole words.
    """
    return [
        (word, re.compile(r'\b{0}\b'.format(re.escape(word))))
        for word in words
    ]


def find_matches(lines, patterns):
    """Return ``(line, word)`` for each line matched by one of *patterns*.

    *lines* is an iterable of strings and *patterns* is the list produced by
    ``compile_patterns``.  Patterns are tried in order and the scan of a line
    stops at the first hit, so each line appears at most once in the result.
    Lines are returned unmodified; only a lower-cased copy is searched.
    """
    matches = []
    for line in lines:
        # Lower-case once per line rather than once per (line, word) pair.
        lowered = line.lower()
        for word, pattern in patterns:
            if pattern.search(lowered):
                matches.append((line, word))
                break
    return matches


def main(word_path='mylist.txt', tsv_path='stuff.tsv',
         output_path='output.tsv'):
    """Filter *tsv_path* by the words in *word_path* into *output_path*.

    A debug message is printed for every line that is kept.
    """
    patterns = compile_patterns(load_words(word_path))

    # Use a "with" block so the input stream is closed automatically.
    with open(tsv_path) as tsv:
        lines = list(tsv)

    output = []
    for line, word in find_matches(lines, patterns):
        # The last line of the input may lack a newline; add one so that
        # every record in the output file is terminated.
        if line.endswith('\n'):
            output.append(line)
        else:
            output.append('{0}\n'.format(line))
        # Write some debug output to the console.
        print('Found line {0} that matched word {1}'.format(line, word))

    with open(output_path, 'w') as output_file:
        output_file.writelines(output)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment