Skip to content

Instantly share code, notes, and snippets.

@mbrgm
Last active December 20, 2015 22:59
Show Gist options
  • Save mbrgm/6209148 to your computer and use it in GitHub Desktop.
Save mbrgm/6209148 to your computer and use it in GitHub Desktop.
Extracts tokens from an input file for further natural language processing.
#!/usr/bin/env python
import argparse
import codecs
import nltk
def _build_parser():
    """Create the command-line argument parser for the tokenizer script."""
    parser = argparse.ArgumentParser(description='Tokenize an input file.')
    parser.add_argument(dest='input_path', metavar='<file>',
                        help='Input text file')
    parser.add_argument(dest='output_path', metavar='<output_file>',
                        nargs='?', help='Output file')
    parser.add_argument('-e', dest='encoding', metavar='<encoding>',
                        default='windows-1252',
                        help='input file encoding (windows-1252 by default)')
    parser.add_argument('--no-punctuation', dest='no_punctuation',
                        action='store_true',
                        help="don't treat all punctuation as separate lemmas")
    parser.add_argument('-s', dest='separator', metavar='<separator>',
                        default='\r\n',
                        help='string for separating the tokens '
                             '(newline by default)')
    return parser


def main(argv=None):
    """Tokenize a text file and emit the tokens joined by a separator.

    The input file is decoded with the encoding given by ``-e``. Its contents
    are tokenized with ``nltk.word_tokenize`` when ``--no-punctuation`` is
    set, otherwise with ``nltk.wordpunct_tokenize``. The tokens are joined
    with the ``-s`` separator and either printed to standard output or, when
    an output path is given, written to that file using the same encoding as
    the input.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When ``None``, argparse falls back to ``sys.argv[1:]``,
            so calling ``main()`` behaves as before.
    """
    args = _build_parser().parse_args(argv)

    # codecs.open (rather than io.open) is kept on purpose: it performs no
    # newline translation, so the separator is written out exactly as given.
    with codecs.open(args.input_path, encoding=args.encoding) as source:
        contents = source.read()

    # Choose the tokenizer according to the --no-punctuation flag.
    if args.no_punctuation:
        tokens = nltk.word_tokenize(contents)
    else:
        tokens = nltk.wordpunct_tokenize(contents)

    output = args.separator.join(tokens)
    if args.output_path is None:
        # Single-argument call form works as both the Python 2 print
        # statement and the Python 3 print function.
        print(output)
    else:
        with codecs.open(args.output_path, 'w', args.encoding) as sink:
            sink.write(output)
# Run the tokenizer only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment