mbrgm · December 20, 2015 22:59
diff --git a/tokenize.py b/tokenize.py
 #!/usr/bin/env python

 import argparse
 import codecs
 import nltk

 def main():
    """Tokenize a text file with NLTK and write or print the tokens.

    Command-line arguments (parsed by argparse):
      <file>            input text file to tokenize
      <output_file>     optional output file; when omitted, the joined
                        tokens are printed instead
      -e <encoding>     encoding used to decode the input file and, when an
                        output file is given, to encode it
                        (default: windows-1252)
      --no-punctuation  tokenize with nltk.word_tokenize instead of
                        nltk.wordpunct_tokenize
      -s <separator>    string placed between tokens (default: '\\r\\n')
    """
    # Parse arguments
    parser = argparse.ArgumentParser(description='Tokenize an input file.')
    parser.add_argument(dest='input_path', metavar='<file>', help='Input text file')
    parser.add_argument(dest='output_path', metavar='<output_file>', help='Output file',
            nargs='?')
    parser.add_argument('-e', dest='encoding', metavar='<encoding>',
            default='windows-1252',
            help='input file encoding (windows-1252 by default)')
    parser.add_argument('--no-punctuation', dest='no_punctuation', action='store_true',
            help='don\'t treat all punctuation as separate lemmas')
    # The help text previously rendered as "newline bydefault" because the
    # two concatenated literals had no space between them.
    parser.add_argument('-s', dest='separator', metavar='<separator>',
            default='\r\n',
            help='string for separating the tokens (newline by default)')
    args = parser.parse_args()

    # Read the input file with the requested encoding; the context manager
    # closes the handle even if decoding fails part-way through.
    with codecs.open(args.input_path, encoding=args.encoding) as infile:
        contents = infile.read()

    # Tokenize the file contents
    if args.no_punctuation:
        tokens = nltk.word_tokenize(contents)
    else:
        tokens = nltk.wordpunct_tokenize(contents)

    # Output the tokens
    output = args.separator.join(tokens)
    if args.output_path is None:
        # The parenthesised single-argument form behaves the same as the
        # Python 2 print statement and is also valid on Python 3.
        print(output)
    else:
        # The output file is written with the same encoding as the input.
        with codecs.open(args.output_path, 'w', args.encoding) as outfile:
            outfile.write(output)

 # Script entry point: run the tokenizer only when this file is executed
 # directly, not when it is imported as a module.
 if '__main__' == __name__:
    main()
	#!/usr/bin/env python

	import argparse
	import codecs
	import nltk

	def main():
	    """Tokenize a text file with NLTK and write or print the tokens.

	    Command-line arguments (parsed by argparse):
	      <file>            input text file to tokenize
	      <output_file>     optional output file; when omitted, the joined
	                        tokens are printed instead
	      -e <encoding>     encoding used to decode the input file and, when an
	                        output file is given, to encode it
	                        (default: windows-1252)
	      --no-punctuation  tokenize with nltk.word_tokenize instead of
	                        nltk.wordpunct_tokenize
	      -s <separator>    string placed between tokens (default: '\\r\\n')
	    """
	    # Parse arguments
	    parser = argparse.ArgumentParser(description='Tokenize an input file.')
	    parser.add_argument(dest='input_path', metavar='<file>', help='Input text file')
	    parser.add_argument(dest='output_path', metavar='<output_file>', help='Output file',
	            nargs='?')
	    parser.add_argument('-e', dest='encoding', metavar='<encoding>',
	            default='windows-1252',
	            help='input file encoding (windows-1252 by default)')
	    parser.add_argument('--no-punctuation', dest='no_punctuation', action='store_true',
	            help='don\'t treat all punctuation as separate lemmas')
	    # The help text previously rendered as "newline bydefault" because the
	    # two concatenated literals had no space between them.
	    parser.add_argument('-s', dest='separator', metavar='<separator>',
	            default='\r\n',
	            help='string for separating the tokens (newline by default)')
	    args = parser.parse_args()

	    # Read the input file with the requested encoding; the context manager
	    # closes the handle even if decoding fails part-way through.
	    with codecs.open(args.input_path, encoding=args.encoding) as infile:
	        contents = infile.read()

	    # Tokenize the file contents
	    if args.no_punctuation:
	        tokens = nltk.word_tokenize(contents)
	    else:
	        tokens = nltk.wordpunct_tokenize(contents)

	    # Output the tokens
	    output = args.separator.join(tokens)
	    if args.output_path is None:
	        # The parenthesised single-argument form behaves the same as the
	        # Python 2 print statement and is also valid on Python 3.
	        print(output)
	    else:
	        # The output file is written with the same encoding as the input.
	        with codecs.open(args.output_path, 'w', args.encoding) as outfile:
	            outfile.write(output)

	# Script entry point: run the tokenizer only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    main()