zxygentoo · April 8, 2022 12:20
diff --git a/chinese_word_freq.py b/chinese_word_freq.py
 #! python3
 # -*- coding: utf-8 -*-

 from collections import Counter
 import sys
 import os
 import codecs
 import getopt
 import jieba


 def help_then_exit():
    """Show the one-line usage hint on stdout and terminate the process.

    Termination goes through ``sys.exit()`` with no explicit status, so
    this function never returns to its caller.
    """

    usage = 'freq.py -i <inputfile>'
    print(usage)
    sys.exit()


 def get_filename(argv):
    """Extract the input file path from the command-line arguments.

    Only the first parsed option is ever consulted: if it is ``-i`` or
    ``--input_file`` its value is returned. In every other situation
    (unparseable arguments, no options at all, or a different first option
    such as ``-h``) the usage text is shown via ``help_then_exit``.

    Args:
        argv: Argument list without the program name.

    Returns:
        The value supplied to ``-i`` / ``--input_file``.
    """

    try:
        parsed_opts, _positional = getopt.getopt(argv, "hi:", ["input_file="])

    except getopt.GetoptError:
        help_then_exit()

    else:
        # Either outcome below ends the function, so looking at the first
        # option is all that is needed.
        first = parsed_opts[0] if parsed_opts else None
        if first is not None and first[0] in ("-i", "--input_file"):
            return first[1]

        help_then_exit()
    

 def get_text(filename):
    """Return the entire content of *filename*, decoded as UTF-8.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the whole file as a single string.
    """

    reader = codecs.open(filename, mode='r', encoding='utf8')
    with reader:
        content = reader.read()
    return content


 def segment_words(text):
    """Split *text* into words using jieba.

    Args:
        text: The string to segment.

    Returns:
        Whatever ``jieba.cut(text)`` produces: an iterable of word strings.
        NOTE(review): per jieba's documentation this is a lazy generator
        rather than a list; confirm before iterating over it twice.
    """

    words = jieba.cut(text)
    return words


 def calculate_threshold(text_length):
    """Derive the minimum occurrence count worth reporting.

    Args:
        text_length: Length of the analysed text.

    Returns:
        5 for texts shorter than 100000, otherwise the text length divided
        by 10000 and truncated to an int.
    """

    if text_length < 100000:
        return 5

    return int(text_length / 10000)


 def count_words(word_list):
    """Tally how often each multi-character word occurs.

    Args:
        word_list: Iterable of word strings.

    Returns:
        A ``collections.Counter`` mapping every word longer than one
        character to its number of occurrences. Single-character tokens
        are skipped entirely and never become keys of the result.
    """

    # Filter before counting: assigning a single-character token its own
    # (zero) count would still create a key for it in the Counter.
    return Counter(word for word in word_list if len(word) > 1)


 def filter_word_dict_on_threshold(word_dict, threshold):
    """Keep only the words that occur at least *threshold* times.

    Args:
        word_dict: Object exposing ``most_common()`` (a Counter of words).
        threshold: Minimum occurrence count, inclusive.

    Returns:
        A list of ``(word, count)`` tuples in ``most_common()`` order.
    """

    kept = []

    for word, count in word_dict.most_common():
        if count >= threshold:
            kept.append((word, count))

    return kept


 def print_result(word_dict):
    """Write one ``rank<TAB><TAB>word<TAB><TAB>count`` line per entry.

    Args:
        word_dict: Iterable of ``(word, count)`` pairs; despite the name
            it is iterated as a sequence of pairs, not as a mapping.
            Ranks start at 1 and follow the iteration order.
    """

    for rank, (word, count) in enumerate(word_dict, 1):
        line = '%d\t\t%s\t\t%d' % (rank, word, count)
        print(line)


 def main(argv):
    """Run the pipeline: read the file, count its words, print the ranking.

    Args:
        argv: Command-line arguments without the program name.
    """

    input_path = get_filename(argv)
    text = get_text(input_path)

    word_counts = count_words(segment_words(text))
    # The threshold is derived from the length of the raw text.
    min_occurrences = calculate_threshold(len(text))

    frequent_words = filter_word_dict_on_threshold(
        word_counts,
        min_occurrences
    )
    print_result(frequent_words)


 # Script entry point: when this file is executed directly (rather than
 # imported), run main() with the command-line arguments that follow the
 # program name.
 if __name__ == "__main__":

   main(sys.argv[1:])
	#! python3
	# -*- coding: utf-8 -*-

	from collections import Counter
	import sys
	import os
	import codecs
	import getopt
	import jieba


	def help_then_exit():
	    """Print the usage message to stdout, then exit.

	    Exits through ``sys.exit()`` with no explicit status, so this
	    function never returns to its caller.
	    """

	    print('freq.py -i <inputfile>')
	    sys.exit()


	def get_filename(argv):
	    """Get the input filename from the command-line arguments.

	    Args:
	        argv: Argument list without the program name.

	    Returns:
	        The value given to ``-i`` / ``--input_file`` when that is the
	        first parsed option. In every other case (bad arguments, no
	        options, or another first option such as ``-h``) the usage
	        message is shown through ``help_then_exit``.
	    """

	    try:
	        opts, _args = getopt.getopt(argv, "hi:", ["input_file="])

	    except getopt.GetoptError:
	        help_then_exit()

	    else:
	        for opt, arg in opts:
	            if opt in ("-i", "--input_file"):
	                return arg
	            else:
	                help_then_exit()
	        else:
	            # Reached only when no options were parsed at all.
	            help_then_exit()


	def get_text(filename):
	    """Open *filename*, read it as UTF-8 and return its full content.

	    Args:
	        filename: Path of the file to read.

	    Returns:
	        The decoded text of the whole file as a single string.
	    """

	    with codecs.open(filename, 'r', 'utf8') as f:
	        return f.read()


	def segment_words(text):
	    """Segment a text string into words with jieba.

	    Args:
	        text: The string to segment.

	    Returns:
	        The result of ``jieba.cut(text)``: an iterable of word strings.
	        NOTE(review): jieba documents this as a generator, not a list;
	        confirm before relying on repeated iteration.
	    """

	    return jieba.cut(text)


	def calculate_threshold(text_length):
	    """Calculate the useful word-occurrence threshold from text length.

	    Args:
	        text_length: Length of the analysed text.

	    Returns:
	        5 when the text is shorter than 100000, otherwise the length
	        divided by 10000, truncated to an int.
	    """

	    return 5 if text_length < 100000 else int(text_length / 10000)


	def count_words(word_list):
	    """Count the occurrences of each multi-character word.

	    Args:
	        word_list: Iterable of word strings.

	    Returns:
	        A ``collections.Counter`` mapping each word longer than one
	        character to its occurrence count. Single-character tokens are
	        left out of the result completely.
	    """

	    # Filtering first keeps single-character tokens from becoming
	    # zero-count keys in the Counter.
	    return Counter(word for word in word_list if len(word) > 1)


	def filter_word_dict_on_threshold(word_dict, threshold):
	    """Select the words whose count is at least *threshold*.

	    Args:
	        word_dict: Object exposing ``most_common()`` (a Counter of words).
	        threshold: Minimum occurrence count, inclusive.

	    Returns:
	        A list of ``(word, count)`` tuples in ``most_common()`` order.
	    """

	    return [
	        (word, count)
	        for word, count in word_dict.most_common()
	        if count >= threshold
	    ]


	def print_result(word_dict):
	    """Print the result in rank/word/occurrence format.

	    Args:
	        word_dict: Iterable of ``(word, count)`` pairs. Each pair is
	            printed on its own line with a 1-based rank, the fields
	            separated by two tab characters.
	    """

	    for index, (word, count) in enumerate(word_dict):
	        print('%d\t\t%s\t\t%d' % (index + 1, word, count))


	def main(argv):
	    """Read the input file, count its words and print the ranking.

	    Args:
	        argv: Command-line arguments without the program name.
	    """

	    text = get_text(get_filename(argv))
	    print_result(
	        filter_word_dict_on_threshold(
	            count_words(segment_words(text)),
	            # The threshold is derived from the length of the raw text.
	            calculate_threshold(len(text))
	        )
	    )


	# Script entry point: run main() with the arguments after the program
	# name, but only when this file is executed directly.
	if __name__ == "__main__":
	    main(sys.argv[1:])