Last active
April 8, 2022 12:20
-
-
Save zxygentoo/aab3c7aea570df68ddc874e19890f777 to your computer and use it in GitHub Desktop.
使用 jieba 分词库的简单中文词频统计
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! python3
# -*- coding: utf-8 -*-
import codecs
import getopt
import os
import sys
from collections import Counter

import jieba
def help_then_exit():
    """Write the one-line usage summary to stdout and terminate.

    Termination goes through ``sys.exit()`` called without an argument,
    so control never returns to the caller.
    """
    usage = 'freq.py -i <inputfile>'
    print(usage)
    sys.exit()
def get_filename(argv):
    """Extract the input file path from the command-line arguments.

    Recognised options are ``-h``, ``-i <file>`` and
    ``--input_file <file>``.

    Args:
        argv: Argument list without the program name.

    Returns:
        The value given to the first ``-i`` / ``--input_file`` option
        encountered.  In every other situation (unparsable arguments,
        an option other than the input-file one, or no options at all)
        ``help_then_exit()`` is invoked instead.
    """
    input_flags = ("-i", "--input_file")

    try:
        options, _positional = getopt.getopt(argv, "hi:", ["input_file="])
    except getopt.GetoptError:
        # Unknown option or missing option argument.
        help_then_exit()
        return None

    for flag, value in options:
        if flag in input_flags:
            return value
        # Any other recognised option (i.e. -h) asks for the usage text.
        help_then_exit()

    # No input-file option was supplied.
    help_then_exit()
    return None
def get_text(filename):
    """Read a UTF-8 encoded file and return its whole content as a string.

    The built-in ``open`` is used instead of ``codecs.open``, which is
    deprecated in recent Python releases.  ``newline=''`` keeps line
    endings untranslated, which is what ``codecs.open`` (binary mode
    underneath) returned, so the text and its length are unchanged.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded file content.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    with open(filename, 'r', encoding='utf-8', newline='') as f:
        return f.read()
def segment_words(text):
    """Split a text string into word tokens using jieba.

    Args:
        text: The text to segment.

    Returns:
        Whatever ``jieba.cut(text)`` returns, passed through unchanged.
    """
    tokens = jieba.cut(text)
    return tokens
def calculate_threshold(text_length):
    """Derive the minimum occurrence count worth reporting.

    Args:
        text_length: Length of the analysed text.

    Returns:
        5 when the length is below 100000; otherwise the length divided
        by 10000, truncated to an integer.
    """
    if text_length < 100000:
        return 5
    return int(text_length / 10000)
def count_words(word_list):
    """Count occurrences of every multi-character word.

    Tokens of length 0 or 1 are skipped entirely.  (Assigning
    ``c[x] = c[x]`` for such tokens would insert them into the Counter
    with a count of 0, so they are filtered out before counting.)

    Args:
        word_list: Iterable of word strings.

    Returns:
        A ``Counter`` mapping each word longer than one character to
        its number of occurrences, keyed in first-seen order.
    """
    return Counter(word for word in word_list if len(word) > 1)
def filter_word_dict_on_threshold(word_dict, threshold):
    """Keep the words that occur at least ``threshold`` times.

    Args:
        word_dict: Object providing ``most_common()`` that yields
            ``(word, count)`` pairs (a ``Counter`` in this script).
        threshold: Minimum count, inclusive, for a word to be kept.

    Returns:
        A list of ``(word, count)`` tuples in the order produced by
        ``most_common()``.
    """
    kept = []
    for word, count in word_dict.most_common():
        if count >= threshold:
            kept.append((word, count))
    return kept
def print_result(word_dict):
    """Print one line per entry: rank, word and occurrence count.

    Args:
        word_dict: Iterable of ``(word, count)`` pairs; the rank printed
            for each pair is its 1-based position in the iteration.
    """
    for rank, entry in enumerate(word_dict, 1):
        word, count = entry
        line = '%d\t\t%s\t\t%d' % (rank, word, count)
        print(line)
def main(argv):
    """Run the whole word-frequency pipeline for the given arguments.

    Resolves the input file from ``argv``, reads it, segments and counts
    its words, drops words below a threshold computed from the text
    length, and prints the remaining words with rank and count.

    Args:
        argv: Command-line arguments without the program name.
    """
    input_path = get_filename(argv)
    text = get_text(input_path)
    frequencies = count_words(segment_words(text))
    threshold = calculate_threshold(len(text))
    frequent_words = filter_word_dict_on_threshold(frequencies, threshold)
    print_result(frequent_words)
# Script entry point: forward everything after the program name to main().
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    main(cli_args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment