Created
November 25, 2019 03:03
-
-
Save edison7500/281ae54ea555e902599bbf606052de34 to your computer and use it in GitHub Desktop.
计算文本 tf-idf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math
from datetime import datetime, timedelta

from django.conf import settings
from django.core.management.base import BaseCommand

from apps.articles.models import Article

# Destination of the generated IDF file. Falls back to None when the Django
# settings module does not define IDF_PATH.
idf_path = getattr(settings, "IDF_PATH", None)
class Command(BaseCommand):
    """Write an IDF (inverse document frequency) file for recent articles.

    Articles whose ``published_at`` falls within the last ``--delta`` days
    are tokenised through ``Article.gen_idf()``, and one line per term,
    formatted as ``"<word> <idf>"``, is written to ``idf_path``.
    """

    help = "gen article tfidf file"

    def _is_number(self, word):
        """Return True when ``word`` can be parsed by ``float()``.

        Because ``float()`` is used, strings such as ``"1e3"``, ``"nan"``
        and ``"inf"`` are treated as numbers too.
        """
        try:
            float(word)
        except ValueError:
            return False
        return True

    def add_arguments(self, parser):
        """Register ``-d/--delta``: the look-back window in days (default 30)."""
        parser.add_argument("-d", "--delta", type=int, help="cal", default=30)

    def handle(self, *args, **options):
        """Compute per-term IDF over recent articles and write it to disk.

        Raises:
            CommandError: if ``IDF_PATH`` is not configured in settings.
        """
        # Imported locally so this block does not depend on the file's
        # top-level import list being changed.
        from django.core.management.base import CommandError

        # Fail before doing any work instead of crashing inside open(None).
        if not idf_path:
            raise CommandError("IDF_PATH is not configured in settings")

        days = options["delta"]
        # NOTE(review): datetime.now() is naive local time; confirm this
        # matches how published_at is stored.
        since = datetime.now() - timedelta(days=days)

        doc_count = 0
        word2count = {}
        # Count documents while iterating so the total and the per-term
        # frequencies come from the same query result.
        for row in Article.objects(published_at__gte=since):
            doc_count += 1

            # Collect each term at most once per article: IDF needs the
            # number of documents containing the term, not the number of
            # occurrences. Presumably gen_idf() yields the article's
            # segmented tokens -- verify against the model.
            terms = set()
            for word in row.gen_idf():
                word = word.strip()
                if word == "" or self._is_number(word):
                    continue
                terms.add(word)

            for word in terms:
                word2count[word] = word2count.get(word, 0) + 1

        # The context manager closes the file even if a write fails.
        with open(idf_path, mode="w", encoding="utf-8") as idf_file:
            for word, df in word2count.items():
                line = "%s %.9f\n" % (word, math.log(doc_count / df))
                idf_file.write(line)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment