-
-
Save marcelcaraciolo/1604487 to your computer and use it in GitHub Desktop.
#-*- coding: utf-8 -*- | |
import re | |
import nltk | |
from nltk.tokenize import RegexpTokenizer | |
from nltk import bigrams, trigrams | |
import math | |
# Portuguese stopword list taken from NLTK's stopwords corpus; the script
# below uses it to drop tokens before scoring.
stopwords = nltk.corpus.stopwords.words('portuguese')

# A token is a run of word characters and/or the typographic apostrophe
# (U+2019). The pattern is spelled with explicit escapes so that ``\w``
# reaches the regex engine as written instead of depending on how Python
# treats an unrecognised escape in a plain string literal, and so that the
# apostrophe is one code point in the character class regardless of how the
# source file is encoded.
tokenizer = RegexpTokenizer(u"[\\w\u2019]+", flags=re.UNICODE)
def freq(word, doc):
    """Return how many times ``word`` occurs in ``doc``.

    The counting is delegated to ``doc.count``, so ``doc`` must provide that
    method (the script in this file passes lists of token strings).
    """
    occurrences = doc.count(word)
    return occurrences
def word_count(doc):
    """Return the number of items in ``doc``, i.e. ``len(doc)``."""
    total = len(doc)
    return total
def tf(word, doc):
    """Return the normalised term frequency of ``word`` in ``doc``.

    The value is ``freq(word, doc)`` divided by ``word_count(doc)``, as a
    float.

    An empty ``doc`` yields ``0.0``: no term can occur in it, and dividing by
    its zero length would otherwise raise ``ZeroDivisionError``.
    """
    # Guard first so an empty document never reaches the division below.
    if len(doc) == 0:
        return 0.0
    return freq(word, doc) / float(word_count(doc))
def num_docs_containing(word, list_of_docs):
    """Return one more than the number of documents that contain ``word``.

    A document counts as containing ``word`` when ``freq(word, document)`` is
    greater than zero. The result is deliberately offset by one, so it is
    never zero; ``idf`` uses it as a divisor.
    """
    matching = sum(1 for document in list_of_docs if freq(word, document) > 0)
    return matching + 1
def idf(word, list_of_docs):
    """Return the inverse document frequency of ``word`` in a corpus.

    Computed as ``log(N / d)`` with the natural logarithm, where ``N`` is
    ``len(list_of_docs)`` and ``d`` is
    ``num_docs_containing(word, list_of_docs)``. Because ``d`` already
    includes a +1 offset, the result is negative for a word that occurs in
    every document (``N / (N + 1)`` is below 1).

    An empty corpus yields ``0.0``; without the guard the expression would be
    ``log(0)``, which raises ``ValueError``.
    """
    total_docs = len(list_of_docs)
    if total_docs == 0:
        return 0.0
    return math.log(total_docs /
                    float(num_docs_containing(word, list_of_docs)))
def tf_idf(word, doc, list_of_docs):
    """Return the tf-idf weight of ``word`` for ``doc`` within a corpus.

    The weight is ``tf(word, doc)`` multiplied by
    ``idf(word, list_of_docs)``.
    """
    term_frequency = tf(word, doc)
    inverse_doc_frequency = idf(word, list_of_docs)
    return term_frequency * inverse_doc_frequency
# Compute per-document term statistics (raw frequency, tf, idf, tf-idf) over
# unigrams, bigrams and trigrams, then rank every term by its best tf-idf.
vocabulary = []  # one token list per document: the corpus handed to idf()
docs = {}  # document -> {'freq', 'tf', 'idf', 'tf-idf', 'tokens'}
all_tips = []  # not read anywhere in this listing
stopword_set = set(stopwords)  # set lookup instead of scanning the list per token

# Placeholder corpus -- replace the two strings with the real document texts.
for tip in (['documment 1', 'documment 2']):
    # Accept either a plain string or an object that carries its content in a
    # ``.text`` attribute; plain strings have no such attribute.
    text = getattr(tip, 'text', tip)
    raw_tokens = tokenizer.tokenize(text)

    # n-grams are built from the unfiltered token stream, i.e. before the
    # length and stopword filters are applied to the unigrams.
    bi_tokens = bigrams(raw_tokens)
    tri_tokens = trigrams(raw_tokens)

    # Unigrams: lower-cased, longer than two characters, not a stopword.
    tokens = [token.lower() for token in raw_tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopword_set]

    # n-grams: space-joined and lower-cased. The stopword test compares the
    # whole joined phrase, so it only drops an n-gram that is itself listed.
    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in stopword_set]
    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in stopword_set]

    final_tokens = tokens + bi_tokens + tri_tokens

    entry = {'freq': {}, 'tf': {}, 'idf': {},
             'tf-idf': {}, 'tokens': final_tokens}
    docs[tip] = entry
    for token in final_tokens:
        # A repeated token would yield the same two values again; compute
        # each distinct token once.
        if token in entry['freq']:
            continue
        # The frequency computed for each tip
        entry['freq'][token] = freq(token, final_tokens)
        # The term-frequency (Normalized Frequency)
        entry['tf'][token] = tf(token, final_tokens)
    vocabulary.append(final_tokens)

# idf depends only on the token and the corpus, so compute it once per token
# and reuse it for every document that contains the token.
idf_by_token = {}
for doc in docs:
    stats = docs[doc]
    for token, term_frequency in stats['tf'].items():
        if token not in idf_by_token:
            idf_by_token[token] = idf(token, vocabulary)
        # The Inverse-Document-Frequency
        stats['idf'][token] = idf_by_token[token]
        # The tf-idf: stored tf times idf, the same product that
        # tf_idf(token, stats['tokens'], vocabulary) evaluates.
        stats['tf-idf'][token] = term_frequency * idf_by_token[token]

# Now let's find out the most relevant words by tf-idf.
words = {}  # token -> highest tf-idf it reaches in any document
for doc in docs:
    scores = docs[doc]['tf-idf']
    for token in scores:
        if token not in words or scores[token] > words[token]:
            words[token] = scores[token]

    # Per-document report: the document, then each of its tokens with score.
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print(doc)
    for token in scores:
        print("%s %s" % (token, scores[token]))

# Global ranking, best score first.
for token, score in sorted(words.items(), key=lambda x: x[1], reverse=True):
    print("%f <= %s" % (score, token))
I got an error
tokens = tokenizer.tokenize(tip.text)
AttributeError: 'str' object has no attribute 'text'
How to pass the documents ?
I too got an error
tokens = tokenizer.tokenize(tip.text)
AttributeError: 'str' object has no attribute 'text'
How to pass the documents ?
same error
tokens = tokenizer.tokenize(tip.text)
AttributeError: 'str' object has no attribute 'text'
How to pass the documents ?
Thank you for the code. It was really helpful!
same error
tokens = tokenizer.tokenize(tip.text)
AttributeError: 'str' object has no attribute 'text'
How do I pass the documents?
Hey, were you able to resolve the "AttributeError: 'str' object has no attribute 'text'" error?
Same error
tokens = tokenizer.tokenize(tip.text)
AttributeError: 'str' object has no attribute 'text'
How to pass the documents ?
What are document 1 and document 2?
Can you give an example?