Last active
October 10, 2018 00:07
-
-
Save jwlin/e1be1f402226fbff1599db94fc8cd6a7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _train_topic_model(self):
    """Build the LSI topic model from the labelled corpus on disk.

    Reads ``corpus/label-all-corpus.json`` (feature -> topic label) and every
    document under ``corpus/all-corpus``, then trains a TF-IDF + LSI pipeline
    and a cosine-similarity index over the training documents.

    Returns:
        tuple: ``(dictionary, tfidf, lsi, index, training_topic)`` where
        ``training_topic`` maps the training-document index (as a string) to
        ``{'type': <topic label>, 'feature': <feature string>}``.

    NOTE(review): all five returned objects support gensim's save()/load()
    (see https://radimrehurek.com/gensim/tut2.html and tut3.html) and should
    be persisted so the model is not retrained on every prediction.
    """
    # Fix: original used `_file_`, which is a NameError; `__file__` is correct.
    current_dir = os.path.dirname(__file__)
    corpus_dir = os.path.join(current_dir, 'corpus', 'all-corpus')

    # Map each feature string to its labelled topic type; assert that the
    # labels are consistent when the same feature appears more than once.
    answer = {}
    with open(os.path.join(current_dir, 'corpus', 'label-all-corpus.json'), 'r') as f:
        data = json.load(f)
    for v in data.values():
        if v['feature'] in answer:
            assert answer[v['feature']] == v['type']
        else:
            answer[v['feature']] = v['type']

    # Load every corpus file as a list of lowercased token lists.
    # Fix: original leaked file handles by calling open() without closing.
    all_corpus = {}
    training_ids = []
    for fname in os.listdir(corpus_dir):
        key = fname.split('-')[0]
        training_ids.append(key)
        with open(os.path.join(corpus_dir, fname), 'r') as cf:
            all_corpus[key] = [line.lower().split() for line in cf]

    # Concatenate all training documents in a stable order.
    corpus = []
    for t_id in training_ids:
        corpus += all_corpus[t_id]

    dictionary = corpora.Dictionary(corpus)
    # Remove common stop words from the dictionary (words that appear only
    # once were deliberately kept in the original; preserved here).
    stoplist = set('your a the is and or in be to of for not on with as by'.split())
    stop_ids = [dictionary.token2id[w] for w in stoplist if w in dictionary.token2id]
    dictionary.filter_tokens(stop_ids)
    dictionary.compactify()  # remove gaps in id sequence after removals

    # Bag-of-words -> TF-IDF -> LSI (truncated SVD, 50 latent topics).
    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]
    tfidf = models.TfidfModel(corpus_bow)
    corpus_tfidf = tfidf[corpus_bow]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)
    # Similarity index over the training documents in latent space.
    index = similarities.MatrixSimilarity(lsi[corpus_tfidf])

    # Lookup table: training-document index -> labelled topic.
    training_topic = {}
    for i, doc in enumerate(corpus):
        feature = ' '.join(doc)
        training_topic[str(i)] = {
            'type': answer[feature],
            'feature': feature,
        }
    return dictionary, tfidf, lsi, index, training_topic

def getTopic(self, ai, imgtopic, Dict):
    """Predict the topic label of a UI element.

    Trains (currently on every call — see NOTE in _train_topic_model) an LSI
    topic model from the labelled corpus, then classifies the element's
    extracted feature string by cosine similarity against the training
    documents.

    Args:
        ai: the element to classify (forwarded to self.extract_features).
        imgtopic: image-topic hint (forwarded to self.extract_features).
        Dict: feature dictionary (forwarded to self.extract_features).

    Returns:
        str: the predicted topic if it is in CnUtil.queryVocalbulary(),
        otherwise the literal string '_TOPIC@unknown'.
    """
    dictionary, tfidf, lsi, index, training_topic = self._train_topic_model()

    # Extract the element's features and collapse runs of spaces.
    features = str(re.sub(' +', ' ', ' '.join(self.extract_features(ai, imgtopic, Dict, 1))))
    tokens = features.lower().split()

    # Project the element into latent space and rank training documents by
    # cosine similarity, best match first.
    vec_bow = dictionary.doc2bow(tokens)
    vec_lsi = lsi[tfidf[vec_bow]]
    sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])

    # Default prediction: the topic of the single most similar document.
    vec_type = training_topic[str(sims[0][0])]['type']

    # If the top-5 similarities are nearly tied (spread < 0.1), fall back to
    # a majority vote among the top 5; exact ties are broken at random.
    # Fix: guard len(sims) >= 5 — original indexed sims[4] unconditionally
    # and crashed when the corpus held fewer than 5 documents.
    if len(sims) >= 5 and (sims[0][1] - sims[4][1]) < 0.1:
        topic_count = {}
        for doc_id, _score in sims[:5]:
            doc_type = training_topic[str(doc_id)]['type']
            topic_count[doc_type] = topic_count.get(doc_type, 0) + 1
        max_times = max(topic_count.values())
        max_types = {t for t, c in topic_count.items() if c == max_times}
        # NOTE(review): when exactly one type has the majority, the original
        # kept the top-1 prediction rather than the majority type; that
        # behavior is preserved here — confirm whether it is intended.
        if len(max_types) > 1:
            vec_type = random.choice(list(max_types))

    # Only return topics that exist in the known vocabulary.
    if vec_type in CnUtil.queryVocalbulary():
        return vec_type
    return '_TOPIC@unknown'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment