Created
April 13, 2023 17:16
-
-
Save HQuser/96296df651ad0e81b0ce3b946b7638ee to your computer and use it in GitHub Desktop.
BM25F Custom Implementation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def preprocess_tokens(text):
    """Normalise *text* and return it as a list of processed tokens.

    Runs the full cleaning pipeline (special chars, unicode, casing,
    contractions, punctuation, stopwords, lemmatisation, stemming) from
    the third-party ``text_preprocessing`` package, then whitespace-splits.
    """
    from text_preprocessing import preprocess_text
    from text_preprocessing import (
        to_lower,
        remove_email,
        remove_url,
        remove_punctuation,
        lemmatize_word,
        remove_special_character,
        normalize_unicode,
        remove_stopword,
        stem_word,
        expand_contraction,
        tokenize_word,
    )

    pipeline = [
        remove_special_character,
        normalize_unicode,
        to_lower,
        expand_contraction,
        remove_punctuation,
        remove_stopword,
        lemmatize_word,
        stem_word,
    ]
    cleaned = preprocess_text(text, pipeline)
    return cleaned.split()
def extract_text_snippet(snippet_meta):
    """Build the searchable text for a document: its title, plus the
    optional 'snippet' field when present, space-separated."""
    parts = [snippet_meta['title']]
    if 'snippet' in snippet_meta:
        parts.append(snippet_meta['snippet'])
    return ' '.join(parts)
def get_sl(stream, corpus):
    """Return the token count of every document in *stream*.

    *stream* is an iterable of document ids; *corpus* maps id -> metadata.
    """
    return [
        len(preprocess_tokens(extract_text_snippet(corpus[doc_id])))
        for doc_id in stream
    ]
def get_avg_sl(streams, corpus):
    """Average token length of the documents in *streams*.

    Despite the plural name, *streams* is a single iterable of document
    ids (see the caller in ``get_df_td``). Raises StatisticsError when
    the iterable is empty, as ``mean`` of no data is undefined.
    """
    from statistics import mean

    lengths = [
        len(preprocess_tokens(extract_text_snippet(corpus[doc_id])))
        for doc_id in streams
    ]
    return mean(lengths)
def get_tf_ts(stream, corpus, term):
    """Total number of occurrences of *term* across every document in *stream*."""
    total = 0
    for doc_id in stream:
        tokens = preprocess_tokens(extract_text_snippet(corpus[doc_id]))
        total += tokens.count(term)
    return total
def get_total_documents(streams):
    """Return the total number of documents across all streams.

    *streams* is the mapping {stream_name: [doc_id, ...]} used throughout
    this file (e.g. {'SCHOOL': [5, 6, 7], 'NEWS': [3, 1, 2]}).

    Bug fixed: the original iterated the dict directly, which yields the
    *key strings*, so ``len(stream)`` counted the characters of each
    stream name ('SCHOOL' -> 6) instead of the documents it holds. We now
    sum over the document lists themselves. A plain iterable of document
    lists is still accepted for backward compatibility.
    """
    doc_lists = streams.values() if isinstance(streams, dict) else streams
    return sum(len(docs) for docs in doc_lists)
def get_dtf(term, stream, corpus):
    """Document frequency: how many documents in *stream* contain *term*
    at least once (after preprocessing)."""
    return sum(
        1
        for doc_id in stream
        if term in preprocess_tokens(extract_text_snippet(corpus[doc_id]))
    )
def get_df_td(q, w_s, streams, corpus):
    """Weighted, length-normalised term frequency of *q* over all streams.

    For every document (in any stream) containing *q*, accumulates
    ``w_s * tf(q, stream) / ((1 - b) + b * doc_len / avg_len)`` with the
    usual BM25 length-normalisation constant b = 0.75, and returns the sum.
    """
    b_s = 0.75  # length-normalisation strength
    score = 0
    for stream in streams.values():
        for document in stream:
            tokens = preprocess_tokens(extract_text_snippet(corpus[document]))
            if q not in tokens:
                continue
            sl_s = len(tokens)
            avg_sl_s = get_avg_sl(stream, corpus)
            denominator = (1 - b_s) + (b_s * sl_s / avg_sl_s)
            tf_t_S = get_tf_ts(stream, corpus, q)
            score += w_s * (tf_t_S / denominator)
    return score
def get_expanded_corpus(query, stream, corpus):
    """Score documents against *query* with a BM25F-style formula.

    Parameters:
        query:  raw query string; tokenised with ``preprocess_tokens``.
        stream: {stream_name: [doc_id, ...]} mapping.
        corpus: {doc_id: {'title': ..., 'snippet': ...}} mapping.

    Returns a nested dict {stream_name: {doc_id: {doc_id: score}}}.

    Bug fixed: ``math`` was never imported anywhere in this file, so the
    ``math.log`` call raised NameError on first use; it is imported
    locally here, matching the file's function-scope import style.
    """
    import math  # was missing entirely -> NameError at runtime

    expanded_stream = dict()
    w_s = 1 / len(stream)  # uniform per-stream weight
    query = preprocess_tokens(query)
    for stream_name, documents in stream.items():
        for doc in documents:
            if stream_name not in expanded_stream:
                expanded_stream[stream_name] = dict()
            text = extract_text_snippet(corpus[doc])
            # NOTE(review): this preprocessing result is never used below —
            # confirm whether it was meant to feed the scoring loop.
            text = preprocess_tokens(text)
            for stream_i, documents_i in stream.items():
                for document in documents_i:
                    bm25f_score_doc = 0
                    for q in query:
                        tf_td = get_df_td(q, w_s, stream, corpus)
                        dft = get_dtf(q, documents_i, corpus)
                        logliklyhood = math.log(
                            (get_total_documents(stream) - dft + 0.5) / (dft + 0.5)
                        )
                        bm25f_score_doc += tf_td * logliklyhood
                    # NOTE(review): keyed by the *outer* (stream_name, doc),
                    # so each inner iteration overwrites the same slot —
                    # verify whether (stream_i, document) was intended.
                    expanded_stream[stream_name][doc] = {doc: bm25f_score_doc}
    return expanded_stream
def BM25F(query, stream, corpus):
    """Public entry point: delegate to ``get_expanded_corpus`` and return
    the per-stream, per-document BM25F scores."""
    return get_expanded_corpus(query, stream, corpus)
if __name__ == '__main__':
    # Demo fixture: two streams over a six-document corpus.
    # Guarded with __main__ so importing this module no longer triggers the
    # (side-effecting) scoring run — the original executed on import.
    query = 'New administration justice'
    stream = {'SCHOOL': [5, 6, 7], 'NEWS': [3, 1, 2]}
    corpus = {
        5: {'title': 'new new new administration test sun'},
        6: {'title': 'justice new test sun'},
        7: {'title': 'new test sun'},
        3: {'title': 'hi'},
        1: {'title': 'new'},
        2: {'title': 'today'},
    }
    BM25F(query, stream, corpus)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment