Skip to content

Instantly share code, notes, and snippets.

@HQuser
Created April 13, 2023 17:16
Show Gist options
  • Save HQuser/96296df651ad0e81b0ce3b946b7638ee to your computer and use it in GitHub Desktop.
BM25F Custom Implementation
def preprocess_tokens(text):
    """Normalize *text* and return it as a list of processed tokens.

    Runs the third-party ``text_preprocessing`` pipeline (lower-casing,
    contraction expansion, punctuation/stopword removal, lemmatization,
    stemming, unicode normalization) and whitespace-splits the result.
    """
    from text_preprocessing import preprocess_text
    from text_preprocessing import (
        to_lower,
        remove_punctuation,
        lemmatize_word,
        remove_special_character,
        normalize_unicode,
        remove_stopword,
        stem_word,
        expand_contraction,
    )

    pipeline = [
        remove_special_character,
        normalize_unicode,
        to_lower,
        expand_contraction,
        remove_punctuation,
        remove_stopword,
        lemmatize_word,
        stem_word,
    ]
    return preprocess_text(text, pipeline).split()
def extract_text_snippet(snippet_meta):
    """Return the document text: title, plus the snippet when present.

    ``snippet_meta`` is a dict with a required ``'title'`` key and an
    optional ``'snippet'`` key; the two are joined with a single space.
    """
    parts = [snippet_meta['title']]
    if 'snippet' in snippet_meta:
        parts.append(snippet_meta['snippet'])
    return ' '.join(parts)
def get_sl(stream, corpus):
    """Return the token length of every document in *stream*.

    Each element of *stream* is a corpus key; the document's text is
    extracted, tokenized, and its token count collected, in order.
    """
    return [
        len(preprocess_tokens(extract_text_snippet(corpus[doc_id])))
        for doc_id in stream
    ]
def get_avg_sl(streams, corpus):
    """Return the mean token length of the documents in *streams*.

    *streams* is an iterable of corpus keys (a single stream, despite the
    plural name — the original commented-out nested loop confirmed the
    intent drifted). Delegates to ``get_sl`` instead of duplicating its
    extract/tokenize/measure loop verbatim.

    Raises ``statistics.StatisticsError`` if *streams* is empty, as before.
    """
    from statistics import mean

    return mean(get_sl(streams, corpus))
def get_tf_ts(stream, corpus, term):
    """Return the total frequency of *term* across all documents in *stream*."""
    total = 0
    for doc_id in stream:
        tokens = preprocess_tokens(extract_text_snippet(corpus[doc_id]))
        total += tokens.count(term)
    return total
def get_total_documents(streams):
    """Return the total number of documents across all streams.

    Bug fix: ``get_expanded_corpus`` passes the *dict* of streams here.
    Iterating a dict yields its keys, so the original summed the lengths
    of the stream *names* (e.g. ``len('SCHOOL') + len('NEWS') == 10``)
    instead of the document counts. Handle the dict case by summing the
    lengths of its values; a plain iterable of streams behaves as before.
    """
    if isinstance(streams, dict):
        return sum(len(docs) for docs in streams.values())
    return sum(len(stream) for stream in streams)
def get_dtf(term, stream, corpus):
    """Return the document frequency of *term*: how many documents in
    *stream* contain it at least once after tokenization."""
    return sum(
        1
        for doc_id in stream
        if term in preprocess_tokens(extract_text_snippet(corpus[doc_id]))
    )
def get_df_td(q, w_s, streams, corpus):
    """Return the BM25F weighted, length-normalized pseudo term frequency
    of query term *q* summed over every document (in every stream) that
    contains it.

    For each matching document:
        w_s * tf(q, stream) / ((1 - b_s) + b_s * |doc| / avg|stream|)
    with field weight *w_s* and length-normalization parameter b_s = 0.75.

    Performance fix: ``get_avg_sl`` and ``get_tf_ts`` depend only on the
    stream and the term, not on the individual document, yet the original
    recomputed them (re-preprocessing the whole stream) inside the
    per-document loop. They are hoisted out; results are unchanged.
    """
    b_s = 0.75  # length-normalization strength, as in standard BM25
    contributions = []
    for _name, stream in streams.items():
        # Invariant per stream: average document length and stream-level tf.
        avg_sl_s = get_avg_sl(stream, corpus)
        tf_t_s = get_tf_ts(stream, corpus, q)
        for document in stream:
            tokens = preprocess_tokens(extract_text_snippet(corpus[document]))
            if q in tokens:
                sl_s = len(tokens)
                denominator = (1 - b_s) + (b_s * sl_s / avg_sl_s)
                contributions.append(w_s * (tf_t_s / denominator))
    return sum(contributions)
def get_expanded_corpus(query, stream, corpus):
    """Score each document in each stream against *query* with BM25F and
    return ``{stream_name: {doc_id: {doc_id: score}}}``.

    Bug fix: the original used ``math.log`` but ``math`` was never
    imported anywhere in the file, so every call raised ``NameError``;
    a function-scope import is added. All other behavior is preserved
    byte-for-byte, including the oddities flagged below.
    """
    import math  # fix: `math` was used but never imported in this file

    expanded_stream = dict()
    w_s = 1 / len(stream)  # uniform field weight per stream
    query = preprocess_tokens(query)
    for stream_name, documents in stream.items():
        for doc in documents:
            if stream_name not in expanded_stream:
                expanded_stream[stream_name] = dict()
            # NOTE(review): `text` is computed but never used — preserved
            # as-is in case the preprocessing call is relied upon.
            text = extract_text_snippet(corpus[doc])
            text = preprocess_tokens(text)
            # NOTE(review): this inner loop rebinds `documents`, shadowing
            # the outer loop variable, so `get_dtf` below sees the *inner*
            # stream's documents — looks unintended; preserved. Confirm.
            for stream_i, documents in stream.items():
                for document in documents:
                    bm25f_score_doc = 0
                    for q in query:
                        tf_td = get_df_td(q, w_s, stream, corpus)
                        dft = get_dtf(q, documents, corpus)
                        logliklyhood = math.log((get_total_documents(stream) - dft + 0.5) / (dft + 0.5))
                        bm25_single_term = tf_td * logliklyhood
                        bm25f_score_doc += bm25_single_term
            # NOTE(review): only the last inner iteration's score survives,
            # and it does not depend on `doc` — likely a bug; preserved.
            expanded_stream[stream_name][doc] = {doc: bm25f_score_doc}
    return expanded_stream
def BM25F(query, stream, corpus):
    """Public entry point: BM25F-score *corpus* documents against *query*.

    Thin wrapper over ``get_expanded_corpus``; returns its nested
    ``{stream_name: {doc_id: {doc_id: score}}}`` mapping unchanged.
    """
    return get_expanded_corpus(query, stream, corpus)
# Demo fixture: two streams of document ids over a toy corpus.
query = 'New administration justice'
stream = {'SCHOOL': [5, 6, 7], 'NEWS': [3, 1, 2]}
corpus = {
    5: {'title': 'new new new administration test sun'},
    6: {'title': 'justice new test sun'},
    7: {'title': 'new test sun'},
    3: {'title': 'hi'},
    1: {'title': 'new'},
    2: {'title': 'today'},
}

# Guard the side-effecting call so importing this module stays pure;
# running it as a script behaves exactly as before.
if __name__ == '__main__':
    BM25F(query, stream, corpus)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment