Skip to content

Instantly share code, notes, and snippets.

@HQuser
Created April 13, 2023 17:16
Show Gist options
  • Save HQuser/96296df651ad0e81b0ce3b946b7638ee to your computer and use it in GitHub Desktop.
BM25F Custom Implementation
def preprocess_tokens(text):
    """Normalize *text* and return it as a list of processed tokens.

    Runs the third-party ``text_preprocessing`` pipeline (lower-casing,
    contraction expansion, punctuation/stopword removal, lemmatization,
    stemming, unicode normalization) and whitespace-splits the result.
    """
    from text_preprocessing import preprocess_text
    from text_preprocessing import (
        to_lower,
        remove_punctuation,
        lemmatize_word,
        remove_special_character,
        normalize_unicode,
        remove_stopword,
        stem_word,
        expand_contraction,
    )

    pipeline = [
        remove_special_character,
        normalize_unicode,
        to_lower,
        expand_contraction,
        remove_punctuation,
        remove_stopword,
        lemmatize_word,
        stem_word,
    ]
    return preprocess_text(text, pipeline).split()
def extract_text_snippet(snippet_meta):
    """Return the document text: title, plus the snippet when present.

    ``snippet_meta`` is a dict with a required ``'title'`` key and an
    optional ``'snippet'`` key; the two are joined with a single space.
    """
    parts = [snippet_meta['title']]
    if 'snippet' in snippet_meta:
        parts.append(snippet_meta['snippet'])
    return ' '.join(parts)
def get_sl(stream, corpus):
    """Return the token length of every document in *stream*.

    Each element of *stream* is a corpus key; the document's text is
    extracted, tokenized, and its token count collected, in order.
    """
    return [
        len(preprocess_tokens(extract_text_snippet(corpus[doc_id])))
        for doc_id in stream
    ]
def get_avg_sl(streams, corpus):
    """Return the mean token length of the documents in *streams*.

    *streams* is an iterable of corpus keys (a single stream, despite the
    plural name — the original commented-out nested loop confirmed the
    intent drifted). Delegates to ``get_sl`` instead of duplicating its
    extract/tokenize/measure loop verbatim.

    Raises ``statistics.StatisticsError`` if *streams* is empty, as before.
    """
    from statistics import mean

    return mean(get_sl(streams, corpus))
def get_tf_ts(stream, corpus, term):
    """Return the total frequency of *term* across all documents in *stream*."""
    total = 0
    for doc_id in stream:
        tokens = preprocess_tokens(extract_text_snippet(corpus[doc_id]))
        total += tokens.count(term)
    return total
def get_total_documents(streams):
    """Return the total number of documents across all streams.

    Bug fix: ``get_expanded_corpus`` passes the *dict* of streams here.
    Iterating a dict yields its keys, so the original summed the lengths
    of the stream *names* (e.g. ``len('SCHOOL') + len('NEWS') == 10``)
    instead of the document counts. Handle the dict case by summing the
    lengths of its values; a plain iterable of streams behaves as before.
    """
    if isinstance(streams, dict):
        return sum(len(docs) for docs in streams.values())
    return sum(len(stream) for stream in streams)
def get_dtf(term, stream, corpus):
    """Return the document frequency of *term*: how many documents in
    *stream* contain it at least once after tokenization."""
    return sum(
        1
        for doc_id in stream
        if term in preprocess_tokens(extract_text_snippet(corpus[doc_id]))
    )
def get_df_td(q, w_s, streams, corpus):
    """Return the BM25F weighted, length-normalized pseudo term frequency
    of query term *q* summed over every document (in every stream) that
    contains it.

    For each matching document:
        w_s * tf(q, stream) / ((1 - b_s) + b_s * |doc| / avg|stream|)
    with field weight *w_s* and length-normalization parameter b_s = 0.75.

    Performance fix: ``get_avg_sl`` and ``get_tf_ts`` depend only on the
    stream and the term, not on the individual document, yet the original
    recomputed them (re-preprocessing the whole stream) inside the
    per-document loop. They are hoisted out; results are unchanged.
    """
    b_s = 0.75  # length-normalization strength, as in standard BM25
    contributions = []
    for _name, stream in streams.items():
        # Invariant per stream: average document length and stream-level tf.
        avg_sl_s = get_avg_sl(stream, corpus)
        tf_t_s = get_tf_ts(stream, corpus, q)
        for document in stream:
            tokens = preprocess_tokens(extract_text_snippet(corpus[document]))
            if q in tokens:
                sl_s = len(tokens)
                denominator = (1 - b_s) + (b_s * sl_s / avg_sl_s)
                contributions.append(w_s * (tf_t_s / denominator))
    return sum(contributions)
def get_expanded_corpus(query, stream, corpus):
    """Score each document in each stream against *query* with BM25F and
    return ``{stream_name: {doc_id: {doc_id: score}}}``.

    Bug fix: the original used ``math.log`` but ``math`` was never
    imported anywhere in the file, so every call raised ``NameError``;
    a function-scope import is added. All other behavior is preserved
    byte-for-byte, including the oddities flagged below.
    """
    import math  # fix: `math` was used but never imported in this file

    expanded_stream = dict()
    w_s = 1 / len(stream)  # uniform field weight per stream
    query = preprocess_tokens(query)
    for stream_name, documents in stream.items():
        for doc in documents:
            if stream_name not in expanded_stream:
                expanded_stream[stream_name] = dict()
            # NOTE(review): `text` is computed but never used — preserved
            # as-is in case the preprocessing call is relied upon.
            text = extract_text_snippet(corpus[doc])
            text = preprocess_tokens(text)
            # NOTE(review): this inner loop rebinds `documents`, shadowing
            # the outer loop variable, so `get_dtf` below sees the *inner*
            # stream's documents — looks unintended; preserved. Confirm.
            for stream_i, documents in stream.items():
                for document in documents:
                    bm25f_score_doc = 0
                    for q in query:
                        tf_td = get_df_td(q, w_s, stream, corpus)
                        dft = get_dtf(q, documents, corpus)
                        logliklyhood = math.log((get_total_documents(stream) - dft + 0.5) / (dft + 0.5))
                        bm25_single_term = tf_td * logliklyhood
                        bm25f_score_doc += bm25_single_term
            # NOTE(review): only the last inner iteration's score survives,
            # and it does not depend on `doc` — likely a bug; preserved.
            expanded_stream[stream_name][doc] = {doc: bm25f_score_doc}
    return expanded_stream
def BM25F(query, stream, corpus):
    """Public entry point: BM25F-score *corpus* documents against *query*.

    Thin wrapper over ``get_expanded_corpus``; returns its nested
    ``{stream_name: {doc_id: {doc_id: score}}}`` mapping unchanged.
    """
    return get_expanded_corpus(query, stream, corpus)
# Demo fixture: two streams of document ids over a toy corpus.
query = 'New administration justice'
stream = {'SCHOOL': [5, 6, 7], 'NEWS': [3, 1, 2]}
corpus = {
    5: {'title': 'new new new administration test sun'},
    6: {'title': 'justice new test sun'},
    7: {'title': 'new test sun'},
    3: {'title': 'hi'},
    1: {'title': 'new'},
    2: {'title': 'today'},
}

# Guard the side-effecting call so importing this module stays pure;
# running it as a script behaves exactly as before.
if __name__ == '__main__':
    BM25F(query, stream, corpus)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment