import pandas as pd
import numpy as np
import json
import nltk
import string
import re
import tensorflow as tf
import tensorflow_hub as hub
W0520 15:39:11.145781 4611618240 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14
# Prepare actual similarity data
data_xlsx = pd.read_excel('./similarity_manually_label.xlsx', 'Sheet1', index_col=0)
actual_matrix = np.array(data_xlsx.values)
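The spreadsheet itself isn't shown here; judging by the evaluation below, it is assumed to hold a 94 x 20 grid of 0/1 labels, with base reports as rows and test reports as columns. A cheap guard against a transposed or mislabeled sheet (a sketch; the shape literals come from the counts printed later):
# Sanity check (assumption: rows = base reports, columns = test reports,
# cells are 0/1 duplicate labels)
assert actual_matrix.shape == (94, 20)
assert set(np.unique(actual_matrix)) <= {0, 1}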
# Import base and test data
with open('./text_similarity_base.json') as data_file:
    text_similarity_base = json.load(data_file)
with open('./text_similarity_test.json') as data_file:
    text_similarity_test = json.load(data_file)
# Create base and test data frames
base_df = pd.DataFrame.from_dict(text_similarity_base, orient='columns')
test_df = pd.DataFrame.from_dict(text_similarity_test, orient='columns')
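The JSON files aren't shown in this post; from the 'text' column used during cleansing below, each one is assumed to be a list of records carrying at least a text field. A hypothetical example of the expected shape (not the real data):
# text_similarity_base.json (assumed layout)
# [
#   {"text": "App crashes when opening the settings page"},
#   {"text": "Unable to log in after the latest update"}
# ]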
# Download NLTK tokenizer data (the stopwords corpus used below is assumed to be installed already)
nltk.download('punkt')
[nltk_data] Downloading package punkt to /Users/kittisakp/nltk_data...
[nltk_data] Package punkt is already up-to-date!
True
# Text pre-processing functions
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
stopwords = nltk.corpus.stopwords.words('english')
def tokenize(text):
    return nltk.word_tokenize(text)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

def remove_stopwords(tokens):
    return [item for item in tokens if item not in stopwords]

def keep_alphabetic(tokens):
    return [item for item in tokens if item.isalpha()]

def reduce_lengthening(tokens):
    # Collapse runs of 3+ repeated characters down to 2 (e.g. "coooool" -> "cool")
    pattern = re.compile(r"(.)\1{2,}")
    return [pattern.sub(r"\1\1", item) for item in tokens]
def normalize(text):
    '''lowercase, strip punctuation, remove stopwords, keep alphabetic tokens, reduce lengthening, stem'''
    lower_text_without_punctuation = text.lower().translate(remove_punctuation_map)
    return ' '.join(
        stem_tokens(
            reduce_lengthening(
                keep_alphabetic(
                    remove_stopwords(
                        tokenize(lower_text_without_punctuation))))))
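A quick sanity check of the whole pipeline on a made-up sentence (the expected output is an assumption based on how the Porter stemmer usually behaves):
print(normalize("The ships' crews are coooool!!!"))
# expected: 'ship crew cool'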
# Text cleansing
base_df['normalized_text'] = base_df['text'].apply(normalize)
test_df['normalized_text'] = test_df['text'].apply(normalize)
# Define constants
thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.9, 1]
base_sentences = base_df['normalized_text'].values
test_sentences = test_df['normalized_text'].values
base_count = len(base_sentences)
test_count = len(test_sentences)
# TensorFlow Hub module for the Universal Sentence Encoder
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
embed = hub.Module(module_url)
WARNING:tensorflow:From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
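These warnings come from the TF 1.x graph-mode API (the hub warning above confirms TensorFlow < 1.14 here). On TensorFlow 2.x, hub.Module no longer works; the rough TF2 equivalent (an assumption, using v5 of the module, which this notebook does not use) would be:
# TF 2.x sketch, not executed here:
# embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
# vectors = embed(["some text", "another text"]).numpy()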
def get_features(texts):
    # Accept either a single string or a list of strings
    if isinstance(texts, str):
        texts = [texts]
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embed(texts))
def cosine_similarity(v1, v2):
    # Return 0 for zero-magnitude vectors instead of dividing by zero
    mag1 = np.linalg.norm(v1)
    mag2 = np.linalg.norm(v2)
    if (not mag1) or (not mag2):
        return 0
    return np.dot(v1, v2) / (mag1 * mag2)
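Two toy checks on cosine_similarity before applying it to the 512-dimensional USE embeddings: parallel vectors score 1.0, orthogonal vectors score 0.
print(cosine_similarity(np.array([1.0, 0.0]), np.array([2.0, 0.0])))  # 1.0
print(cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 1.0])))  # 0.0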
base_vector = get_features(base_sentences)
test_vector = get_features(test_sentences)
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
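Note that get_features builds and initializes a fresh tf.Session on every call, so the two calls above paid the graph setup cost twice. A single-session variant would produce the same vectors (a sketch; get_features_batch is a hypothetical helper, not part of the original code):
def get_features_batch(lists_of_texts):
    # Embed several lists of sentences with one Session / one initialization
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return [sess.run(embed(texts)) for texts in lists_of_texts]

# base_vector, test_vector = get_features_batch(
#     [list(base_sentences), list(test_sentences)])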
def calculate_similarity(threshold):
    predict_matrix = np.zeros((base_count, test_count), dtype=int)
    tp_count = 0
    tn_count = 0
    fp_count = 0
    fn_count = 0
    # Prepare predictions: 1 means duplicate, 0 means non-duplicate
    for base_index, base_value in enumerate(base_vector):
        for test_index, test_value in enumerate(test_vector):
            similarity = cosine_similarity(base_value, test_value)
            if similarity >= threshold:
                predict_matrix[base_index][test_index] = 1  # duplicate
            else:
                predict_matrix[base_index][test_index] = 0  # non-duplicate
    # Compare predictions against the manual labels
    for i in range(base_count):
        for j in range(test_count):
            actual = actual_matrix[i][j]
            predict = predict_matrix[i][j]
            if actual == 0 and predict == 0:    # true negative
                tn_count += 1
            elif actual == 1 and predict == 1:  # true positive
                tp_count += 1
            elif actual == 1 and predict == 0:  # false negative
                fn_count += 1
            elif actual == 0 and predict == 1:  # false positive
                fp_count += 1
    accuracy = (tn_count + tp_count) / (tn_count + tp_count + fn_count + fp_count)
    print("threshold:", threshold)
    print("true negative:", tn_count)
    print("true positive:", tp_count)
    print("false negative:", fn_count)
    print("false positive:", fp_count)
    print("accuracy:", accuracy)
    print("\n======================================\n")
print("Base count: %d, Test count: %d, Total = %d\n" % (base_count, test_count, base_count * test_count))
for threshold in thresholds:
    calculate_similarity(threshold)
Base count: 94, Test count: 20, Total = 1880
threshold: 0
true negative: 1
true positive: 25
false negative: 0
false positive: 1854
accuracy: 0.013829787234042552
======================================
threshold: 0.1
true negative: 32
true positive: 25
false negative: 0
false positive: 1823
accuracy: 0.03031914893617021
======================================
threshold: 0.2
true negative: 172
true positive: 25
false negative: 0
false positive: 1683
accuracy: 0.10478723404255319
======================================
threshold: 0.3
true negative: 543
true positive: 23
false negative: 2
false positive: 1312
accuracy: 0.30106382978723406
======================================
threshold: 0.4
true negative: 1028
true positive: 18
false negative: 7
false positive: 827
accuracy: 0.5563829787234043
======================================
threshold: 0.5
true negative: 1448
true positive: 14
false negative: 11
false positive: 407
accuracy: 0.7776595744680851
======================================
threshold: 0.6
true negative: 1734
true positive: 7
false negative: 18
false positive: 121
accuracy: 0.926063829787234
======================================
threshold: 0.7
true negative: 1838
true positive: 2
false negative: 23
false positive: 17
accuracy: 0.9787234042553191
======================================
threshold: 0.75
true negative: 1852
true positive: 0
false negative: 25
false positive: 3
accuracy: 0.9851063829787234
======================================
threshold: 0.76
true negative: 1854
true positive: 0
false negative: 25
false positive: 1
accuracy: 0.9861702127659574
======================================
threshold: 0.77
true negative: 1854
true positive: 0
false negative: 25
false positive: 1
accuracy: 0.9861702127659574
======================================
threshold: 0.78
true negative: 1855
true positive: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744
======================================
threshold: 0.79
true negative: 1855
true positive: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744
======================================
threshold: 0.8
true negative: 1855
true positive: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744
======================================
threshold: 0.9
true negative: 1855
true positive: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744
======================================
threshold: 1
true negative: 1855
true positive: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744
======================================
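One caveat when reading these numbers: the labels are heavily imbalanced (25 duplicate pairs out of 1880), so from threshold 0.78 upward the model predicts no duplicates at all and still posts the best accuracy of the run. Precision, recall, and F1 expose that trade-off; a sketch computed from the confusion counts printed above:
def precision_recall_f1(tp, fp, fn):
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1

print(precision_recall_f1(14, 407, 11))  # threshold 0.5 -> (~0.033, 0.56, ~0.063)
print(precision_recall_f1(2, 17, 23))    # threshold 0.7 -> (~0.105, 0.08, ~0.091)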