Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save kittisak-phetrungnapha/b3f539188a3cbfd355a43a70d83243e7 to your computer and use it in GitHub Desktop.
Save kittisak-phetrungnapha/b3f539188a3cbfd355a43a70d83243e7 to your computer and use it in GitHub Desktop.
import pandas as pd
import numpy as np
import json
import nltk
import string
import re
import tensorflow as tf
import tensorflow_hub as hub
WARNING: Logging before flag parsing goes to stderr.
W0520 15:39:11.145781 4611618240 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14
# Prepare actual similarity data
# The first spreadsheet column is used as the row index; the remaining cells
# form the label matrix that calculate_similarity() indexes as
# actual_matrix[base_index][test_index] (1 = duplicate, 0 = non-duplicate).
data_xlsx = pd.read_excel('./similarity_manually_label.xlsx', 'Sheet1', index_col=0)
actual_matrix = np.array(data_xlsx.values)

# Import base and test data
# The encoding is passed explicitly so the JSON files are decoded as UTF-8
# regardless of the platform's default locale encoding.
with open('./text_similarity_base.json', encoding='utf-8') as data_file:
    text_similarity_base = json.load(data_file)

with open('./text_similarity_test.json', encoding='utf-8') as data_file:
    text_similarity_test = json.load(data_file)

# Create base and test data frames
base_df = pd.DataFrame.from_dict(text_similarity_base, orient='columns')
test_df = pd.DataFrame.from_dict(text_similarity_test, orient='columns')
# 'punkt' backs nltk.word_tokenize; 'stopwords' backs nltk.corpus.stopwords,
# which this file reads when building its stopword list. Downloads are skipped
# by NLTK when the package is already up to date.
nltk.download('punkt')
nltk.download('stopwords')
[nltk_data] Downloading package punkt to /Users/kittisakp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!





True
# Text pre-processing functions
# Module-level resources shared by the helper functions that follow.
stemmer = nltk.stem.porter.PorterStemmer()
# Translation table for str.translate: every punctuation code point maps to
# None, i.e. the character is deleted.
remove_punctuation_map = str.maketrans('', '', string.punctuation)
stopwords = nltk.corpus.stopwords.words('english')

def tokenize(text):
    """Split ``text`` into tokens using ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens

def stem_tokens(tokens):
    """Return a list with the stem of every token, in input order.

    Stemming is delegated to the module-level ``stemmer``.
    """
    return list(map(stemmer.stem, tokens))

def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stopwords``.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with the surviving tokens, in their original order.
    """
    # Build the lookup set once per call so each membership test is O(1)
    # instead of a linear scan over the stopword collection.
    stopword_set = set(stopwords)
    return [item for item in tokens if item not in stopword_set]

def keep_alphabetic(tokens):
    """Return only the tokens made up entirely of alphabetic characters.

    Tokens for which ``str.isalpha()`` is false (digits, mixed tokens, empty
    strings) are dropped; order is preserved.
    """
    alphabetic = []
    for token in tokens:
        if token.isalpha():
            alphabetic.append(token)
    return alphabetic

# Matches any character repeated three or more times in a row; compiled once
# at import time rather than on every call.
_LENGTHENING_PATTERN = re.compile(r"(.)\1{2,}")


def reduce_lengthening(tokens):
    """Collapse runs of three or more identical characters down to two.

    Example: ``"sooooo"`` becomes ``"soo"``; ``"good"`` is left untouched.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list with one (possibly shortened) token per input token.
    """
    return [_LENGTHENING_PATTERN.sub(r"\1\1", item) for item in tokens]

def normalize(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps, applied in this order: lowercase, strip punctuation, tokenize,
    remove stopwords, keep alphabetic tokens only, reduce character
    lengthening, stem.

    Args:
        text: the string to normalize.

    Returns:
        The processed tokens joined with single spaces.
    """
    # NOTE(review): punctuation (including apostrophes) is stripped before
    # stopword removal, so contracted stopwords may no longer match the
    # stopword list - confirm whether that is intended.
    lower_text_without_punctuation = text.lower().translate(remove_punctuation_map)
    tokens = tokenize(lower_text_without_punctuation)
    tokens = remove_stopwords(tokens)
    tokens = keep_alphabetic(tokens)
    tokens = reduce_lengthening(tokens)
    tokens = stem_tokens(tokens)
    return ' '.join(tokens)
# Text cleansing
# Store normalize() of every 'text' entry in a new 'normalized_text' column.
base_df['normalized_text'] = base_df['text'].apply(normalize)
test_df['normalized_text'] = test_df['text'].apply(normalize)
# Define constants
# Similarity cut-offs to evaluate: a coarse sweep in steps of 0.1, with finer
# steps of 0.01 between 0.75 and 0.8.
thresholds = [
    0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
    0.75, 0.76, 0.77, 0.78, 0.79,
    0.8, 0.9, 1,
]

# Normalized sentences pulled out of the data frames, plus how many there are
# on each side.
base_sentences = base_df['normalized_text'].values
test_sentences = test_df['normalized_text'].values
base_count, test_count = len(base_sentences), len(test_sentences)
# TensorFlow Hub module for the Universal Sentence Encoder
# (the URL below points at the "large" variant, version 3).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
# Instantiated once at module level; get_features() calls this object on the
# texts it is given.
embed = hub.Module(module_url)
WARNING:tensorflow:From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


W0520 15:40:21.697924 4611618240 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
def get_features(texts):
    """Embed one or more texts with the module-level ``embed`` hub module.

    Args:
        texts: a single string, or a sequence of strings. A single string
            (including ``str`` subclasses) is wrapped in a one-element list.

    Returns:
        The result of evaluating ``embed(texts)`` in a TensorFlow session -
        presumably one embedding row per input text; TODO confirm shape.
    """
    if isinstance(texts, str):
        texts = [texts]
    # Each call opens its own session and re-runs the variable and table
    # initializers before evaluating the embedding op.
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embed(texts))
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    Args:
        v1: first vector (anything ``np.linalg.norm`` / ``np.dot`` accept).
        v2: second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)`` as a float, or ``0.0`` when either
        vector has zero magnitude (the cosine is undefined in that case).
    """
    mag1 = np.linalg.norm(v1)
    mag2 = np.linalg.norm(v2)
    if (not mag1) or (not mag2):
        # Return a float here too, so the result type is the same on every path.
        return 0.0
    return np.dot(v1, v2) / (mag1 * mag2)
# Embed the normalized base and test sentences.
# Each get_features() call opens its own tf.Session and re-runs the
# initializers, so the two calls are independent of each other.
base_vector = get_features(base_sentences)
test_vector = get_features(test_sentences)
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0520 15:41:12.798565 4611618240 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0520 15:41:22.793359 4611618240 saver.py:1483] Saver not created because there are no variables in the graph to restore
def calculate_similarity(threshold):
    """Score duplicate detection at one similarity threshold and print a report.

    Every (base, test) pair from the module-level ``base_vector`` and
    ``test_vector`` is predicted as a duplicate (1) when its cosine similarity
    is greater than or equal to ``threshold``, otherwise as a non-duplicate
    (0). The prediction is compared with
    ``actual_matrix[base_index][test_index]``.

    Args:
        threshold: minimum cosine similarity for a pair to count as duplicate.

    Returns:
        A dict with the keys ``threshold``, ``tn``, ``tp``, ``fn``, ``fp`` and
        ``accuracy``. The same figures are printed to stdout.
    """
    tp_count = 0
    tn_count = 0
    fp_count = 0
    fn_count = 0

    # Predict and score each pair in a single pass; no intermediate
    # prediction matrix is needed.
    for base_index, base_value in enumerate(base_vector):
        for test_index, test_value in enumerate(test_vector):
            similarity = cosine_similarity(base_value, test_value)
            predict = 1 if similarity >= threshold else 0  # 1 means duplicate
            actual = actual_matrix[base_index][test_index]

            if actual == 0 and predict == 0:  # true negative
                tn_count += 1
            elif actual == 1 and predict == 1:  # true positive
                tp_count += 1
            elif actual == 1 and predict == 0:  # false negative
                fn_count += 1
            elif actual == 0 and predict == 1:  # false positive
                fp_count += 1
            # Pairs whose label is neither 0 nor 1 are left uncounted.

    total = tn_count + tp_count + fn_count + fp_count
    # Guard against empty inputs (or no valid labels) instead of raising
    # ZeroDivisionError.
    accuracy = (tn_count + tp_count) / total if total else 0.0

    print("threshold:", threshold)
    print("true negative:", tn_count)
    print("true positive:", tp_count)
    print("false negative:", fn_count)
    print("false positive:", fp_count)
    print("accuracy:", accuracy)
    print("\n======================================\n")

    return {
        "threshold": threshold,
        "tn": tn_count,
        "tp": tp_count,
        "fn": fn_count,
        "fp": fp_count,
        "accuracy": accuracy,
    }
# Report the size of the evaluation grid, then score every threshold in turn.
print(
    "Base count: %d, Test count: %d, Total = %d\n"
    % (base_count, test_count, base_count * test_count)
)

for threshold in thresholds:
    calculate_similarity(threshold)
Base count: 94, Test count: 20, Total = 1880

threshold: 0
true negative: 1
true position: 25
false negative: 0
false positive: 1854
accuracy: 0.013829787234042552

======================================

threshold: 0.1
true negative: 32
true position: 25
false negative: 0
false positive: 1823
accuracy: 0.03031914893617021

======================================

threshold: 0.2
true negative: 172
true position: 25
false negative: 0
false positive: 1683
accuracy: 0.10478723404255319

======================================

threshold: 0.3
true negative: 543
true position: 23
false negative: 2
false positive: 1312
accuracy: 0.30106382978723406

======================================

threshold: 0.4
true negative: 1028
true position: 18
false negative: 7
false positive: 827
accuracy: 0.5563829787234043

======================================

threshold: 0.5
true negative: 1448
true position: 14
false negative: 11
false positive: 407
accuracy: 0.7776595744680851

======================================

threshold: 0.6
true negative: 1734
true position: 7
false negative: 18
false positive: 121
accuracy: 0.926063829787234

======================================

threshold: 0.7
true negative: 1838
true position: 2
false negative: 23
false positive: 17
accuracy: 0.9787234042553191

======================================

threshold: 0.75
true negative: 1852
true position: 0
false negative: 25
false positive: 3
accuracy: 0.9851063829787234

======================================

threshold: 0.76
true negative: 1854
true position: 0
false negative: 25
false positive: 1
accuracy: 0.9861702127659574

======================================

threshold: 0.77
true negative: 1854
true position: 0
false negative: 25
false positive: 1
accuracy: 0.9861702127659574

======================================

threshold: 0.78
true negative: 1855
true position: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744

======================================

threshold: 0.79
true negative: 1855
true position: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744

======================================

threshold: 0.8
true negative: 1855
true position: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744

======================================

threshold: 0.9
true negative: 1855
true position: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744

======================================

threshold: 1
true negative: 1855
true position: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744

======================================
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment