import pandas as pd
import numpy as np
import json
import nltk
import string
import re
import tensorflow as tf
import tensorflow_hub as hub
W0520 15:39:11.145781 4611618240 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14
# Prepare actual similarity data
data_xlsx = pd.read_excel('./similarity_manually_label.xlsx', 'Sheet1', index_col=0)
actual_matrix = np.array(data_xlsx.values)
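The spreadsheet itself isn't shown here; judging by the evaluation below, it is assumed to hold a 94 x 20 grid of 0/1 labels, with base reports as rows and test reports as columns. A cheap guard against a transposed or mislabeled sheet (a sketch; the shape literals come from the counts printed later):
# Sanity check (assumption: rows = base reports, columns = test reports,
# cells are 0/1 duplicate labels)
assert actual_matrix.shape == (94, 20)
assert set(np.unique(actual_matrix)) <= {0, 1}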
# Import base and test data
with open('./text_similarity_base.json') as data_file:
    text_similarity_base = json.load(data_file)
with open('./text_similarity_test.json') as data_file:
    text_similarity_test = json.load(data_file)
# Create base and test data frames
base_df = pd.DataFrame.from_dict(text_similarity_base, orient='columns')
test_df = pd.DataFrame.from_dict(text_similarity_test, orient='columns')
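The JSON files aren't shown in this post; from the 'text' column used during cleansing below, each one is assumed to be a list of records carrying at least a text field. A hypothetical example of the expected shape (not the real data):
# text_similarity_base.json (assumed layout)
# [
#   {"text": "App crashes when opening the settings page"},
#   {"text": "Unable to log in after the latest update"}
# ]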
# Download NLTK tokenizer data (the stopwords corpus used below is assumed to be installed already)
nltk.download('punkt')
[nltk_data] Downloading package punkt to /Users/kittisakp/nltk_data...
[nltk_data] Package punkt is already up-to-date!
True
# Text pre-processing functions
stemmer = nltk.stem.porter.PorterStemmer()
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
stopwords = nltk.corpus.stopwords.words('english')
def tokenize(text):
    return nltk.word_tokenize(text)

def stem_tokens(tokens):
    return [stemmer.stem(item) for item in tokens]

def remove_stopwords(tokens):
    return [item for item in tokens if item not in stopwords]

def keep_alphabetic(tokens):
    return [item for item in tokens if item.isalpha()]

def reduce_lengthening(tokens):
    # Collapse runs of 3+ repeated characters down to 2 (e.g. "coooool" -> "cool")
    pattern = re.compile(r"(.)\1{2,}")
    return [pattern.sub(r"\1\1", item) for item in tokens]
def normalize(text):
    '''lowercase, strip punctuation, remove stopwords, keep alphabetic tokens, reduce lengthening, stem'''
    lower_text_without_punctuation = text.lower().translate(remove_punctuation_map)
    return ' '.join(
        stem_tokens(
            reduce_lengthening(
                keep_alphabetic(
                    remove_stopwords(
                        tokenize(lower_text_without_punctuation))))))
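A quick sanity check of the whole pipeline on a made-up sentence (the expected output is an assumption based on how the Porter stemmer usually behaves):
print(normalize("The ships' crews are coooool!!!"))
# expected: 'ship crew cool'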
# Text cleansing
base_df['normalized_text'] = base_df['text'].apply(normalize)
test_df['normalized_text'] = test_df['text'].apply(normalize)
# Define constants
thresholds = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.76, 0.77, 0.78, 0.79, 0.8, 0.9, 1]
base_sentences = base_df['normalized_text'].values
test_sentences = test_df['normalized_text'].values
base_count = len(base_sentences)
test_count = len(test_sentences)
# TensorFlow Hub module for the Universal Sentence Encoder
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
embed = hub.Module(module_url)
WARNING:tensorflow:From /usr/local/lib/python3.7/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.
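These warnings come from the TF 1.x graph-mode API (the hub warning above confirms TensorFlow < 1.14 here). On TensorFlow 2.x, hub.Module no longer works; the rough TF2 equivalent (an assumption, using v5 of the module, which this notebook does not use) would be:
# TF 2.x sketch, not executed here:
# embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
# vectors = embed(["some text", "another text"]).numpy()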
def get_features(texts):
    # Accept either a single string or a list of strings
    if isinstance(texts, str):
        texts = [texts]
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embed(texts))
def cosine_similarity(v1, v2):
    # Return 0 for zero-magnitude vectors instead of dividing by zero
    mag1 = np.linalg.norm(v1)
    mag2 = np.linalg.norm(v2)
    if (not mag1) or (not mag2):
        return 0
    return np.dot(v1, v2) / (mag1 * mag2)
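Two toy checks on cosine_similarity before applying it to the 512-dimensional USE embeddings: parallel vectors score 1.0, orthogonal vectors score 0.
print(cosine_similarity(np.array([1.0, 0.0]), np.array([2.0, 0.0])))  # 1.0
print(cosine_similarity(np.array([1.0, 0.0]), np.array([0.0, 1.0])))  # 0.0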
base_vector = get_features(base_sentences)
test_vector = get_features(test_sentences)
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
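Note that get_features builds and initializes a fresh tf.Session on every call, so the two calls above paid the graph setup cost twice. A single-session variant would produce the same vectors (a sketch; get_features_batch is a hypothetical helper, not part of the original code):
def get_features_batch(lists_of_texts):
    # Embed several lists of sentences with one Session / one initialization
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return [sess.run(embed(texts)) for texts in lists_of_texts]

# base_vector, test_vector = get_features_batch(
#     [list(base_sentences), list(test_sentences)])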
def calculate_similarity(threshold):
    predict_matrix = np.zeros((base_count, test_count), dtype=int)
    tp_count = 0
    tn_count = 0
    fp_count = 0
    fn_count = 0
    # Prepare predictions: 1 means duplicate, 0 means non-duplicate
    for base_index, base_value in enumerate(base_vector):
        for test_index, test_value in enumerate(test_vector):
            similarity = cosine_similarity(base_value, test_value)
            if similarity >= threshold:
                predict_matrix[base_index][test_index] = 1  # duplicate
            else:
                predict_matrix[base_index][test_index] = 0  # non-duplicate
    # Compare predictions against the manual labels
    for i in range(base_count):
        for j in range(test_count):
            actual = actual_matrix[i][j]
            predict = predict_matrix[i][j]
            if actual == 0 and predict == 0:    # true negative
                tn_count += 1
            elif actual == 1 and predict == 1:  # true positive
                tp_count += 1
            elif actual == 1 and predict == 0:  # false negative
                fn_count += 1
            elif actual == 0 and predict == 1:  # false positive
                fp_count += 1
    accuracy = (tn_count + tp_count) / (tn_count + tp_count + fn_count + fp_count)
    print("threshold:", threshold)
    print("true negative:", tn_count)
    print("true positive:", tp_count)
    print("false negative:", fn_count)
    print("false positive:", fp_count)
    print("accuracy:", accuracy)
    print("\n======================================\n")
print("Base count: %d, Test count: %d, Total = %d\n" % (base_count, test_count, base_count * test_count))
for threshold in thresholds:
    calculate_similarity(threshold)
Base count: 94, Test count: 20, Total = 1880
threshold: 0
true negative: 1
true positive: 25
false negative: 0
false positive: 1854
accuracy: 0.013829787234042552
======================================
threshold: 0.1
true negative: 32
true positive: 25
false negative: 0
false positive: 1823
accuracy: 0.03031914893617021
======================================
threshold: 0.2
true negative: 172
true positive: 25
false negative: 0
false positive: 1683
accuracy: 0.10478723404255319
======================================
threshold: 0.3
true negative: 543
true positive: 23
false negative: 2
false positive: 1312
accuracy: 0.30106382978723406
======================================
threshold: 0.4
true negative: 1028
true positive: 18
false negative: 7
false positive: 827
accuracy: 0.5563829787234043
======================================
threshold: 0.5
true negative: 1448
true positive: 14
false negative: 11
false positive: 407
accuracy: 0.7776595744680851
======================================
threshold: 0.6
true negative: 1734
true positive: 7
false negative: 18
false positive: 121
accuracy: 0.926063829787234
======================================
threshold: 0.7
true negative: 1838
true positive: 2
false negative: 23
false positive: 17
accuracy: 0.9787234042553191
======================================
threshold: 0.75
true negative: 1852
true positive: 0
false negative: 25
false positive: 3
accuracy: 0.9851063829787234
======================================
threshold: 0.76
true negative: 1854
true positive: 0
false negative: 25
false positive: 1
accuracy: 0.9861702127659574
======================================
threshold: 0.77
true negative: 1854
true positive: 0
false negative: 25
false positive: 1
accuracy: 0.9861702127659574
======================================
threshold: 0.78
true negative: 1855
true positive: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744
======================================
threshold: 0.79
true negative: 1855
true positive: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744
======================================
threshold: 0.8
true negative: 1855
true positive: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744
======================================
threshold: 0.9
true negative: 1855
true positive: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744
======================================
threshold: 1
true negative: 1855
true positive: 0
false negative: 25
false positive: 0
accuracy: 0.9867021276595744
======================================
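One caveat when reading these numbers: the labels are heavily imbalanced (25 duplicate pairs out of 1880), so from threshold 0.78 upward the model predicts no duplicates at all and still posts the best accuracy of the run. Precision, recall, and F1 expose that trade-off; a sketch computed from the confusion counts printed above:
def precision_recall_f1(tp, fp, fn):
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1

print(precision_recall_f1(14, 407, 11))  # threshold 0.5 -> (~0.033, 0.56, ~0.063)
print(precision_recall_f1(2, 17, 23))    # threshold 0.7 -> (~0.105, 0.08, ~0.091)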