Created
September 16, 2019 06:56
-
-
Save Deepayan137/b0301274edc8e0eb11181ce743ec0544 to your computer and use it in GitHub Desktop.
word prediction and ground truth alignment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter, defaultdict | |
from textdistance import levenshtein as lev | |
import numpy as np | |
import pdb | |
from tqdm import * | |
def CharMajVoting(words):
    """Build a consensus word by per-position character majority vote.

    The length of the result is the most common length among ``words``.
    Each output position holds the character that occurs most often at
    that index, counted over every word long enough to have that index.
    Ties (for the length and for a character) go to the candidate that
    appears first in ``words``.

    Args:
        words: Iterable of strings (e.g. several predictions of one word).

    Returns:
        The consensus string, or ``''`` when ``words`` is empty.
    """
    # The input is traversed more than once, so materialize it first;
    # this also makes one-shot iterators work.
    words = list(words)
    if not words:
        # Nothing to vote on; avoids indexing an empty most_common() result.
        return ''

    def most_frequent(items):
        # Counter.most_common orders equal counts by first occurrence.
        return Counter(items).most_common(1)[0][0]

    common_length = most_frequent(len(word) for word in words)

    # Only positions below common_length are kept, so only those are voted
    # on. At least one word has exactly common_length characters, so every
    # position receives at least one vote.
    return ''.join(
        most_frequent(word[i] for word in words if len(word) > i)
        for i in range(common_length)
    )
def similarity(word1, word2):
    """Return the normalized Levenshtein distance between two words.

    NOTE: despite the function name, the value returned is a *distance*
    (the result of ``lev.normalized_distance``), not a similarity score.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        Whatever ``lev.normalized_distance(word1, word2)`` yields.
    """
    distance = lev.normalized_distance(word1, word2)
    return distance
def text_align(prWords, gtWords):
    """Pair every predicted word with its closest ground-truth word.

    A ``len(prWords) x len(gtWords)`` matrix is filled with the value of
    ``similarity(pr, gt)`` for each pair, and each predicted word is then
    matched with the ground-truth word holding the smallest value in its
    row. When several ground-truth words share the minimum, the one that
    comes first in ``gtWords`` is chosen.

    Args:
        prWords: Sequence of predicted words.
        gtWords: Sequence of ground-truth words.

    Returns:
        A list of ``(predicted_word, matched_ground_truth_word)`` tuples,
        one per entry of ``prWords`` and in the same order. Empty when
        ``prWords`` is empty.

    Raises:
        IndexError: If ``prWords`` is non-empty but ``gtWords`` is empty.
    """
    row, col = len(prWords), len(gtWords)
    if row and not col:
        # Kept as IndexError for backward compatibility: the previous
        # implementation failed with an IndexError in this situation.
        raise IndexError("text_align: gtWords is empty, nothing to align to")

    adjMat = np.zeros((row, col), dtype=float)
    # trange shows a progress bar over the predicted words.
    for i in trange(row):
        predicted = prWords[i]
        for j, truth in enumerate(gtWords):
            adjMat[i, j] = similarity(predicted, truth)

    if not row:
        return []

    # Only the single nearest neighbour is needed, so take the row-wise
    # argmin instead of fully sorting every row.
    nearest = adjMat.argmin(axis=1)
    return [(predicted, gtWords[j]) for predicted, j in zip(prWords, nearest)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment