Created
January 2, 2018 02:30
-
-
Save jtejido/076c2e7623e3251eb56b718c44d64e93 to your computer and use it in GitHub Desktop.
TverskyIndexSimilarity extension for php-nlp-tools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace App\Utilities; | |
/**
 * Symmetric Tversky index between two token collections.
 *
 * @see http://en.wikipedia.org/wiki/Tversky_index
 */
class TverskyIndex
{
    /**
     * Computes the symmetric Tversky similarity of two token lists.
     *
     * The original Tversky index
     * (http://www.cogsci.ucsd.edu/~coulson/203/tversky-features.pdf) generalizes both the
     * Dice/Sorensen and the Jaccard/Tanimoto coefficients, but it is asymmetric and therefore
     * not a proper similarity metric. This implementation uses the symmetric variant by
     * Jimenez, S., Becerra, C. and Gelbukh, A. (http://aclweb.org/anthology/S/S13/S13-1028.pdf):
     *
     *   S(A, B) = |A n B| / (|A n B| + beta * (alpha * min + (1 - alpha) * max))
     *
     * where min and max are the smaller and larger of |A - B| and |B - A|.
     * For 0 <= alpha <= 1 and beta >= 0 the result lies between 0 and 1.
     *
     * Both inputs are treated as sets: duplicate tokens are collapsed because the values are
     * turned into array keys. Tokens must therefore be usable as array keys (strings or ints).
     *
     * @param array     $A     First list of tokens (taken by reference, never modified).
     * @param array     $B     Second list of tokens (taken by reference, never modified).
     * @param float|int $alpha Balance between the smaller and the larger set difference.
     * @param float|int $beta  Overall weight of the set differences in the denominator.
     *
     * @return float|int Similarity score; 1.0 when both inputs are empty, 0.0 when the
     *                   denominator is zero for any other reason.
     */
    public function similarity(&$A, &$B, $alpha = 0.5, $beta = 1)
    {
        // Use the tokens as keys so the *_key functions perform set operations.
        $a = array_fill_keys($A, 1);
        $b = array_fill_keys($B, 1);

        $onlyInA = count(array_diff_key($a, $b));
        $onlyInB = count(array_diff_key($b, $a));
        $min = min($onlyInA, $onlyInB);
        $max = max($onlyInA, $onlyInB);
        $intersect = count(array_intersect_key($a, $b));

        $denominator = $intersect + ($beta * ($alpha * $min + $max * (1 - $alpha)));

        // A zero denominator occurs for two empty inputs, or for disjoint inputs when the
        // difference term is weighted away (e.g. $beta = 0). Dividing would raise
        // DivisionByZeroError on PHP 8, so resolve the degenerate cases explicitly.
        if ((float) $denominator === 0.0) {
            // Two empty sets are identical; anything else shares no tokens.
            return ($onlyInA === 0 && $onlyInB === 0) ? 1.0 : 0.0;
        }

        return $intersect / $denominator;
    }

    /**
     * Computes the Tversky distance, defined as 1 minus the similarity.
     *
     * @param array     $A     First list of tokens.
     * @param array     $B     Second list of tokens.
     * @param float|int $alpha Passed through to similarity().
     * @param float|int $beta  Passed through to similarity().
     *
     * @return float|int Distance score (0 for two empty inputs).
     */
    public function dist(&$A, &$B, $alpha = 0.5, $beta = 1)
    {
        return 1 - $this->similarity($A, $B, $alpha, $beta);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment