Skip to content

Instantly share code, notes, and snippets.

@jtejido
Created January 2, 2018 02:30
Show Gist options
  • Save jtejido/076c2e7623e3251eb56b718c44d64e93 to your computer and use it in GitHub Desktop.
Save jtejido/076c2e7623e3251eb56b718c44d64e93 to your computer and use it in GitHub Desktop.
TverskyIndexSimilarity extension for php-nlp-tools
<?php
namespace App\Utilities;
/**
 * Symmetric Tversky index between two collections of tokens, treated as sets.
 *
 * @see http://en.wikipedia.org/wiki/Tversky_index
 */
class TverskyIndex
{
    /**
     * Computes the symmetric Tversky similarity of two token collections.
     *
     * The original index (http://www.cogsci.ucsd.edu/~coulson/203/tversky-features.pdf)
     * generalizes both the Dice/Sorensen and Jaccard/Tanimoto coefficients, but it is
     * not a similarity metric because of its inherent asymmetry. The variant used here
     * was made symmetrical by Jimenez, S., Becerra, C. and Gelbukh, A.
     * (http://aclweb.org/anthology/S/S13/S13-1028.pdf):
     *
     *   S(A, B) = |A n B| / (|A n B| + beta * (alpha * min + (1 - alpha) * max))
     *
     * where min and max are the smaller and larger of |A \ B| and |B \ A|.
     *
     * Duplicate elements are collapsed because the elements are used as array keys,
     * so they must be valid array keys (strings or integers).
     *
     * @param array     $A     First collection of tokens (taken by reference, never modified).
     * @param array     $B     Second collection of tokens (taken by reference, never modified).
     * @param float|int $alpha Balance between the smaller and the larger set difference.
     * @param float|int $beta  Overall weight of the set differences.
     *
     * @return float|int The similarity; between 0 and 1 when 0 <= alpha <= 1 and beta >= 0.
     *                   When the denominator is zero the result is 1.0 if both collections
     *                   are empty (they are identical) and 0.0 otherwise.
     */
    public function similarity(&$A, &$B, $alpha = 0.5, $beta = 1)
    {
        // Turn the elements into keys so that the *_key set functions can be used.
        $a = array_fill_keys($A, 1);
        $b = array_fill_keys($B, 1);

        // Each one-sided difference is needed for both min and max: compute it once.
        $onlyInA = count(array_diff_key($a, $b));
        $onlyInB = count(array_diff_key($b, $a));
        $min = min($onlyInA, $onlyInB);
        $max = max($onlyInA, $onlyInB);

        $intersect = count(array_intersect_key($a, $b));
        $denominator = $intersect + ($beta * ($alpha * $min + $max * (1 - $alpha)));

        // Avoid a division by zero: this happens when both inputs are empty, or when
        // nothing is shared and the differences carry no weight (e.g. beta = 0).
        if ((float) $denominator === 0.0) {
            return (count($a) === 0 && count($b) === 0) ? 1.0 : 0.0;
        }

        return $intersect / $denominator;
    }

    /**
     * Computes the Tversky distance, defined as 1 minus the similarity.
     *
     * @param array     $A     First collection of tokens.
     * @param array     $B     Second collection of tokens.
     * @param float|int $alpha Forwarded unchanged to similarity().
     * @param float|int $beta  Forwarded unchanged to similarity().
     *
     * @return float|int 1 - similarity($A, $B, $alpha, $beta)
     */
    public function dist(&$A, &$B, $alpha = 0.5, $beta = 1)
    {
        return 1 - $this->similarity($A, $B, $alpha, $beta);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment