Created
January 11, 2018 14:34
-
-
Save jtejido/95c666dcb884f49151f9f842241809bd to your computer and use it in GitHub Desktop.
VectorSpaceModel extension for php-nlp-tools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace App\Utilities\Search; | |
use App\Library\InverseDocumentFrequency; | |
use App\Utilities\Search\TfIdfFeatureFactory; | |
use NlpTools\Documents\TrainingSet; | |
/**
 * Vector Space Model is a class for calculating relevance ranking by comparing the angle between each
 * document vector and the original query vector.
 *
 * php-nlp-tools' own cosine similarity implementation could be used instead, but it throws an exception
 * when the product of the vector norms is 0, whereas this class returns 0 in that case.
 *
 * https://en.wikipedia.org/wiki/Vector_space_model#Example:_tf-idf_weights
 *
 * @author Jericko Tejido <[email protected]>
 */
class VectorSpaceModel
{
    /**
     * Document collection that queries are scored against.
     *
     * @var TrainingSet
     */
    protected $tset;

    /**
     * @param TrainingSet $tset Documents to be ranked; also passed to InverseDocumentFrequency
     *                          when scores are computed.
     */
    public function __construct(TrainingSet $tset)
    {
        $this->tset = $tset;
    }

    /**
     * Returns the cosine-similarity score of every stored document against the query,
     * sorted in descending order (highest score first).
     *
     * Only the first document of $qset (offset 0) is used as the query. Scores are keyed
     * by each document's getClass() value, so documents sharing the same class overwrite
     * one another and only the last one is kept.
     *
     * @param TrainingSet $qset Set whose first document is the query.
     * @return array Map of document class => similarity score, highest first.
     */
    public function getScores(TrainingSet $qset)
    {
        $score = array();

        $docs_idf = new InverseDocumentFrequency($this->tset);
        $ff = new TfIdfFeatureFactory(
            $docs_idf,
            array(
                function ($c, $d) {
                    return $d->getDocumentData();
                }
            )
        );

        // Count once instead of on every loop test.
        $docCount = count($this->tset);
        if ($docCount === 0) {
            // Nothing to rank; as before, the query set is not touched in this case.
            return $score;
        }

        // The query vector does not depend on the document being scored, so build it once.
        // NOTE(review): assumes getFeatureArray() has no side effects between calls - confirm.
        $query = $ff->getFeatureArray("", $qset->offsetGet(0));

        for ($i = 0; $i < $docCount; $i++) {
            $document = $this->tset->offsetGet($i);
            $features = $ff->getFeatureArray("", $document);
            $score[$document->getClass()] = $this->getSimilarity($query, $features);
        }

        // arsort() orders by value from high to low and keeps the class keys.
        arsort($score);

        return $score;
    }

    /**
     * Returns the Euclidean norm of a vector.
     *
     * @param array $vector
     * @return float
     */
    private function norm(array $vector)
    {
        return sqrt($this->dotProduct($vector, $vector));
    }

    /**
     * Returns the dot product of two sparse vectors given as key => weight arrays.
     *
     * Only keys whose value is non-empty in both arrays contribute; a key missing from
     * either side counts as 0.
     *
     * @param array $a
     * @param array $b
     * @return mixed Sum of the pairwise products (0 when no key overlaps).
     */
    private function dotProduct(array $a, array $b)
    {
        $dotProduct = 0;

        // A single pass over $a is enough: a key absent from $a can never contribute.
        foreach ($a as $key => $value) {
            if (!empty($value) && !empty($b[$key])) {
                $dotProduct += $value * $b[$key];
            }
        }

        return $dotProduct;
    }

    /**
     * Returns cos(theta) between two non-normalised vectors:
     * sim(a, b) = (a . b) / (||a|| * ||b||)
     *
     * @param array $a
     * @param array $b
     * @return mixed The cosine similarity, or 0 when the product of the norms is 0.
     */
    private function getSimilarity(array $a, array $b)
    {
        $denominator = $this->norm($a) * $this->norm($b);

        // Guard against division by zero when either vector has no non-empty weights.
        if ($denominator == 0) {
            return 0;
        }

        return $this->dotProduct($a, $b) / $denominator;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment