Skip to content

Instantly share code, notes, and snippets.

@jtejido
Created January 11, 2018 14:34
Show Gist options
  • Save jtejido/95c666dcb884f49151f9f842241809bd to your computer and use it in GitHub Desktop.
Save jtejido/95c666dcb884f49151f9f842241809bd to your computer and use it in GitHub Desktop.
VectorSpaceModel extension for php-nlp-tools
<?php
namespace App\Utilities\Search;
use App\Library\InverseDocumentFrequency;
use App\Utilities\Search\TfIdfFeatureFactory;
use NlpTools\Documents\TrainingSet;
/**
* Vector Space Model is a class for calculating relevance ranking by comparing the angle between each
* document vector and the original query vector.
*
* php-nlp-tools' existing cosine similarity implementation could be used instead, but it throws an exception when the
* vector product is 0 rather than returning 0.
*
* https://en.wikipedia.org/wiki/Vector_space_model#Example:_tf-idf_weights
*
* @author Jericko Tejido <[email protected]>
*/
class VectorSpaceModel
{
    /**
     * Documents that queries are ranked against.
     *
     * @var TrainingSet
     */
    protected $tset;

    /**
     * @param TrainingSet $tset documents to rank; each document's class (getClass()) becomes its key in the
     *                          score list, so documents sharing a class overwrite each other's score
     */
    public function __construct(TrainingSet $tset)
    {
        $this->tset = $tset;
    }

    /**
     * Scores every document of the training set against the query and returns the scores, best match first.
     *
     * Both the query and the documents are turned into tf-idf feature arrays by the same feature factory,
     * then compared with cosine similarity.
     *
     * @param TrainingSet $qset query set; only its first document (offset 0) is used as the query
     * @return array similarity scores keyed by document class, sorted in descending order
     * @throws \InvalidArgumentException if there are documents to rank but $qset is empty
     */
    public function getScores(TrainingSet $qset)
    {
        $total = count($this->tset);
        if ($total === 0) {
            // Nothing to rank, so the query is never looked at.
            return array();
        }
        if (count($qset) === 0) {
            throw new \InvalidArgumentException('The query set must contain at least one document.');
        }

        $ff = new TfIdfFeatureFactory(
            new InverseDocumentFrequency($this->tset),
            array(
                function ($c, $d) {
                    return $d->getDocumentData();
                },
            )
        );

        // The query vector does not depend on the document being scored, so build it once.
        $query = $ff->getFeatureArray("", $qset->offsetGet(0));

        $score = array();
        for ($i = 0; $i < $total; $i++) {
            $document = $this->tset->offsetGet($i);
            $id = $document->getClass();
            $score[$id] = $this->getSimilarity($query, $ff->getFeatureArray("", $document));
        }

        // arsort keeps the class keys and orders by score, highest first.
        arsort($score);

        return $score;
    }

    /**
     * Returns the Euclidean norm of a vector, i.e. the square root of its dot product with itself.
     *
     * @param array $vector
     * @return float
     */
    private function norm(array $vector)
    {
        return sqrt($this->dotProduct($vector, $vector));
    }

    /**
     * Returns the dot product of two sparse vectors given as key => weight arrays.
     *
     * Only keys present with a non-empty value in both arrays contribute; a key missing from either
     * side is treated as a zero component.
     *
     * @param array $a
     * @param array $b
     * @return int|float
     */
    private function dotProduct(array $a, array $b)
    {
        $dotProduct = 0;

        // Walking $a alone is enough: a key that is absent from $a can never contribute.
        foreach ($a as $key => $value) {
            if (!empty($value) && !empty($b[$key])) {
                $dotProduct += $value * $b[$key];
            }
        }

        return $dotProduct;
    }

    /**
     * Returns cos(theta) between two non-normalised vectors:
     * sim(a, b) = (a・b) / (||a|| * ||b||)
     *
     * @param array $a
     * @param array $b
     * @return int|float the cosine similarity, or 0 when either vector has zero norm
     */
    private function getSimilarity(array $a, array $b)
    {
        $denominator = $this->norm($a) * $this->norm($b);

        // A zero-norm vector has no direction; report "no similarity" instead of dividing by zero.
        if ($denominator == 0) {
            return 0;
        }

        return $this->dotProduct($a, $b) / $denominator;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment