Last active
October 25, 2024 03:48
-
-
Save jtejido/6e07a6b6670786c79877780f9532ade5 to your computer and use it in GitHub Desktop.
Okapi BM25 extension for php-nlp-tools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace App\Utilities\Search; | |
use App\Library\TermFrequency; | |
use NlpTools\Documents\TrainingSet; | |
/** | |
* BM25 is a class for ranking documents against a query. | |
* | |
* The implementation is based on the paper by Stephen E. Robertson, Steve Walker, Susan Jones, | |
* Micheline Hancock-Beaulieu & Mike Gatford (November 1994). | |
* that can be found at http://trec.nist.gov/pubs/trec3/t3_proceedings.html. | |
* | |
* Some modifications have been made to allow for non-negative scoring as suggested here. | |
* https://doc.rero.ch/record/16754/files/Dolamic_Ljiljana_-_When_Stopword_Lists_Make_the_Difference_20091218.pdf | |
* | |
* We also made use of a delta(δ) value of 1, which modifies BM25 to account for an issue against penalizing | |
* long documents and allowing shorter ones to dominate. The delta values assures BM25 to be lower-bounded. | |
* http://sifaka.cs.uiuc.edu/~ylv2/pub/cikm11-lowerbound.pdf | |
* | |
* An addition for extending current implementation of FreqDist is needed (which has additional options for SMART tf-idf | |
* notations). | |
* | |
* | |
* Example usage: | |
* $english = Normalizer::factory(); | |
* $tset = new TrainingSet(); | |
* $tset->addDocument( | |
* "id1", | |
* new TokensDocument( | |
* $english->normalizeAll(array('a','whale','is','a','fish')) | |
* ) | |
* ); | |
* | |
* $tset->addDocument( | |
* "id2", | |
* new TokensDocument( | |
* $english->normalizeAll(array('a','Whale','is','a','mammal')) | |
* ) | |
* ); | |
* $tset->addDocument( | |
* "id3", | |
* new TokensDocument( | |
* $english->normalizeAll(array('a','man','is','a','mammal')) | |
* ) | |
* ); | |
* | |
* $bm25 = new BM25($tset); | |
* | |
* $scores = $bm25->getScores('whale mammal'); | |
* | |
* | |
* @author Jericko Tejido <[email protected]> | |
*/ | |
class BM25 | |
{ | |
const B = 0.75; | |
const K = 1.3; | |
const D = 1; | |
const COUNT_MODE = 5; | |
const type = array('bm25', 'bm25p'); | |
protected $tset; | |
protected $b; | |
protected $k; | |
protected $d; | |
protected $type; | |
public function __construct(TrainingSet $tset, $type = null, $b = self::B, $k = self::K, $d = self::D) | |
{ | |
$this->tset = $tset; | |
$this->b = $b; | |
$this->k = $k; | |
$this->type = $type; | |
if(strtolower($this->type) === null) { | |
$this->type = 'bm25'; | |
$this->d = 0; | |
} elseif(strtolower($this->type) === 'bm25') { | |
$this->d = 0; | |
} elseif(strtolower($this->type) === 'bm25p') { | |
$this->d = $d; | |
} | |
elseif(!in_array(strtolower($this->type),self::type)){ | |
throw new \InvalidArgumentException( | |
"The type is not accepted." | |
); | |
} | |
} | |
/** | |
* Returns Idf(Inverse Document Frequency). | |
* log{1+[(n−dfj +0.5)/(dfj +0.5)]} | |
* To avoid negative results when the underlying term tj occurs in more than half of | |
* the documents (dfj > n/2) we add 1 before getting log(). | |
* | |
* @param int $nqi | |
* @return float | |
*/ | |
private function getIdf($nqi) | |
{ | |
return log(1 + ((count($this->tset)-$nqi+0.5)/($nqi + 0.5))); | |
} | |
/** | |
* Returns Tf(Term Frequency). | |
* | |
* @param array $document | |
* @param string $term | |
* @return float | |
*/ | |
private function getTf($document, $term) | |
{ | |
$freqDist = new TermFrequency($document, self::COUNT_MODE); | |
return $freqDist->getTf($term); | |
} | |
/** | |
* Returns Okapi BM25's Scoring value per added document. | |
* BM25 = idf*( ($tf * (k + 1))/($tf + k * (1-b+b*(D/Av_D))) ) | |
* BM25+ = idf*( ($tf * (k + 1))/($tf + k * (1-b+b*(D/Av_D))) + δ) | |
* | |
* @param string $term | |
* @param float $idf | |
* @param int $allLength | |
* @param array $score | |
* @return array | |
*/ | |
private function getBm25($tf, $idf, $length, $avg_dl) | |
{ | |
$num = $tf * ($this->k + 1); | |
$denom = $tf + $this->k * (1 - $this->b + $this->b * ($length / $avg_dl)); | |
return $idf * (($num / $denom) + $this->d); | |
} | |
/** | |
* Returns Score ranking per Documents added by ascending order. | |
* | |
* @param string $term | |
* @return array | |
*/ | |
public function getScores(array $terms) | |
{ | |
$allLength = $this->getAllLength(); | |
$score = array(); | |
foreach($terms as $term) { | |
$nqi = $this->getNQi($term); | |
$idf = $this->getIdf($nqi); | |
foreach ($this->tset as $class=>$doc) { | |
$score[$class] = isset($score[$class]) ? $score[$class] : 0; | |
$tf = $this->getTf($doc->getDocumentData(), $term); | |
$length = count($doc->getDocumentData()); | |
$avg_dl = $length/$allLength; | |
$score[$class] += $this->getBm25($tf, $idf, $length, $avg_dl); | |
} | |
} | |
arsort($score); | |
return $score; | |
} | |
/** | |
* Returns number of documents containing the query word. | |
* | |
* @param string $term | |
* @return int | |
*/ | |
private function getNQi($term) | |
{ | |
$nqi = 0; | |
foreach ($this->tset as $class=>$doc) { | |
if(in_array($term, $doc->getDocumentData())){ | |
$nqi++; | |
} | |
} | |
return $nqi; | |
} | |
/** | |
* Returns number of words on all added documents. | |
* @return int | |
*/ | |
private function getAllLength() | |
{ | |
$allLength = 0; | |
foreach ($this->tset as $class=>$doc) { | |
$allLength += count($doc->getDocumentData()); | |
} | |
return $allLength; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment