Created
January 8, 2018 13:29
-
-
Save jtejido/b7469ef107062f00048ec32bed08a253 to your computer and use it in GitHub Desktop.
InverseDocumentFrequency extension for php-nlp-tools
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace App\Utilities; | |
use NlpTools\FeatureFactories\DataAsFeatures; | |
use NlpTools\Documents\TrainingSet; | |
/**
 * Computes an inverse-document-frequency (IDF) weight for every token that
 * appears in a training set.
 *
 * For each token the constructor first counts the number of documents that
 * contain it (its document frequency, "df") and then, with D being
 * count($tset), converts that count into a weight according to the mode:
 *
 *  - FREQUENCY_MODE:     log(D / df)
 *  - SMOOTH_MODE:        log(1 + D / df)
 *  - PROBABILISTIC_MODE: log((D - df) / df)
 *
 * Any other mode value leaves the raw document counts untouched.
 */
class InverseDocumentFrequency
{
    const FREQUENCY_MODE = 1;
    const SMOOTH_MODE = 2;
    const PROBABILISTIC_MODE = 3;

    // The three properties below used to be created on the fly inside the
    // constructor (dynamic properties, deprecated since PHP 8.2). They are
    // declared public because dynamically created properties were public and
    // existing callers may read them.

    /** @var int Weighting mode handed to the constructor. */
    public $mode;

    /** @var array Map of token => weight (or raw document count for an unrecognised mode). */
    public $idf = [];

    /** @var float log(D); the value getIdf() returns for tokens never seen. */
    public $logD;

    /**
     * Builds the token => weight table from a training set.
     *
     * @param TrainingSet $tset Documents to analyse. Its key mode is switched
     *                          to TrainingSet::CLASS_AS_KEY as a side effect.
     * @param int         $mode One of the *_MODE constants; compared strictly
     *                          (===), so the string "2" does not select
     *                          SMOOTH_MODE.
     */
    public function __construct(TrainingSet $tset, $mode = self::FREQUENCY_MODE)
    {
        $this->mode = $mode;
        // Start from an empty table so an empty training set yields an empty
        // array instead of an undefined property.
        $this->idf = [];

        $ff = new DataAsFeatures();
        $tset->setAsKey(TrainingSet::CLASS_AS_KEY);

        foreach ($tset as $class => $doc) {
            // Presumably the document's token list - confirm against
            // DataAsFeatures::getFeatureArray().
            $tokens = $ff->getFeatureArray($class, $doc);

            // Using the tokens as array keys collapses duplicates, so each
            // document contributes at most 1 to a token's count.
            foreach (array_fill_keys($tokens, 1) as $token => $unused) {
                if (isset($this->idf[$token])) {
                    $this->idf[$token]++;
                } else {
                    $this->idf[$token] = 1;
                }
            }
        }

        $D = count($tset);

        // Assign by key rather than iterating by reference, so no dangling
        // reference into $this->idf is left behind after the loop.
        foreach ($this->idf as $token => $documentCount) {
            $this->idf[$token] = $this->weigh($D, $documentCount);
        }

        $this->logD = log($D);
    }

    /**
     * Returns the weight stored for a token.
     *
     * @param string $term Token to look up.
     *
     * @return float|int The stored weight, or log(D) when the token was never
     *                   seen. The fallback is log(D) for every mode.
     */
    public function getIdf($term)
    {
        if (isset($this->idf[$term])) {
            return $this->idf[$term];
        }

        return $this->logD;
    }

    /**
     * Converts one raw document count into a weight for the configured mode.
     *
     * @param int $D             Value of count($tset).
     * @param int $documentCount Number of documents containing the token (>= 1).
     *
     * @return float|int The weight, or $documentCount unchanged when the mode
     *                   is not one of the *_MODE constants.
     */
    private function weigh($D, $documentCount)
    {
        if ($this->mode === self::SMOOTH_MODE) {
            return log(1 + ($D / $documentCount));
        }

        if ($this->mode === self::FREQUENCY_MODE) {
            return log($D / $documentCount);
        }

        if ($this->mode === self::PROBABILISTIC_MODE) {
            // NOTE(review): when the count equals $D this is log(0), i.e.
            // -INF; and it is negative when the count exceeds $D / 2.
            return log(($D - $documentCount) / $documentCount);
        }

        return $documentCount;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment