Skip to content

Instantly share code, notes, and snippets.

@jtejido
Created January 8, 2018 13:29
Show Gist options
  • Save jtejido/b7469ef107062f00048ec32bed08a253 to your computer and use it in GitHub Desktop.
Save jtejido/b7469ef107062f00048ec32bed08a253 to your computer and use it in GitHub Desktop.
InverseDocumentFrequency extension for php-nlp-tools
<?php
namespace App\Utilities;
use NlpTools\FeatureFactories\DataAsFeatures;
use NlpTools\Documents\TrainingSet;
/**
 * Computes inverse document frequency (IDF) weights for every term found in
 * a training set.
 *
 * Three weighting schemes are available, selected through the constructor's
 * $mode argument (D = number of documents, df = number of documents that
 * contain the term):
 *
 *  - FREQUENCY_MODE:     log(D / df)
 *  - SMOOTH_MODE:        log(1 + D / df)
 *  - PROBABILISTIC_MODE: log((D - df) / df)
 */
class InverseDocumentFrequency
{
    const FREQUENCY_MODE = 1;
    const SMOOTH_MODE = 2;
    const PROBABILISTIC_MODE = 3;

    // The three properties below used to be created dynamically inside the
    // constructor (deprecated since PHP 8.2). They are declared public so
    // that any caller which read them before keeps working.

    /** @var int Weighting scheme passed to the constructor. */
    public $mode;

    /** @var array Map of term => IDF weight. */
    public $idf = [];

    /** @var float log(D); returned by getIdf() for terms that were never seen. */
    public $logD;

    /**
     * Count, for every term, the number of documents it occurs in and turn
     * those counts into IDF weights.
     *
     * @param TrainingSet $tset The documents to collect statistics from.
     * @param int         $mode One of the *_MODE constants. The comparison is
     *                          strict (===): any other value, including a
     *                          numeric string such as "2", leaves the raw
     *                          document counts in place of weights.
     */
    public function __construct(TrainingSet $tset, $mode = self::FREQUENCY_MODE)
    {
        $this->mode = $mode;
        $this->idf = [];

        $ff = new DataAsFeatures();
        $tset->setAsKey(TrainingSet::CLASS_AS_KEY);

        foreach ($tset as $class => $doc) {
            // array_fill_keys() collapses duplicates, so each term is counted
            // at most once per document (document frequency, not term frequency).
            $tokens = array_fill_keys($ff->getFeatureArray($class, $doc), 1);
            foreach ($tokens as $token => $v) {
                if (isset($this->idf[$token])) {
                    $this->idf[$token]++;
                } else {
                    $this->idf[$token] = 1;
                }
            }
        }

        $D = count($tset);

        // Assign through the key rather than iterating by reference, so no
        // dangling reference to the last element is left behind.
        foreach ($this->idf as $token => $df) {
            $this->idf[$token] = $this->computeWeight($D, $df);
        }

        $this->logD = log($D);
    }

    /**
     * Return the IDF weight of a term.
     *
     * @param string $term The term to look up.
     *
     * @return float The stored weight, or log(D) when the term did not occur
     *               in the training set (regardless of the selected mode).
     */
    public function getIdf($term)
    {
        if (isset($this->idf[$term])) {
            return $this->idf[$term];
        }

        return $this->logD;
    }

    /**
     * Convert a document frequency into a weight according to $this->mode.
     *
     * @param int $D  Total number of documents.
     * @param int $df Number of documents containing the term (always >= 1).
     *
     * @return float|int The weight, or $df unchanged for an unrecognised mode.
     */
    private function computeWeight($D, $df)
    {
        if ($this->mode === self::SMOOTH_MODE) {
            return log(1 + ($D / $df));
        }

        if ($this->mode === self::FREQUENCY_MODE) {
            return log($D / $df);
        }

        if ($this->mode === self::PROBABILISTIC_MODE) {
            // NOTE: a term present in every document evaluates log(0) here.
            return log(($D - $df) / $df);
        }

        return $df;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment