Created
February 18, 2012 03:25
-
-
Save siahr/1857194 to your computer and use it in GitHub Desktop.
English Stemmer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * English stemming algorithm, based on the publication from
 * Porter (1980), "An algorithm for suffix stripping".
 *
 * This PHP code is based on the product of BaseX Team.
 * https://github.com/BaseXdb/basex/blob/master/src/main/java/org/basex/util/ft/EnglishStemmer.java
 *
 * @package Sugina
 * @author BaseX Team 2005-12, BSD License
 * @author Christian Gruen
 * @author Toshio HIRAI (Porting to PHP)
 * @license http://opensource.org/licenses/BSD-3-Clause The BSD License
 */
class EnglishStemmer {
    // A word is processed as an array of character codes. Every comparison
    // goes through TokenUtil::lc() and is made against codes such as
    // 115 ('s') or against suffix tables built with TokenUtil::toChars(), so
    // stem() presumably expects input in that same representation
    // (TokenUtil lives in a separate file - confirm there).
    //
    // Comparisons intentionally stay loose (==, !=): the return type of
    // TokenUtil::lc() is not visible from this file.

    /** Token to be stemmed (working copy, modified in place). */
    private $tok = array();
    /** Token length: number of leading elements of $tok that form the current word. */
    private $te;
    /** Stemming length: offset at which the suffix matched by the last successful e() starts. */
    private $tt;

    /* Suffixes tested by the individual steps (arrays of character codes). */
    private $AT;
    private $BL;
    private $ED;
    private $EED;
    private $IES;
    private $ING;
    private $ION;
    private $IZ;
    private $LL;
    private $SION;
    private $SSES;
    private $TION;

    /* Single characters tested as suffixes (plain character codes). */
    private $S;
    private $Y;
    private $E;
    private $L;

    /** Step 2 rules: list of array(suffix, replacement) token pairs, tested in order. */
    private $ST2 = array();
    /** Step 3 rules: list of array(suffix, replacement) token pairs, tested in order. */
    private $ST3 = array();
    /** Step 4 suffixes: list of tokens, tested in order. */
    private $ST4 = array();

    /**
     * Constructor. Builds the suffix tokens and the rule tables.
     *
     * All properties are declared on the class; creating them dynamically
     * here is deprecated as of PHP 8.2.
     */
    public function __construct() {
        /* Stemming character. */
        $this->AT = self::token("at");
        $this->BL = self::token("bl");
        $this->ED = self::token("ed");
        $this->EED = self::token("eed");
        $this->IES = self::token("ies");
        $this->ING = self::token("ing");
        $this->ION = self::token("ion");
        $this->IZ = self::token("iz");
        $this->LL = self::token("ll");
        $this->SION = self::token("sion");
        $this->SSES = self::token("sses");
        $this->TION = self::token("tion");
        $this->S = 115; // 's'
        $this->Y = 121; // 'y'
        $this->E = 101; // 'e'
        $this->L = 108; // 'l'

        // NOTE(review): the tables below are scanned top to bottom and the
        // first matching suffix wins. Because "ation" is listed before
        // "ization" and "ent" before "ment", the longer suffix can never be
        // selected, which differs from the longest-match rule in Porter's
        // paper. The order is kept unchanged so that existing stems stay
        // stable; reorder only together with a re-index of stored stems.

        /* Step 2. */
        $step2 = array(
            array("abli", "able"), array("alism", "al"), array("aliti", "al"),
            array("alli", "al"), array("anci", "ance"), array("ation", "ate"),
            array("ational", "ate"), array("ator", "ate"), array("biliti", "ble"),
            array("eli", "e"), array("enci", "ence"), array("entli", "ent"),
            array("fulness", "ful"), array("iveness", "ive"), array("iviti", "ive"),
            array("ization", "ize"), array("izer", "ize"), array("ousli", "ous"),
            array("ousness", "ous"), array("tional", "tion"),
        );
        foreach ($step2 as $rule) {
            $this->ST2[] = self::tokens($rule);
        }

        /* Step 3. */
        $step3 = array(
            array("alize", "al"), array("ative", ""), array("ful", ""),
            array("ical", "ic"), array("icate", "ic"), array("iciti", "ic"),
            array("ness", ""),
        );
        foreach ($step3 as $rule) {
            $this->ST3[] = self::tokens($rule);
        }

        /* Step 4. */
        $this->ST4 = self::tokens(
            array(
                "able", "al", "ance", "ant", "ate", "ement", "ence", "ent", "er", "ible",
                "ic", "ism", "iti", "ive", "ize", "ment", "ou", "ous", "sion", "tion",
            )
        );
    }

    /**
     * Stems a word.
     *
     * When the word length is unchanged the whole working copy is returned;
     * it can still differ from the input (e.g. a trailing "y" rewritten to
     * "i"). Otherwise the first $te elements are returned.
     *
     * @param array $str input word to stem (list of character codes)
     * @return array the stem of the word
     */
    public function stem(array $str) {
        $this->te = count($str);
        $this->tok = $str;
        if (!$this->s()) {
            return $this->tok;
        }
        return array_slice($this->tok, 0, $this->te);
    }

    /**
     * Stems the current word by running steps 1 to 5 in order.
     * Words shorter than three characters are left untouched.
     *
     * @return boolean true if the word length differs from the input length
     */
    private function s() {
        if ($this->te < 3) return false;

        $this->step1();
        $this->applyRules($this->ST2); // step 2
        $this->applyRules($this->ST3); // step 3
        $this->step4();
        $this->step5();

        return $this->te != count($this->tok);
    }

    /**
     * Step 1: plural endings, "eed"/"ed"/"ing" endings and trailing "y".
     */
    private function step1() {
        // 1a: "sses" -> "ss", "ies" -> "i", "s" -> "" unless preceded by another 's'
        if ($this->e($this->S)) {
            if ($this->e($this->SSES) || $this->e($this->IES)) {
                $this->te -= 2;
            } else if ($this->l($this->te - 2) != 115) { // 's'
                --$this->te;
            }
        }

        // 1b: "eed" -> "ee" if m() > 0; else strip "ed"/"ing" if the stem has a vowel
        if ($this->e($this->EED)) {
            if ($this->m() > 0) --$this->te;
        } else if (($this->e($this->ED) || $this->e($this->ING)) && $this->v()) {
            $this->te = $this->tt;
            if ($this->e($this->AT) || $this->e($this->BL) || $this->e($this->IZ)) {
                // e() moved $tt to the start of the matched suffix; reset it so
                // that ac() appends after the suffix instead of replacing it
                $this->tt = $this->te;
                $this->ac(101); // 'e'
            } else if ($this->te > 1) {
                $last = $this->l($this->te - 1);
                // doubled final letter other than 'l', 's', 'z': drop one
                if ($last == $this->l($this->te - 2) && $last != 108 && $last != 115 && $last != 122) {
                    --$this->te;
                } else if ($this->m() == 1) {
                    // here $tt == $te, so m() measures the whole remaining word
                    if ($this->c($this->te)) $this->ac(101); // 'e'
                }
            }
        }

        // 1c: trailing "y" -> "i" if the stem contains a vowel
        if ($this->e($this->Y) && $this->v()) $this->ac(105); // 'i'
    }

    /**
     * Steps 2 and 3: finds the first rule whose suffix matches and, if the
     * stem before it has a measure above zero, replaces the suffix.
     * Only the first matching rule is considered.
     *
     * @param array $rules list of array(suffix, replacement) token pairs
     */
    private function applyRules(array $rules) {
        foreach ($rules as $rule) {
            if ($this->e($rule[0])) {
                if ($this->m() > 0) $this->at($rule[1]);
                break;
            }
        }
    }

    /**
     * Step 4: removes "ion" after 's'/'t', or the first matching ST4 suffix,
     * when the stem before it has a measure above one.
     */
    private function step4() {
        if (($this->e($this->TION) || $this->e($this->SION)) && $this->e($this->ION) && $this->m() > 1) {
            $this->te -= 3;
            return;
        }
        foreach ($this->ST4 as $suffix) {
            if ($this->e($suffix)) {
                if ($this->m() > 1) $this->te = $this->tt;
                break;
            }
        }
    }

    /**
     * Step 5: drops a final "e" and reduces a final "ll" to "l",
     * depending on the measure of the stem.
     */
    private function step5() {
        if ($this->e($this->E)) {
            $m = $this->m();
            // c(te - 1) runs the cvc test on the stem in front of the final 'e'
            if ($m > 1 || ($m == 1 && !$this->c($this->te - 1))) --$this->te;
        }
        if ($this->e($this->LL) && $this->e($this->L) && $this->m() > 1) --$this->te;
    }

    /**
     * Checks for the cvc pattern: the three characters in front of position
     * $l are consonant, vowel, consonant and the last one is not w, x or y.
     *
     * @param integer $l position (exclusive end of the tested range)
     * @return boolean result of check
     */
    private function c($l) {
        if ($l < 3) return false;
        $last = $this->l($l - 1);
        if ($last == 119 || $last == 120 || $last == 121) return false; // 'w', 'x', 'y'
        return !$this->vt($l - 1) && $this->vt($l - 2) && !$this->vt($l - 3);
    }

    /**
     * Suffix test for a token. On success $tt is set to the offset at which
     * the suffix starts; on failure $tt is left untouched.
     *
     * @param mixed $s suffix: array of character codes, or a single character code
     * @return boolean result of check
     */
    private function e($s) {
        if (is_array($s)) {
            $len = count($s);
            $start = $this->te - $len;
            if ($start < 0) return false;
            for ($i = 0; $i < $len; ++$i) {
                if ($this->l($start + $i) != $s[$i]) return false;
            }
            $this->tt = $start;
            return true;
        }

        $start = $this->te - 1;
        if ($start < 0 || $this->l($start) != $s) return false;
        $this->tt = $start;
        return true;
    }

    /**
     * Returns word measure: the number of vowel-run to consonant
     * transitions within the first $tt characters.
     *
     * @return integer measure
     */
    private function m() {
        $count = 0;
        $inVowelRun = false;
        for ($i = 0; $i < $this->tt; ++$i) {
            $vowel = $this->vt($i);
            if ($inVowelRun && !$vowel) ++$count; // a vowel run just ended
            $inVowelRun = $vowel;
        }
        return $count;
    }

    /**
     * Vowel test: checks whether the first $tt characters contain a vowel.
     *
     * @return boolean result of check
     */
    private function v() {
        for ($i = 0; $i < $this->tt; ++$i) {
            if ($this->vt($i)) return true;
        }
        return false;
    }

    /**
     * Vowel test for a single position. a, e, i, o, u are vowels; y is a
     * vowel when it is not the first character and the character in front
     * of it is not a vowel.
     *
     * @param integer $p position
     * @return boolean result of check
     */
    private function vt($p) {
        $c = $this->l($p);
        // 'a', 'e', 'i', 'o', 'u'
        if ($c == 97 || $c == 101 || $c == 105 || $c == 111 || $c == 117) return true;
        return $c == 121 && $p != 0 && !$this->vt($p - 1); // 'y'
    }

    /**
     * Returns the lower character at the specified position.
     *
     * @param integer $p position
     * @return integer result of TokenUtil::lc() for that character
     */
    private function l($p) {
        return TokenUtil::lc($this->tok[$p]);
    }

    /**
     * Adds a character: truncates the word to $tt and appends $c.
     *
     * @param integer $c character
     */
    private function ac($c) {
        $this->te = $this->tt;
        $this->tok[$this->te++] = $c;
    }

    /**
     * Adds a token: truncates the word to $tt and appends all characters of $t.
     *
     * @param array $t token
     */
    private function at($t) {
        $this->te = $this->tt;
        foreach ($t as $c) {
            $this->tok[$this->te++] = $c;
        }
    }

    /**
     * Converts a string to chars array.
     * All strings should be converted by this function to guarantee
     * a consistent character conversion.
     *
     * @param string $str string to be converted
     * @return array chars array
     */
    private static function token($str) {
        return TokenUtil::toChars($str, "UTF-8");
    }

    /**
     * Converts the specified strings to tokens.
     *
     * @param array $strs strings
     * @return array tokens array
     */
    private static function tokens(array $strs) {
        $tokens = array();
        foreach ($strs as $str) {
            $tokens[] = self::token($str);
        }
        return $tokens;
    }
}
?>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage:
This code returns "token".
TokenUtil.php can be found here:
https://gist.github.com/1857186