siahr · February 18, 2012 03:25 · siahr · Feb 18, 2012
diff --git a/EnglishStemmer.php b/EnglishStemmer.php
 <?php
 /**
 * English stemming algorithm, based on the publication from
 * Porter (1980), "An algorithm for suffix stripping".
 *
 * This PHP code is based on the product of BaseX Team.
 * https://github.com/BaseXdb/basex/blob/master/src/main/java/org/basex/util/ft/EnglishStemmer.java
 *
 * @package Sugina
 * @author BaseX Team 2005-12, BSD License
 * @author Christian Gruen
 * @author Toshio HIRAI (Porting to PHP)
 * @license http://opensource.org/licenses/BSD-3-Clause The BSD License
 */

 class EnglishStemmer {
 	/** Token to be stemmed, held as an array of character codes. */
 	private $tok = array();
 	/** Token length: number of leading entries of $tok that form the current word. */
 	private $te;
 	/** Stemming length: offset in $tok where the most recently matched suffix starts. */
 	private $tt;

 	/*
 	 * Suffix tokens and rule tables, filled in by the constructor. They are
 	 * declared explicitly because assigning undeclared (dynamic) properties
 	 * is deprecated as of PHP 8.2.
 	 */
 	private $AT;
 	private $BL;
 	private $ED;
 	private $EED;
 	private $IES;
 	private $ING;
 	private $ION;
 	private $IZ;
 	private $LL;
 	private $SION;
 	private $SSES;
 	private $TION;
 	/** Character code of "s". */
 	private $S;
 	/** Character code of "y". */
 	private $Y;
 	/** Character code of "e". */
 	private $E;
 	/** Character code of "l". */
 	private $L;
 	/** Step 2 rules: list of array(suffix token, replacement token). */
 	private $ST2;
 	/** Step 3 rules: list of array(suffix token, replacement token). */
 	private $ST3;
 	/** Step 4 rules: list of suffix tokens that are removed. */
 	private $ST4;

 	/**
 	 * Constructor. Builds the suffix tokens and the rule tables of steps 2 to 4.
 	 */
 	public function __construct() {
 		/* Stemming character. */
 		$this->AT = self::token("at");
 		$this->BL = self::token("bl");
 		$this->ED = self::token("ed");
 		$this->EED = self::token("eed");
 		$this->IES = self::token("ies");
 		$this->ING = self::token("ing");
 		$this->ION = self::token("ion");
 		$this->IZ = self::token("iz");
 		$this->LL = self::token("ll");
 		$this->SION = self::token("sion");
 		$this->SSES = self::token("sses");
 		$this->TION = self::token("tion");

 		$this->S = 115;
 		$this->Y = 121;
 		$this->E = 101;
 		$this->L = 108;

 		/*
 		 * Step 2: array(suffix, replacement) pairs. s() applies only the first
 		 * matching suffix, so a suffix that ends with another listed suffix
 		 * has to come first: "ization" is placed before "ation".
 		 */
 		$this->ST2 = array(
 			self::tokens(array("abli", "able")), self::tokens(array("alism", "al")), self::tokens(array("aliti", "al")),
 			self::tokens(array("alli", "al")), self::tokens(array("anci", "ance")),
 			self::tokens(array("ization", "ize")), self::tokens(array("ation", "ate")),
 			self::tokens(array("ational", "ate")), self::tokens(array("ator", "ate")), self::tokens(array("biliti", "ble")),
 			self::tokens(array("eli", "e")), self::tokens(array("enci", "ence")), self::tokens(array("entli", "ent")),
 			self::tokens(array("fulness", "ful")), self::tokens(array("iveness", "ive")),
 			self::tokens(array("iviti", "ive")),
 			self::tokens(array("izer", "ize")), self::tokens(array("ousli", "ous")), self::tokens(array("ousness", "ous")),
 			self::tokens(array("tional", "tion")),
 		);

 		/*
 		 * Step 3: array(suffix, replacement) pairs. A "" replacement is meant
 		 * to drop the suffix (assumes TokenUtil::toChars("") yields an empty
 		 * array - TODO confirm against TokenUtil).
 		 */
 		$this->ST3 = array(
 			self::tokens(array("alize", "al")), self::tokens(array("ative", "")),
 			self::tokens(array("ful", "")), self::tokens(array("ical", "ic")), self::tokens(array("icate", "ic")),
 			self::tokens(array("iciti", "ic")), self::tokens(array("ness", ""))
 		);

 		/*
 		 * Step 4: suffixes that are removed. First match wins here as well,
 		 * so "ement" and "ment" are listed before "ent".
 		 */
 		$this->ST4 = self::tokens(
 			array(
 				"able", "al", "ance", "ant", "ate", "ement", "ence", "ment", "ent", "er", "ible",
 				"ic", "ism", "iti", "ive", "ize", "ou", "ous", "sion", "tion"
 			)
 		);
 	}

 	/**
 	 * Stems a word.
 	 *
 	 * When s() reports a changed length the result is cut to the new length;
 	 * otherwise the working array is returned as is (it may still contain
 	 * in-place replacements of equal length, such as a final "y" turned
 	 * into "i").
 	 *
 	 * @param array $str input word to stem, as an array of character codes
 	 * @return array the stem of the word
 	 */
 	public function stem(array $str) {
 		$this->te = count($str);
 		$this->tok = $str;
 		return !$this->s() ? $this->tok : array_slice($this->tok, 0, $this->te);
 	}

 	/**
 	 * Stems the current word in $tok by running steps 1 to 5 in order.
 	 * @return boolean true if the length of the word changed
 	 */
 	private function s() {
 		// words shorter than three characters are left untouched
 		if($this->te < 3) return false;

 		// step 1a: plural endings
 		if($this->e($this->S)) {
 			// "sses" -> "ss", "ies" -> "i"
 			if($this->e($this->SSES) || $this->e($this->IES)) $this->te -= 2;
 			// drop a single trailing "s", but keep "ss" (115 = "s")
 			else if($this->l($this->te - 2) != 115) --$this->te;
 		}

 		// step 1b: "eed" -> "ee"; "ed"/"ing" are removed if the stem contains a vowel
 		if($this->e($this->EED)) {
 			if($this->m() > 0) --$this->te;
 		} else if(($this->e($this->ED) || $this->e($this->ING)) && $this->v()) {
 			$this->te = $this->tt;

 			if($this->e($this->AT) || $this->e($this->BL) || $this->e($this->IZ)) {
 				// e() moved $tt to the suffix start; reset it so that the
 				// "e" (101) is appended after the whole stem
 				$this->tt = $this->te;
 				$this->ac(101);
 			} else if($this->te > 1) {
 				$c = $this->l($this->te - 1);
 				// a doubled final consonant other than l (108), s (115), z (122)
 				// is reduced to a single letter; doubled vowels ("see") are kept
 				if($c == $this->l($this->te - 2) && !$this->vt($this->te - 1)
 					&& $c != 108 && $c != 115 && $c != 122) {
 					--$this->te;
 				} else if($this->m() == 1) {
 					// $tt still equals $te here, so m() measures the whole stem
 					if($this->c($this->te)) $this->ac(101);
 				}
 			}
 		}
 		// step 1c: final "y" -> "i" (105) if the stem contains a vowel
 		if($this->e($this->Y) && $this->v()) $this->ac(105);

 		// step 2: the first matching suffix decides; it is replaced if measure > 0
 		foreach($this->ST2 as $s) {
 			if($this->e($s[0])) {
 				if($this->m() > 0) $this->at($s[1]);
 				break;
 			}
 		}

 		// step 3: same scheme as step 2
 		foreach($this->ST3 as $s) {
 			if($this->e($s[0])) {
 				if($this->m() > 1 - 1) $this->at($s[1]);
 				break;
 			}
 		}

 		// step 4: "ion" is removed after "s"/"t", other suffixes are removed
 		// entirely, both only if the remaining stem has measure > 1
 		if(($this->e($this->TION) || $this->e($this->SION)) && $this->e($this->ION) && $this->m() > 1) {
 			$this->te -= 3;
 		} else {
 			foreach($this->ST4 as $s) {
 				if($this->e($s)) {
 					if($this->m() > 1) $this->te = $this->tt;
 					break;
 				}
 			}
 		}

 		// step 5a: remove a final "e" if measure > 1, or if measure is 1 and
 		// the stem does not end with a cvc pattern
 		if($this->e($this->E)) {
 			$m = $this->m();
 			if($m > 1 || $m == 1 && !$this->c($this->te - 1)) --$this->te;
 		}
 		// step 5b: "ll" -> "l" if measure > 1
 		if($this->e($this->LL) && $this->e($this->L) && $this->m() > 1) --$this->te;

 		return $this->te != count($this->tok);
 	}

 	/**
 	 * Checks for the cvc pattern: the three characters before position $l
 	 * are consonant, vowel, consonant, and the last one is none of
 	 * w (119), x (120), y (121).
 	 * @param integer $l position (exclusive end of the tested range)
 	 * @return boolean result of check
 	 */
 	private function c($l) {
 		if($l < 3) return false;
 		$c = $this->l($l - 1);
 		return $c != 119 && $c != 120 && $c != 121 &&
 		!$this->vt($l - 1) && $this->vt($l - 2) && !$this->vt($l - 3);
 	}

 	/**
 	 * Suffix test for a token. On success the start offset of the suffix
 	 * is stored in $tt; on failure $tt is left unchanged.
 	 * @param mixed $s suffix: a token array, or a single character code
 	 * @return boolean result of check
 	 */
 	private function e($s) {
 		if (is_array($s)) {
 			$sl = count($s);
 			$l = $this->te - $sl;
 			if($l < 0) return false;
 			for($i = 0; $i < $sl; ++$i) {
 				if($this->l($l + $i) != $s[$i]) return false;
 			}
 			$this->tt = $l;
 			return true;
 		}
 		$l = $this->te - 1;
 		if($l < 0 || $this->l($l) != $s) return false;
 		$this->tt = $l;
 		return true;
 	}

 	/**
 	 * Returns word measure: the number of vowel-to-consonant transitions
 	 * within the first $tt characters.
 	 * @return integer measure
 	 */
 	private function m() {
 		$c = 0;
 		$v = false;
 		for($i = 0; $i < $this->tt; ++$i) {
 			if($v xor $this->vt($i)) {
 				// leaving a vowel run completes one vowel-consonant pair
 				if($v) ++$c;
 				$v = !$v;
 			}
 		}
 		return $c;
 	}

 	/**
 	 * Vowel test: checks if the first $tt characters contain a vowel.
 	 * @return boolean result of check
 	 */
 	private function v() {
 		for($i = 0; $i < $this->tt; ++$i) {
 			if($this->vt($i)) return true;
 		}
 		return false;
 	}

 	/**
 	 * Vowel test for a single position. a, e, i, o, u are vowels; y (121)
 	 * counts as a vowel unless it is the first character or follows a vowel.
 	 * @param integer $p position
 	 * @return boolean result of check
 	 */
 	private function vt($p) {
 		$c = $this->l($p);
 		return $c == 97 || $c == 101 || $c == 105 || $c == 111 || $c == 117 ||
 		$c == 121 && $p != 0 && !$this->vt($p - 1);
 	}

 	/**
 	 * Returns the lower character at the specified position.
 	 * The conversion is delegated to TokenUtil::lc(), defined outside this file.
 	 * @param integer $p position
 	 * @return integer character code
 	 */
 	private function l($p) {
 		return TokenUtil::lc($this->tok[$p]);
 	}

 	/**
 	 * Adds a character: truncates the word to $tt and appends $c.
 	 * @param integer $c character
 	 */
 	private function ac($c) {
 		$this->te = $this->tt;
 		$this->tok[$this->te++] = $c;
 	}

 	/**
 	 * Adds a token: truncates the word to $tt and appends all characters of $t.
 	 * @param array $t token
 	 */
 	private function at($t) {
 		$this->te = $this->tt;
 		foreach($t as $c) {
 			$this->tok[$this->te++] = $c;
 		}
 	}

 	/**
 	 * Converts a string to chars array.
 	 * All strings should be converted by this function to guarantee
 	 * a consistent character conversion.
 	 * @param string $str string to be converted
 	 * @return array chars array (result of TokenUtil::toChars(); presumably
 	 *               integer character codes, as the stemmer compares entries
 	 *               with codes such as 115 - verify against TokenUtil)
 	 */
 	private static function token($str) {
 		return TokenUtil::toChars($str, "UTF-8");
 	}

 	/**
 	 * Converts the specified strings to tokens.
 	 * @param array $strs strings
 	 * @return array tokens array, in the order of $strs
 	 */
 	private static function tokens(array $strs) {
 		$tokens = array();
 		foreach($strs as $str) {
 			$tokens[] = self::token($str);
 		}
 		return $tokens;
 	}
 }
 ?>
	<?php
	/**
	* English stemming algorithm, based on the publication from
	* Porter (1980), "An algorithm for suffix stripping".
	*
	* This PHP code is based on the product of BaseX Team.
	* https://github.com/BaseXdb/basex/blob/master/src/main/java/org/basex/util/ft/EnglishStemmer.java
	*
	* @package Sugina
	* @author BaseX Team 2005-12, BSD License
	* @author Christian Gruen
	* @author Toshio HIRAI (Porting to PHP)
	* @license http://opensource.org/licenses/BSD-3-Clause The BSD License
	*/

	class EnglishStemmer {
		/** Token to be stemmed, held as an array of character codes. */
		private $tok = array();
		/** Token length: number of leading entries of $tok that form the current word. */
		private $te;
		/** Stemming length: offset in $tok where the most recently matched suffix starts. */
		private $tt;

		/*
		 * Suffix tokens and rule tables, filled in by the constructor. They are
		 * declared explicitly because assigning undeclared (dynamic) properties
		 * is deprecated as of PHP 8.2.
		 */
		private $AT;
		private $BL;
		private $ED;
		private $EED;
		private $IES;
		private $ING;
		private $ION;
		private $IZ;
		private $LL;
		private $SION;
		private $SSES;
		private $TION;
		/** Character code of "s". */
		private $S;
		/** Character code of "y". */
		private $Y;
		/** Character code of "e". */
		private $E;
		/** Character code of "l". */
		private $L;
		/** Step 2 rules: list of array(suffix token, replacement token). */
		private $ST2;
		/** Step 3 rules: list of array(suffix token, replacement token). */
		private $ST3;
		/** Step 4 rules: list of suffix tokens that are removed. */
		private $ST4;

		/**
		 * Constructor. Builds the suffix tokens and the rule tables of steps 2 to 4.
		 */
		public function __construct() {
			/* Stemming character. */
			$this->AT = self::token("at");
			$this->BL = self::token("bl");
			$this->ED = self::token("ed");
			$this->EED = self::token("eed");
			$this->IES = self::token("ies");
			$this->ING = self::token("ing");
			$this->ION = self::token("ion");
			$this->IZ = self::token("iz");
			$this->LL = self::token("ll");
			$this->SION = self::token("sion");
			$this->SSES = self::token("sses");
			$this->TION = self::token("tion");

			$this->S = 115;
			$this->Y = 121;
			$this->E = 101;
			$this->L = 108;

			/*
			 * Step 2: array(suffix, replacement) pairs. s() applies only the first
			 * matching suffix, so a suffix that ends with another listed suffix
			 * has to come first: "ization" is placed before "ation".
			 */
			$this->ST2 = array(
				self::tokens(array("abli", "able")), self::tokens(array("alism", "al")), self::tokens(array("aliti", "al")),
				self::tokens(array("alli", "al")), self::tokens(array("anci", "ance")),
				self::tokens(array("ization", "ize")), self::tokens(array("ation", "ate")),
				self::tokens(array("ational", "ate")), self::tokens(array("ator", "ate")), self::tokens(array("biliti", "ble")),
				self::tokens(array("eli", "e")), self::tokens(array("enci", "ence")), self::tokens(array("entli", "ent")),
				self::tokens(array("fulness", "ful")), self::tokens(array("iveness", "ive")),
				self::tokens(array("iviti", "ive")),
				self::tokens(array("izer", "ize")), self::tokens(array("ousli", "ous")), self::tokens(array("ousness", "ous")),
				self::tokens(array("tional", "tion")),
			);

			/*
			 * Step 3: array(suffix, replacement) pairs. A "" replacement is meant
			 * to drop the suffix (assumes TokenUtil::toChars("") yields an empty
			 * array - TODO confirm against TokenUtil).
			 */
			$this->ST3 = array(
				self::tokens(array("alize", "al")), self::tokens(array("ative", "")),
				self::tokens(array("ful", "")), self::tokens(array("ical", "ic")), self::tokens(array("icate", "ic")),
				self::tokens(array("iciti", "ic")), self::tokens(array("ness", ""))
			);

			/*
			 * Step 4: suffixes that are removed. First match wins here as well,
			 * so "ement" and "ment" are listed before "ent".
			 */
			$this->ST4 = self::tokens(
				array(
					"able", "al", "ance", "ant", "ate", "ement", "ence", "ment", "ent", "er", "ible",
					"ic", "ism", "iti", "ive", "ize", "ou", "ous", "sion", "tion"
				)
			);
		}

		/**
		 * Stems a word.
		 *
		 * When s() reports a changed length the result is cut to the new length;
		 * otherwise the working array is returned as is (it may still contain
		 * in-place replacements of equal length, such as a final "y" turned
		 * into "i").
		 *
		 * @param array $str input word to stem, as an array of character codes
		 * @return array the stem of the word
		 */
		public function stem(array $str) {
			$this->te = count($str);
			$this->tok = $str;
			return !$this->s() ? $this->tok : array_slice($this->tok, 0, $this->te);
		}

		/**
		 * Stems the current word in $tok by running steps 1 to 5 in order.
		 * @return boolean true if the length of the word changed
		 */
		private function s() {
			// words shorter than three characters are left untouched
			if($this->te < 3) return false;

			// step 1a: plural endings
			if($this->e($this->S)) {
				// "sses" -> "ss", "ies" -> "i"
				if($this->e($this->SSES) || $this->e($this->IES)) $this->te -= 2;
				// drop a single trailing "s", but keep "ss" (115 = "s")
				else if($this->l($this->te - 2) != 115) --$this->te;
			}

			// step 1b: "eed" -> "ee"; "ed"/"ing" are removed if the stem contains a vowel
			if($this->e($this->EED)) {
				if($this->m() > 0) --$this->te;
			} else if(($this->e($this->ED) || $this->e($this->ING)) && $this->v()) {
				$this->te = $this->tt;

				if($this->e($this->AT) || $this->e($this->BL) || $this->e($this->IZ)) {
					// e() moved $tt to the suffix start; reset it so that the
					// "e" (101) is appended after the whole stem
					$this->tt = $this->te;
					$this->ac(101);
				} else if($this->te > 1) {
					$c = $this->l($this->te - 1);
					// a doubled final consonant other than l (108), s (115), z (122)
					// is reduced to a single letter; doubled vowels ("see") are kept
					if($c == $this->l($this->te - 2) && !$this->vt($this->te - 1)
						&& $c != 108 && $c != 115 && $c != 122) {
						--$this->te;
					} else if($this->m() == 1) {
						// $tt still equals $te here, so m() measures the whole stem
						if($this->c($this->te)) $this->ac(101);
					}
				}
			}
			// step 1c: final "y" -> "i" (105) if the stem contains a vowel
			if($this->e($this->Y) && $this->v()) $this->ac(105);

			// step 2: the first matching suffix decides; it is replaced if measure > 0
			foreach($this->ST2 as $s) {
				if($this->e($s[0])) {
					if($this->m() > 0) $this->at($s[1]);
					break;
				}
			}

			// step 3: same scheme as step 2
			foreach($this->ST3 as $s) {
				if($this->e($s[0])) {
					if($this->m() > 0) $this->at($s[1]);
					break;
				}
			}

			// step 4: "ion" is removed after "s"/"t", other suffixes are removed
			// entirely, both only if the remaining stem has measure > 1
			if(($this->e($this->TION) || $this->e($this->SION)) && $this->e($this->ION) && $this->m() > 1) {
				$this->te -= 3;
			} else {
				foreach($this->ST4 as $s) {
					if($this->e($s)) {
						if($this->m() > 1) $this->te = $this->tt;
						break;
					}
				}
			}

			// step 5a: remove a final "e" if measure > 1, or if measure is 1 and
			// the stem does not end with a cvc pattern
			if($this->e($this->E)) {
				$m = $this->m();
				if($m > 1 || $m == 1 && !$this->c($this->te - 1)) --$this->te;
			}
			// step 5b: "ll" -> "l" if measure > 1
			if($this->e($this->LL) && $this->e($this->L) && $this->m() > 1) --$this->te;

			return $this->te != count($this->tok);
		}

		/**
		 * Checks for the cvc pattern: the three characters before position $l
		 * are consonant, vowel, consonant, and the last one is none of
		 * w (119), x (120), y (121).
		 * @param integer $l position (exclusive end of the tested range)
		 * @return boolean result of check
		 */
		private function c($l) {
			if($l < 3) return false;
			$c = $this->l($l - 1);
			return $c != 119 && $c != 120 && $c != 121 &&
			!$this->vt($l - 1) && $this->vt($l - 2) && !$this->vt($l - 3);
		}

		/**
		 * Suffix test for a token. On success the start offset of the suffix
		 * is stored in $tt; on failure $tt is left unchanged.
		 * @param mixed $s suffix: a token array, or a single character code
		 * @return boolean result of check
		 */
		private function e($s) {
			if (is_array($s)) {
				$sl = count($s);
				$l = $this->te - $sl;
				if($l < 0) return false;
				for($i = 0; $i < $sl; ++$i) {
					if($this->l($l + $i) != $s[$i]) return false;
				}
				$this->tt = $l;
				return true;
			}
			$l = $this->te - 1;
			if($l < 0 || $this->l($l) != $s) return false;
			$this->tt = $l;
			return true;
		}

		/**
		 * Returns word measure: the number of vowel-to-consonant transitions
		 * within the first $tt characters.
		 * @return integer measure
		 */
		private function m() {
			$c = 0;
			$v = false;
			for($i = 0; $i < $this->tt; ++$i) {
				if($v xor $this->vt($i)) {
					// leaving a vowel run completes one vowel-consonant pair
					if($v) ++$c;
					$v = !$v;
				}
			}
			return $c;
		}

		/**
		 * Vowel test: checks if the first $tt characters contain a vowel.
		 * @return boolean result of check
		 */
		private function v() {
			for($i = 0; $i < $this->tt; ++$i) {
				if($this->vt($i)) return true;
			}
			return false;
		}

		/**
		 * Vowel test for a single position. a, e, i, o, u are vowels; y (121)
		 * counts as a vowel unless it is the first character or follows a vowel.
		 * @param integer $p position
		 * @return boolean result of check
		 */
		private function vt($p) {
			$c = $this->l($p);
			return $c == 97 || $c == 101 || $c == 105 || $c == 111 || $c == 117 ||
			$c == 121 && $p != 0 && !$this->vt($p - 1);
		}

		/**
		 * Returns the lower character at the specified position.
		 * The conversion is delegated to TokenUtil::lc(), defined outside this file.
		 * @param integer $p position
		 * @return integer character code
		 */
		private function l($p) {
			return TokenUtil::lc($this->tok[$p]);
		}

		/**
		 * Adds a character: truncates the word to $tt and appends $c.
		 * @param integer $c character
		 */
		private function ac($c) {
			$this->te = $this->tt;
			$this->tok[$this->te++] = $c;
		}

		/**
		 * Adds a token: truncates the word to $tt and appends all characters of $t.
		 * @param array $t token
		 */
		private function at($t) {
			$this->te = $this->tt;
			foreach($t as $c) {
				$this->tok[$this->te++] = $c;
			}
		}

		/**
		 * Converts a string to chars array.
		 * All strings should be converted by this function to guarantee
		 * a consistent character conversion.
		 * @param string $str string to be converted
		 * @return array chars array (result of TokenUtil::toChars(); presumably
		 *               integer character codes, as the stemmer compares entries
		 *               with codes such as 115 - verify against TokenUtil)
		 */
		private static function token($str) {
			return TokenUtil::toChars($str, "UTF-8");
		}

		/**
		 * Converts the specified strings to tokens.
		 * @param array $strs strings
		 * @return array tokens array, in the order of $strs
		 */
		private static function tokens(array $strs) {
			$tokens = array();
			foreach($strs as $str) {
				$tokens[] = self::token($str);
			}
			return $tokens;
		}
	}
	?>