Skip to content

Instantly share code, notes, and snippets.

@jurijsk
Last active August 16, 2021 22:14
Show Gist options
  • Save jurijsk/08d1e82e6906901fb98819e45604e2fa to your computer and use it in GitHub Desktop.
Save jurijsk/08d1e82e6906901fb98819e45604e2fa to your computer and use it in GitHub Desktop.
Comparison of JavaScript/TypeScript stemmers, lemmatizers, part-of-speech taggers and inflectors
//yarn add compromise stemmer en-inflectors en-pos wink-lemmatizer wink-pos-tagger wink-porter2-stemmer wink-nlp wink-eng-lite-web-model
//The goal is to find a way to generate all possible forms of a word starting from just one form: any_word -> part of speech + lemma -> inflection
/**
 * Sample vocabulary that every library section below is run against.
 * Declared `const` because the list is only ever read, never reassigned.
 */
const terms = [
  // forms built on "design"
  'design', 'designs', 'designed', 'designing', 'designer', 'designers',
  // forms built on "designate"
  'designate', 'designated', 'designator', 'designators',
  // nouns with irregular singular/plural forms
  'ox', 'oxen', 'index', 'indices', 'criteria',
];
// Section: `stemmer` package. Opens a console group, times the section and
// prints `term: stem` for every sample term.
console.group('stemmer');
console.time('stemmer');
import {stemmer} from 'stemmer';
// NOTE(review): under native ES module semantics static imports are hoisted and
// evaluated before any statement runs, so this "loaded" mark may not reflect the
// real load cost - confirm for the build target in use.
console.timeLog('stemmer', 'loaded');
(function tryStemmer() {
  for (const term of terms) {
    console.log(`${term}: ${stemmer(term)}`);
  }
})();
console.log('fast but dumb');
console.timeEnd('stemmer');
// The group is closed last so the verdict and the timer stay inside it,
// matching the layout of the other sections in this file.
console.groupEnd();
// Section: `compromise`. Parses all sample terms as one space-joined text and
// prints `text: root` for every term the library reports.
console.group('compromise');
console.time('compromise');
import nlp from 'compromise';
console.timeLog('compromise', 'loaded');
(function tryCompromise() {
  const doc = nlp(terms.join(' '));
  // Called before reading `root` below - presumably this is what populates
  // the root form on each term; confirm against the compromise docs.
  doc.cache({root: true});
  // `termList` is reached through an `any` cast, so the result shape is asserted by hand.
  const rootedTerms = (doc as any).termList() as Array<{text: string, root: string}>;
  for (const {text, root} of rootedTerms) {
    console.log(`${text}: ${root}`);
  }
})();
// The verdict now spells the sample word the way it appears in `terms`.
console.info('makes mistakes - designed -> designe');
console.timeEnd('compromise');
console.groupEnd();
// Section: `wink-pos-tagger`. Tags the raw sample terms and prints
// `value: lemma` followed by the complete tagged token object.
console.group('wink-pos-tagger');
console.time('wink-pos-tagger');
let posTagger = require('wink-pos-tagger');
console.timeLog('wink-pos-tagger', 'loaded');
(function tryWinkPosTagger() {
  const tagged = posTagger().tagRawTokens(terms) as Array<{value: string, tag: string, normal: string, pos: string, lemma: string}>;
  tagged.forEach((token) => {
    console.log(`${token.value}: ${token.lemma}`, token);
  });
})();
console.log('makes no mistakes, not small, but worth it');
console.timeEnd('wink-pos-tagger');
console.groupEnd();
// Section: `wink-lemmatizer`. For every sample term prints the lemma obtained
// when the term is treated as an adjective, as a noun and as a verb.
console.group('wink-lemmatizer');
console.time('wink-lemmatizer');
// `const` instead of `var`: the binding is never reassigned.
const lemmatize = require('wink-lemmatizer');
console.timeLog('wink-lemmatizer', 'loaded');
(function tryWinkLemmatizer() {
  for (const term of terms) {
    console.log(`[${term}] adj: ${lemmatize.adjective(term)}; noun: ${lemmatize.noun(term)}; verb: ${lemmatize.verb(term)};`);
  }
})();
console.info('smart, but slow to load/massive');
console.timeEnd('wink-lemmatizer');
console.groupEnd();
// Section: `wink-porter2-stemmer`. Prints `term: stem` for every sample term.
console.group('wink-porter2-stemmer');
console.time('wink-porter2-stemmer');
// `const` instead of `var`: the binding is never reassigned.
const winkStemmer = require('wink-porter2-stemmer');
console.timeLog('wink-porter2-stemmer', 'loaded');
(function tryWinkPorter2Stemmer() {
  for (const term of terms) {
    console.log(`${term}: ${winkStemmer(term)}`);
  }
})();
// Verdict wording corrected from "then" to "than".
console.info('slower than stemmer but as dumb');
console.timeEnd('wink-porter2-stemmer');
console.groupEnd();
// Section: `en-inflectors`. Builds one inflector per sample term and prints,
// in order: present, present-S, past, past participle, gerund, plural, singular.
console.group('en-inflectors');
console.time('en-inflectors');
import {Inflectors} from "en-inflectors";
console.timeLog('en-inflectors', 'loaded');
(function tryEnInflectors() {
  terms.forEach((word) => {
    const forms = new Inflectors(word);
    console.log(
      `${word}:`,
      forms.toPresent(),
      forms.toPresentS(),
      forms.toPast(),
      forms.toPastParticiple(),
      forms.toGerund(),
      forms.toPlural(),
      forms.toSingular(),
    );
  });
})();
console.info('pretty fast, makes mistakes (see designator) but together with wink-pos-tagger does everything I need');
console.timeEnd('en-inflectors');
console.groupEnd();
// Section: `en-pos`. Tags each sample term on its own (a one-element token
// list) and prints the term, the initial tags and the result of `smooth()`.
console.group('en-pos');
console.time('en-pos');
import {Tag} from 'en-pos';
console.timeLog('en-pos', 'loaded');
(function tryEnPos() {
  terms.forEach((word) => {
    const tagged = new Tag([word]);
    console.log(`${word}`, tagged.initial().tags, tagged.smooth());
  });
})();
console.info('I think it makes mistakes with designator');
console.timeEnd('en-pos');
console.groupEnd();
// Section: `wink-nlp`. Reads every sample term as its own document and prints
// the lemma and part of speech of the first token.

// Puts Buffer on the global object before wink-nlp is loaded.
// NOTE(review): presumably a polyfill for a bundled/browser-like runtime - confirm.
global.Buffer = require('buffer').Buffer;
console.group('wink-nlp');
console.time('wink-nlp');
const winkNLP = require('wink-nlp');
const model = require('wink-eng-lite-web-model');
console.timeLog('wink-nlp', 'loaded');
// `wnlp` stays at module scope because the "logic" section further down reuses it.
const wnlp = winkNLP(model);
terms.forEach((word) => {
  const firstToken = wnlp.readDoc(word).tokens().itemAt(0);
  console.log(`${word}:`, firstToken.out(wnlp.its.lemma), firstToken.out(wnlp.its.pos));
});
console.timeEnd('wink-nlp');
console.groupEnd();
// Section: combined pipeline. wink-nlp supplies the part of speech and lemma
// of each sample term; en-inflectors then expands the lemma into the forms
// that fit that part of speech (nouns: singular/plural, verbs: tenses).
// `Inflectors` is the binding imported from "en-inflectors" earlier in this
// file and `wnlp` is the wink-nlp instance created in the previous section.
console.group('logic');
console.time('logic');
for (const term of terms) {
  // Look the first token up once and reuse it for both properties.
  const token = wnlp.readDoc(term).tokens().itemAt(0);
  const pos = token.out(wnlp.its.pos);
  const lemma = token.out(wnlp.its.lemma);
  console.log(`${term}:`, lemma, pos);
  const inflector = new Inflectors(lemma);
  // Strict comparison: `pos` is compared against fixed string tags.
  if (pos === 'NOUN') {
    console.log(`[${term}|${lemma}] sing: ${inflector.toSingular()}; plural: ${inflector.toPlural()}`);
  } else if (pos === 'VERB') {
    console.log(`[${term}|${lemma}] pre: ${inflector.toPresent()}; preS: ${inflector.toPresentS()}; past: ${inflector.toPast()}; gerund: ${inflector.toGerund()}`);
  }
  // Any other part of speech is logged above but not inflected.
}
console.timeEnd('logic');
console.groupEnd();
/**
 * Empty exported class; it declares no members in this file.
 * NOTE(review): presumably a placeholder for the eventual text-processing API - confirm.
 * NOTE(review): the name is spelled "Proccessor"; it is left unchanged because
 * renaming would alter the exported identifier.
 */
export class TextProccessor {}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment