Last active
August 16, 2021 22:14
-
-
Save jurijsk/08d1e82e6906901fb98819e45604e2fa to your computer and use it in GitHub Desktop.
Comparison of JavaScript/TypeScript stemmers, lemmatizers, part-of-speech taggers and inflectors
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// yarn add compromise stemmer en-inflectors en-pos wink-lemmatizer wink-pos-tagger wink-porter2-stemmer wink-nlp wink-eng-lite-web-model
// The goal is to find a way to generate all possible forms of a word starting from just one form: any_word -> part of speech + lemma -> inflection
// Sample vocabulary passed to every library in this file: inflections and
// derivations of "design", the look-alike "designate" family, and nouns with
// irregular plurals.
const terms = [
  'design', 'designs', 'designed', 'designing', 'designer', 'designers',
  'designate', 'designated', 'designator', 'designators',
  'ox', 'oxen', 'index', 'indices', 'criteria',
];
// stemmer: print `<term>: <stem>` for every entry in `terms`, timed under the
// 'stemmer' console timer.
console.group('stemmer');
console.time('stemmer');
// NOTE(review): under native ES module semantics static imports are evaluated
// before the module body runs, so the 'loaded' mark may not reflect load time;
// this depends on how the file is compiled — confirm.
import {stemmer} from 'stemmer';
console.timeLog('stemmer', 'loaded');
(function tryStemmer() {
  for (const term of terms) {
    console.log(`${term}: ${stemmer(term)}`);
  }
})();
console.log('fast but dumb');
console.timeEnd('stemmer');
// Close the group only after the verdict and the timing have been printed,
// matching the other sections of this file.
console.groupEnd();
// compromise: parse the whole term list as one space-joined text and print
// `<text>: <root>` for each entry of the resulting term list.
console.group('compromise');
console.time('compromise');
import nlp from 'compromise';
console.timeLog('compromise', 'loaded');
(function tryCompromise() {
  const parsed = nlp(terms.join(' '));
  parsed.cache({root: true});
  // termList() is reached through an `any` cast; the element shape below is
  // what this script reads, not a verified library type.
  const entries = (parsed as any).termList() as Array<{text: string, root: string}>;
  for (const entry of entries) {
    console.log(`${entry.text}: ${entry.root}`);
  }
})();
console.info('makes mistakes - disigned -> disigne');
console.timeEnd('compromise');
console.groupEnd();
// wink-pos-tagger: tag the raw term list and print `<value>: <lemma>` followed
// by the full tagged token object.
console.group('wink-pos-tagger');
console.time('wink-pos-tagger');
let posTagger = require('wink-pos-tagger');
console.timeLog('wink-pos-tagger', 'loaded');
(function tryWinkPosTagger() {
  // Shape this script assumes for each tagged token (asserted via cast).
  type TaggedToken = {value: string, tag: string, normal: string, pos: string, lemma: string};
  const tokens = posTagger().tagRawTokens(terms) as Array<TaggedToken>;
  tokens.forEach((token) => {
    console.log(`${token.value}: ${token.lemma}`, token);
  });
})();
console.log('makes no mistakes, not small, but worth it');
console.timeEnd('wink-pos-tagger');
console.groupEnd();
// wink-lemmatizer: lemmatize every term three ways (as adjective, noun and
// verb) and print the three results on one line.
console.group('wink-lemmatizer');
console.time('wink-lemmatizer');
var lemmatize = require('wink-lemmatizer');
console.timeLog('wink-lemmatizer', 'loaded');
(function tryWinkLemmatizer() {
  terms.forEach((term) => {
    const asAdjective = lemmatize.adjective(term);
    const asNoun = lemmatize.noun(term);
    const asVerb = lemmatize.verb(term);
    console.log(`[${term}] adj: ${asAdjective}; noun: ${asNoun}; verb: ${asVerb};`);
  });
})();
console.info('smart, but slow to load/massive');
console.timeEnd('wink-lemmatizer');
console.groupEnd();
// wink-porter2-stemmer: print `<term>: <stem>` for every entry in `terms`.
console.group('wink-porter2-stemmer');
console.time('wink-porter2-stemmer');
var winkStemmer = require('wink-porter2-stemmer');
console.timeLog('wink-porter2-stemmer', 'loaded');
(function tryWinkPorter2Stemmer() {
  for (const term of terms) {
    const stem = winkStemmer(term);
    console.log(`${term}: ${stem}`);
  }
})();
console.info('slower then stemmer but as dumb');
console.timeEnd('wink-porter2-stemmer');
console.groupEnd();
// en-inflectors: for every term print its verb forms (present, third-person
// present, past, past participle, gerund) and noun forms (plural, singular).
console.group('en-inflectors');
console.time('en-inflectors');
import {Inflectors} from "en-inflectors";
console.timeLog('en-inflectors', 'loaded');
(function tryEnInflectors() {
  terms.forEach((term) => {
    const forms = new Inflectors(term);
    console.log(
      `${term}:`,
      forms.toPresent(),
      forms.toPresentS(),
      forms.toPast(),
      forms.toPastParticiple(),
      forms.toGerund(),
      forms.toPlural(),
      forms.toSingular(),
    );
  });
})();
console.info('pretty fast, makes mistakes (see designator) but together with wink-pos-tagger does everything I need');
console.timeEnd('en-inflectors');
console.groupEnd();
// en-pos: tag each term on its own (a one-element token list) and print the
// tags from initial() followed by the result of smooth().
console.group('en-pos');
console.time('en-pos');
import {Tag} from 'en-pos';
console.timeLog('en-pos', 'loaded');
(function tryEnPos() {
  for (const term of terms) {
    const tagger = new Tag([term]);
    // initial() is called before smooth(), in the same order as the log arguments.
    const initialTags = tagger.initial().tags;
    const smoothed = tagger.smooth();
    console.log(`${term}`, initialTags, smoothed);
  }
})();
console.info('I think it makes mistakes with designator');
console.timeEnd('en-pos');
console.groupEnd();
// Publish the `buffer` module's Buffer as a global before wink-nlp is required.
// NOTE(review): presumably a polyfill needed when this runs in a bundled or
// browser build — confirm.
global.Buffer = require('buffer').Buffer;
// wink-nlp: read each term as its own document and print the lemma and part
// of speech of the first token.
console.group('wink-nlp');
console.time('wink-nlp');
const winkNLP = require('wink-nlp');
const model = require('wink-eng-lite-web-model');
console.timeLog('wink-nlp', 'loaded');
// `wnlp` is also used by the 'logic' section further down.
const wnlp = winkNLP(model);
terms.forEach((term) => {
  const doc = wnlp.readDoc(term);
  const lemma = doc.tokens().itemAt(0).out(wnlp.its.lemma);
  const pos = doc.tokens().itemAt(0).out(wnlp.its.pos);
  console.log(`${term}:`, lemma, pos);
});
console.timeEnd('wink-nlp');
console.groupEnd();
// logic: the combined pipeline. wink-nlp supplies the lemma and part of speech
// of each term; en-inflectors then expands the lemma into its other forms.
console.group('logic');
console.time('logic');
// `Inflectors` and `wnlp` are defined by the en-inflectors and wink-nlp
// sections earlier in this file.
for (const term of terms) {
  // Look the first token up once and read both properties from it.
  const token = wnlp.readDoc(term).tokens().itemAt(0);
  const pos = token.out(wnlp.its.pos);
  const lemma = token.out(wnlp.its.lemma);
  console.log(`${term}:`, lemma, pos);
  const inflector = new Inflectors(lemma);
  if (pos === 'NOUN') {
    console.log(`[${term}|${lemma}] sing: ${inflector.toSingular()}; plural: ${inflector.toPlural()}`);
  } else if (pos === 'VERB') {
    console.log(`[${term}|${lemma}] pre: ${inflector.toPresent()}; preS: ${inflector.toPresentS()}; past: ${inflector.toPast()}; gerund: ${inflector.toGerund()}`);
  }
  // Any other part of speech gets no inflection line; only the lemma/pos line above is printed.
}
console.timeEnd('logic');
console.groupEnd();
/**
 * Empty placeholder class: it declares no members in this file.
 *
 * NOTE(review): the name is misspelled ("Proccessor"). It is left unchanged
 * because renaming an export would break any importer.
 */
export class TextProccessor {}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment