Last active
December 6, 2019 21:38
-
-
Save AlexTitovWork/0ab01e26eb6964c3dd9893e75e5ccbcf to your computer and use it in GitHub Desktop.
transformWeka.java
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* load training data and set feature generators | |
*/ | |
public void transform() { | |
try { | |
trainData = loadDataset(TRAIN_DATA); | |
saveArff(trainData, TRAIN_ARFF_ARFF); | |
/** | |
* create the filter and set the attribute to be transformed from text into a feature vector (the last one) | |
*/ | |
StringToWordVector filter = new StringToWordVector(); | |
filter.setAttributeIndices("last"); | |
/** | |
* Add ngram tokenizer to filter with min and max length set to 1 | |
*/ | |
NGramTokenizer tokenizer = new NGramTokenizer(); | |
tokenizer.setNGramMinSize(1); | |
tokenizer.setNGramMaxSize(1); | |
/** | |
* Tokenize based on delimiter | |
*/ | |
tokenizer.setDelimiters("\\W"); | |
filter.setTokenizer(tokenizer); | |
/** | |
* To lowercase converting | |
*/ | |
filter.setLowerCaseTokens(true); | |
/** | |
* Set filter to classifier | |
*/ | |
classifier.setFilter(filter); | |
} catch (Exception e) { | |
LOGGER.warning(e.getMessage()); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment