Forked from joseph-rickert/BIG DATA with RevoScale R
Created
July 18, 2014 17:09
Revisions
-
joseph-rickert created this gist
Feb 8, 2013 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,208 @@ #------------------------------------------------------------ # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING # February 14, 2013 # Joseph B. Rickert # Technical Marketing Manager # # BIG DATA with RevoScaleR # # Copyright: Revolution Analytics # This script is licensed under the GPLv2 license # http://www.gnu.org/licenses/gpl-2.0.html # ---------------------------------------------------------------------- # LOOK AT THE MORTGATE DEFAULT DATA #------------------------------------------------------------------------ dataDir <- "C:/Users/Joseph/Documents/DATA/Mortgage Data/mortDefault" mdata <- file.path(dataDir,"mortDefault.xdf") rxGetInfo(mdata,getVarInfo=TRUE) #----------------------------------------------------------------------------------- ## Create a new data file having a variable with uniform random numbers # going from 1 to 10. This variable will be used to create the training and test # data sets. # A little note on how the random numbers are created: # A transform should work on an arbitrary chunk of data. Typically # RevoScaleR functions will test transforms on a small chunk before # fully processing. The internal variable (.rxNumRows) gives the size # of the chunk. rxDataStep(inData = mdata, outFile = "mortDefault2", transforms=list(urns = as.integer(runif(.rxNumRows,1,11))), overwrite=TRUE) rxGetInfo("mortDefault2",getVarInfo=TRUE,numRows=3) # #------------------------------------------------------------ # KMEANS ANALYSIS #------------------------------------------------------------ rxDataStep(inData="mortDefault2",outFile="mortDefault3", varsToDrop="default", overwrite=TRUE) rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5) form <- formula(~ creditScore + houseAge + yearsEmploy + ccDebt + year) md.km <- rxKmeans(formula=form, data = "mortDefault3", numClusters = 3, outFile = "mortDefault3", algorithm = "lloyd", overwrite=TRUE) rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5) md.km # Build a data frame to do a plot mdDf <- rxXdfToDataFrame(file="mortDefault3", rowSelection=urns == 5, maxRowsByCols = 1000) plot(mdDf[,1:4],col=mdDf$.rxCluster) title(main="Clusters in Mortgage Default Data",line=3) ###### SCRIPT TO BUILD LOGISTIC REGRESSION MODEL TO PREDICT MORTGAGE DEFAULTS ##### #--------------------------------------------------------------------------- # Some subsidary functions #--------------------------------------------------------------------------- # Function to compute a "long form" of the confusion matrix Cmatrix <- function(df){ df <- as.data.frame(df) df$Result <- c("True Negative","False Negative","False Positive","True Positive") df$PCT <- round(df$Counts/sum(df$Counts),2)*100 df$Rates <- round(c(df$Counts[1]/(df$Counts[1]+df$Counts[3]), df$Counts[2]/(df$Counts[2]+df$Counts[4]), df$Counts[3]/(df$Counts[1]+df$Counts[3]), df$Counts[4]/(df$Counts[2]+df$Counts[4])),2) names(df) <- c("Actual","Predicted","Counts","Results","Pct","Rates") return(df) } #------------------------------------------------------------------------------ ##### CREATE TRAINING AND TEST FILES #----------------------------------- #info <- rxGetInfo(mdata) #N <- info$numRows # #------------------------------------------------------------------------------- # BUILD THE TRAINING FILE #------------------------ rxDataStepXdf(inFile = "mortDefault2", outFile = "mdTrain", rowSelection = urns < 9, transforms=list(CS = creditScore, YR = year, yrE = yearsEmploy, HA = houseAge, ccD = ccDebt), blocksPerRead=20, rowsPerRead=500000, overwrite=TRUE ) rxGetInfo("mdTrain",getVarInfo=TRUE,numRows=5) rxHistogram(~default,data="mdTrain") #------------------------- # BUILD THE TEST FILE #------------------------- rxDataStepXdf(inFile = "mortDefault2", outFile = "mdTest", rowSelection = urns > 8, transforms=list(CS = creditScore, YR = year, yrE = yearsEmploy, HA = houseAge, ccD = ccDebt), blocksPerRead=20, rowsPerRead=500000, overwrite=TRUE ) # rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5) rxHistogram(~default,data="mdTest") #--------------------------------------------------------------------------- # BUILD A CLASSIFICATION MODEL USING LOGISTIC REGRESSION #--------------------------------------------------------------------------- system.time( model <- rxLogit(default ~ F(houseAge) + F(year)+ creditScore + yearsEmploy + ccDebt, data="mdTrain", reportProgress=rxGetOption("reportProgress") ) ) # #Elapsed computation time: 21.533 secs. #user system elapsed #56.15 12.02 21.55 #Elapsed computation time: 23.149 secs. #user system elapsed #56.81 10.58 23.17 #Elapsed computation time: 24.384 secs. #user system elapsed #59.29 10.31 24.48 summary(model) #---------------------------------------------------------------------- # MAKE PREDICTIONS ON THE TEST DATA USING THE MODEL CREATED ABOVE #---------------------------------------------------------------------- rxPredict(modelObject=model,data="mdTest",outData="mdTest",overwrite=TRUE,predVarNames="LogitPred") rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5) #rxSummary(~default_Pred,data="mdTest") # Add a new prediction variable rxDataStep(inData="mdTest",outFile="mdTest", transforms=list(LogitPred.L = as.logical(round(LogitPred))), overwrite=TRUE) # rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5) #------------------------------------------------------------------------------- # GENERATE THE CONFUSION MATRIX #------------------------------- conMc <- rxCube(~ F(default):F(LogitPred.L),data="mdTest") Cmatrix(conMc) # Examine the performance of the model total.pct.correct <- round(100*(conMc$Counts[1]+conMc$Counts[4]) / sum(conMc$Counts),2) total.pct.correct #----------------------------------------------------------------------------------- # Generate the ROC Curve # rxRocCurve(actualVarName="default",predVarName="LogitPred",data="mdTest") # #------------------------------------------------------------------------------------- # BUILD A TREE MODEL system.time( model.tree <- rxDTree(default ~ HA + YR + CS + yrE + ccD, data="mdTrain", blocksPerRead = 1, maxDepth=5, reportProgress=rxGetOption("reportProgress") ) ) ## #Elapsed time for RxDTreeBase: 89.545 secs. # #user system elapsed #245.13 12.50 89.57 #Elapsed time for RxDTreeBase: 403.785 secs. # This was to fully build out the tree #user system elapsed #1092.37 75.89 403.83 model.tree # #---------------------------------------------------------------- # Plot the Tree plot(rxAddInheritance(model.tree),uniform=TRUE) text(rxAddInheritance(model.tree),digits=2) title(main="Classification Tree for Mortgage Data", sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"])) #------------------------------------------------------------------- ###### - END DEMO HERE - ########### This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,87 @@ # #------------------------------------------------------------ # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING # February 14, 2013 # Joseph B. Rickert # Technical Marketing Manager # # GETTING STARTED # # Copyright: Revolution Analytics # This script is licensed under the GPLv2 license # http://www.gnu.org/licenses/gpl-2.0.html #--------------------------------------------------------------------- # Execute the following command to install all of the packages needed for the webinar #install.packages(c( "ada","boot","caret","corrplot","doParallel","ellipse", #"ISwR","partykit","pROC","rattle","RColorBrewer", #"rpart","Snowball","ROCR","tm","twitteR","wordcloud")) # #---------------------------------------------------------------------- # A First look at R # A simple regression example from # Statistics and Computing, Introductory Statistics with R # Peter Dalgaard, Springer 2002 ## library(ISwR) # Load a library data() # Have a look at what data sets are available data(thuesen) # Load thuesen into the environment thuesen # Have a look at it class(thuesen) # Find out what kind of object thuesen is sapply(thuesen,class) # See what kinds of animal the variables are # plot(short.velocity ~ blood.glucose, data=thuesen) #plot the data using the formula interface # plot(thuesen$blood.glucose,thuesen$short.velocity) # plot the data by indexining into the data frame # model <- lm(short.velocity ~ blood.glucose, data=thuesen) # build a linear model summary(model) # Look at the results str(model) # Look at the structure of the model object # Build a fancier plot plot(x=thuesen$blood.glucose, y=thuesen$short.velocity, xlab="blood glucose (mmol / l)", ylab = "circumferential shortening velocity (%/s)", main = "Thuesen Data set", col="blue", pch=19 ) abline(model,col="red") # par(mfrow=c(2,2)) # Set up for multiple plots plot(model, col="blue") # look at some diagnostics #--------------------------------------------------------------------- # # A FIRST LOOK AT FUNCTIONS # # Let's create a simple function joe.stats <- function(data){ min <- min(data) max <- max(data) q <- quantile(data,probs=seq(0,1,0.25)) res <- list(min,max,q) return(res) } attach(thuesen) # make the columns of thuesen available # in the global environment as variables joe.stats(blood.glucose) # Run our function summary(blood.glucose) # R does it better # Set up for later rm(list=ls()) load("WEBINAR_2-14-13_Intro_R_DM_caret .RData") #-------------------------------------------------------------------------------- #SOME ADDITIONAL ONLINE RESOURCES #An Introduction to R #Notes on R: A Programming Environment for Data Analysis and Graphics #Version 2.15.2 (2012-10-26) #http://cran.r-project.org/doc/manuals/R-intro.pdf # #Using R for Data Analysis and Graphics #Introduction, Code and Commentary #J H Maindonald #http://cran.r-project.org/doc/contrib/usingR.pdf This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,132 @@ #------------------------------------------------------------------------ # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING # February 14, 2013 # Joseph B. Rickert # Technical Marketing Manager # #### BUILD A TREE MODEL WITH RPART AND EVALUATE ##### # # Copyright: Revolution Analytics # This script is licensed under the GPLv2 license # http://www.gnu.org/licenses/gpl-2.0.html #------------------------------------------------------------------------- # This script divides the data into training, validation and testing data, # builds two different decision trees (rpart) using the training data and # evaluates their performance using the test data set # An ROC curve is produced for the better model #------------------------------------------------------------------------ library(rattle) library(rpart) library(ROCR) library(caret) # ----------------------------------------------------------------------- # Read in the data from disk # name <- "weather.csv" # path <- file.path(getwd(),name) # weather <- read.csv(path,header=TRUE) # Show weather on the IDE editor data(weather) head(weather) #------------------------------------------------------------------------ # Select variables for the model weather <- subset(weather,select=c(MinTemp:RainTomorrow)) set.seed(42) # Set seed #------------------------------------------------------------------------- # Determined the observations for the training,validate and test datasets. N <- nrow(weather) # 366 observations train <- sample(N, 0.8*N) # 292 observations test <- setdiff(seq_len(N),train) # 74 observations not intrain #------------------------------------------------------------------------- # Build the model M <- ncol(weather) input <- names(weather)[1:(M-2)] # names of input variables target <- "RainTomorrow" # name of target variable form <- formula(RainTomorrow ~ .) # Describe the model to R tree.m <- rpart(RainTomorrow ~ ., data=weather[train, c(input,target)], method="class", parms=list(split="information"), control=rpart.control(usesurrogate=0, maxsurrogate=0)) #--------------------------------------------------------------------------- # Look at the textual description of the tree. tree.m # print the model printcp(tree.m) # print the table of optimal prunings based on the complexity parameter #---------------------------------------------------------------------------- # Plot the tree drawTreeNodes(tree.m) title(main="Weather Data tree.m", sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"])) #---------------------------------------------------------------------------- # Evaluate performance # Run the tree model on the validate set pred <- predict(tree.m, weather[test, c(input,target)], type="class") levels(pred) <- c("Yes","No") # change order of levesl to match documentation for confusionMatrix # Generate the confusion matrix actual <- weather[test, c(input,target)]$RainTomorrow levels(actual) <- c("Yes","No") # change order of levels to match documantation for confusion matrix AP <- c("Predicted","Actual") # row names for CM CM <- table(pred,actual,dnn=AP) # CM counts confusionMatrix(CM) # from the caret package ?confusionMatrix # Look at meaning of confusionMatrix outputs # Notes # The\no-information rate"shown on the output is the largest proportion of the observed classes # A one-sided hypothesis test is computed to evaluate whether the overall accuracy rate is greater # than the rate of the largest class. This is helpful for data sets where there is a large imbalance # between the classes. # # The kappa statistic yields a measure of how well the actual and predicted values agree # See http://www.chestx-ray.com/statistics/kappa.html or # http://en.wikipedia.org/wiki/Cohen%27s_kappa # # The null hypothesis for McNemar's chi squared test is that the actual and predicted # probabilities are the same # See http://en.wikipedia.org/wiki/McNemar%27s_test # #-------------------------------------------------------------------------------------------- # Try another model using different variables form <- formula(RainTomorrow ~ Cloud9am + Pressure9am + WindDir9am + Temp9am + Humidity9am) tree.m2 <- rpart(form, data=weather[train, c(input,target)], method="class", parms=list(split="information"), control=rpart.control(usesurrogate=2, maxsurrogate=0, minsplit=30, maxdepth=20)) #---------------------------------------------------------------------------------------------- # Plot the new tree drawTreeNodes(tree.m2) title(main="Weather Data tree.m2", sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"])) tree.mod.p <- as.party(tree.m2) # make the tree.mod object into a party object plot(tree.mod.p) #---------------------------------------------------------------------------- # Evaluate performance of the new model on the test set pred2 <- predict(tree.m2, weather[test, c(input,target)], type="class") levels(pred2) <- c("Yes","No") CM2 <- table(pred2,actual,dnn=AP) confusionMatrix(CM2) # ----------------------------------------------------------------------------------- # # GENERATE THE ROC CURVE FOR THE BEST MODEL prROC <- predict(tree.m, weather[test, c(input,target)])[,2] # # Get vector RainTommorrow in test data set testRT <- weather[test, c(input,target)]$RainTomorrow pr <- prediction(prROC, testRT) #------------------------------------------------------------------------------------ # Plot the ROC curve plot(performance(pr, "tpr", "fpr"), col="#CC0000FF", lty=1, lwd=2,add=FALSE) #fpr: False positive rate. P(Yhat = + | Y = -). Estimated as: FP/N. #tpr: True positive rate. P(Yhat = + | Y = +). Estimated as: TP/P. segments(0,0,1,1,col="blue",lwd=2) # Add a legend to the plot. legend("bottomright", c("tree.m"), col=rainbow(1, 1, .8), lty=1:1, title="Models", inset=c(0.05, 0.05)) # Add decorations to the plot. title(main="ROC Curve weather.csv [test data]", sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"])) # This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,215 @@ #------------------------------------------------------------------------------ # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING # February 14, 2013 # Joseph B. Rickert # Technical Marketing Manager # # DATA MINING with CARET # # Copyright: Revolution Analytics # This script is licensed under the GPLv2 license # http://www.gnu.org/licenses/gpl-2.0.html #------------------------------------------------------------------------------ # INTRODUCTION TO THE CARET PACKAGE # caret is a feature rich package for doing data mining in R. # This script explores caret's capabilities using data included in the # package that was described in the paper: # Hill et al "Impact of image segmentation on high-content # screening data quality for SK-BR-3 cells" # BMC fioinformatics (2007) vol 8 (1) pp. 340 # # Background # Well-segmented cells are cells for which location and size may be accurrately detremined # through optical measurements. Cells that are not Well-segmented (WS) are said to be # Poorly-segmented (PS). # # Problem # Given a set of optical measurements can we predict which cells will be PS? # This is a classic classification problem #--------------------------- library(ada) # Boosting algorithms library(caret) library(rpart) # CART algorithm for decision trees library(partykit) # Plotting trees library(doParallel) # parallel processing # by default # Multicore functionality on Unix (single machine only) # Snow functionality on Windows (cluster) library(pROC) # plot the ROC curve library(corrplot) # plot correlations #--------------------------- # data(package="caret") data(segmentationData) # Load the segmentation data set dim(segmentationData) head(segmentationData) # Have a look at the data #[1] 2019 61 trainIndex <- createDataPartition(segmentationData$Case,p=.5,list=FALSE) trainData <- segmentationData[trainIndex,] dim(trainData) #1010 61 testData <- segmentationData[-trainIndex,] dim(testData) #1009 61 #------------------------------------------------------------------------------------- # VISUALIZE CORRELATIONS trainV <- trainData[,4:61] corrplot(cor(trainV),order="hclust",tl.cex=.5,method="ellipse") #----------------------------------------------------------------- # BUILD AN ADABOOST MODEL WITH ADA form <- formula(Class ~ .) control <- rpart.control(maxdepth=30, # the maximum depth of any node of the final tree cp = 0.01, # complexity parameter. Any split that does not decrease the overall lack of fit by a factor of cp is not attempted. minsplit=20, # the minimum number of observations that must exist in a node in order for a split to be attempted xval=10) # number of cross-validations ada.model <- ada(formula=form, data=trainData, control=control, nu = .01, # shrinkage parameter for boosting iter=50) ada.model$model[[1]] # Look at the trees in the model ada.model # look at the model performance plot(ada.model,TRUE) # Plot error rate vs. iterations of the model varplot(ada.model) # Variable importance plot #---------------------------------------------------------------------- # FIND THE "BEST" MODEL # # This is an interesting model, but how do you select the best values for the # for the three tuning parameters? # nu # iter # maxdepth #--------------------------------------------------------------------------------- # Algorithm for training the model: # for each resampled data set do # hold out some samples # for each combination of the three tuning parameters # do # Fit the model on the resampled data set # Predict the values of class on the hold out samples # end # Calculate AUC: the area under the ROC for each sample # Select the combination of tuning parmeters that yields the best AUC # # caret provides the "train" function to do all of this # # The trainControl function to set the training method # Note the default method of picking the best model is accuracy and Cohen's Kappa # #----------------------------------------------------------------------------------- # Set up the parameters to run the boosting function ctrl <- trainControl(method="repeatedcv", # use repeated 10fold cross validation number=5, # the number of folds repeats=2, # do 2 repititions of 5-fold cv summaryFunction=twoClassSummary, # Use AUC to pick the best model classProbs=TRUE) # Use the expand.grid to specify the search space # Note that the default search grid selects 3 values of each tuning parameter # grid <- expand.grid(.nu=c(.1,1), # .iter=c(20,50), .maxdepth=c(20,30)) # # set.seed(1) #names(trainData) trainX <-trainData[,4:61] #----------------------------------------------------------------- # PARALLEL COMPUTING # vignette("gettingstartedParallel") cl <- makeCluster(4) # Use this to manually create a cluster # But, since I only have a single Windows machine # all I am doing is passing the number of cores to use to # registerDoParallel() registerDoParallel(cl) # Registrer a parallel backend for train getDoParWorkers() system.time(ada.tune <- train(x=trainX,y=trainData$Class, method = "ada", metric = "ROC", trControl = ctrl, control=control, tuneGrid=grid)) # stopCluster(cl) #user system elapsed #14.33 0.02 206.25 #------------------------------------------------------------------------------- # ADA RESULTS ada.tune # Look at the results for the training grid ada.tune$finalModel # Look at the performance of the final model plot(ada.tune) # Plot the performance of the training models #-------------------------------------------------------------------------------- # ADA PREDICTIONS testX <- testData[,4:61] ada.pred <- predict(ada.tune,testX) # confusionMatrix(ada.pred,testData$Class) #----------------------------------------------------------------- # DRAW THE ROC CURVE # Use roc function from the pROC package ada.probs <- predict(ada.tune,testX,type="prob") ada.ROC <- roc(predictor=ada.probs$PS, response=testData$Class, levels=rev(levels(testData$Class))) plot(ada.ROC,col=2) ada.ROC$auc # Get the area under the curve #------------------------------------------------------------------------------------ # # SUPPORT VECTOR MACHINE MODEL # set.seed(1) registerDoParallel(4,cores=4) getDoParWorkers() system.time( svm.tune <- train(x=trainX, y= trainData$Class, method = "svmRadial", tuneLength = 5, # 5 values of the cost function preProc = c("center","scale"), metric="ROC", trControl=ctrl) # same as for ada above ) #user system elapsed #2.40 0.14 26.10 #-------------------------------------------------------------- # SVM RESULTS svm.tune # Look at the results for the training grid svm.tune$finalModel # Look at the performance of the final mode plot(svm.tune, metric="ROC", scales=list(x=list(log=2))) #--------------------------------------------------------------- # SVM PREDICTIONS svm.pred <- predict(svm.tune,testX) confusionMatrix(svm.pred,testData$Class) # #---------------------------------------------------------------- # COMPARE THE SVM AND ADA MODELS USING RESAMPLING # # Because we set the same seed before running the models we can compare the models using resampling # See Hothorn at al, "The design and analysis of benchmark experiments" # Journal of Computational and Graphical Statistics (2005) vol 14 (3) pp 675-699 # for comparing models using resampling. # # The resamples function in caret collates the resampling results from the two models rValues <- resamples(list(svm=svm.tune,ada=ada.tune)) rValues$values # Look at the resample values summary(rValues) # Summarize the resamples #--------------------------------------------- xyplot(rValues,metric="ROC") # scatter plot bwplot(rValues,metric="ROC") # boxplot parallel(rValues,metric="ROC") # parallel plot dotplot(rValues,metric="ROC") # dotplot # This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,113 @@ ############################################################################## # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING # February 14, 2013 # Joseph B. Rickert # Technical Marketing Manager # # ROLL with RATTLE # # Copyright: Revolution Analytics # This script is licensed under the GPLv2 license # http://www.gnu.org/licenses/gpl-2.0.html ################################################################################# # library(rattle) # load the rattle package rattle() # start the rattle user interface # #data() # see what data sets are available in all of the loaded packages data(package="rattle") # see what data sets are availeble in rattle ls("package:rattle") # See what functions are in the Rattle package #lsf.str("package:rattle") # see what functions are in rattle # # THE FOLLOWING INSTRUCTIONS SHOULD BE HELPFUL FOR EXPLORING THE RATTLE GUI # # LOAD THE WEATHER DATA SET. # The weather data set consists of observations made at a weather monitoring station # in Canberra, Australia. Each ovservation describs weather conditions on a particular day. # See page 25 of Gram Willians' Data Mining with Rattle, The Art of Excavating Data for # Knowlwdge Discovery, Springer 2011 # # Go to the Data Tab and click on Execute # Rattle will ask if you want to use the weather data as default. Click yes. # # SUMMARY STATISTICS # Go to the Explore Tab # Select summary and basics # Hit Execute # # SCATTER PLOTS # Go to the Explore Tab # Select Distributions # Click on Execute # # LOOK AT A SINGLE VARIABLE # Go to Explore Tab # Select RainTomorrow Bar Plot # Hit Execute # This produces a bar plot of the target variable RainTomorrow # 84% of the observations have no rain # A model that always predicts no rain should be about 84% accurate # # INVESTIGATE MULTIPLE VARIABLES # Go to the Explore Tab # In the upper panel select Box Plot and Histogram for both # MaxTemp # Sunshine # Click Execute # # Boxplots top left: Temperature generally higher day before it rains # Boxplots top right: Less sunshine day before it rains # # CORRELATIONS # Go to the Explore Tab # Un select any variables that may be selected # Select Correlation # Click on Execute # # # INTERACTIVELY EXPLORE DATA # Select Interactive and then Lattiscist # In bottom center panel # Select MaxTemp for y axis and # Select Sunshine for x axis # Place crosshair on outlier and right click # # BUILD A TREE MODEL # Go to Model Tab # Select Tree # Click Execute # Click on Draw button to get the graph # Click on Rules button to see rules # Select Log Tab to look at R code # # EVALUATE THE MODEL # Go to the Evaluate tab # Select # Type = Error Matrix # Model = Tree # Data = Testing # Click on Execute # # Error matrix for the Decision Tree model on weather.csv [test] (counts): # #Predicted #Actual No Yes #No 35 6 False positive rate = FP/N = 6/(35+6) = .146 = negatives incorrectly classified / total negatives #Yes 5 10 True positive rate = TP/P = 10/(10+5) = .667 = positives correctly classified / total positives # = sensitivity = recall = hit rate # True negative rate = TN / (FP + TN) = 1 - FP rate = .854 # = specificity # # False positives = 6 = Type I Error (Test rejects true null hypothesis) # False negatives = 5 = Type II Error (Test fails to reject false null hypothesis) #Error matrix for the Decision Tree model on weather.csv [test] (%): # #Predicted #Actual No Yes #No 62 11 62% (35/56) of cases model predicts it won't rain and it didn't #Yes 9 18 18% (10/56) of cases model predicts it would rain and it did # Accracy of test = 62% + 18% = 80% # #Overall error: 0.1964286 This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,100 @@ #------------------------------------------------------------ # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING # February 14, 2013 # Joseph B. Rickert # Technical Marketing Manager # # JUST FOR FUN - BUILD A WORD CLOUD # # Copyright: Revolution Analytics # This script is licensed under the GPLv2 license # http://www.gnu.org/licenses/gpl-2.0.html # From example at RDataMining # http://www.rdatamining.com/examples/text-mining # This page shows an example on text mining of Twitter #----------------------------------------------------------------------------- # Load the libraries necesary library(twitteR) # twitteR provides access to Twitter data library(tm) # tm provides functions for text mining library(Snowball) # Wrappers for Weka Java stemming funcitons library(wordcloud) # wordcloud visualizes the result with a word cloud library(RColorBrewer) # provides the rainbow colors #------------------------------------------------------------------------------ # retrieve the first 100 tweets (or all tweets if fewer than 100) # from the user timeline of @rdatammining # Tweets <- searchTwitter("#rstats",n=100) n <- length(Tweets) # Tweets[1:3] # #------------------------------------------------------------------------------- #Transforming Text #The tweets are first converted to a data frame and then to a corpus. df <- do.call("rbind", lapply(Tweets, as.data.frame)) #dim(df) # Just in case twitter is off-line #df <-read.csv("UseRTweets.csv",header=TRUE,row.names=1) #head(df) # # Build a corpus, which is a collection of text documents # VectorSource specifies that the source is character vectors. myCorpus <- Corpus(VectorSource(df$text)) #After that, the corpus needs a couple of transformations, including #changing letters to lower case, #removing punctuations/numbers and removing stop words. #The general English stop-word list is tailored by #adding "available" and "via" and removing "r". myCorpus <- tm_map(myCorpus, tolower) # lower case myCorpus <- tm_map(myCorpus, removePunctuation) # remove punctuation myCorpus <- tm_map(myCorpus, removeNumbers) # remove numbers # keep "r" by removing it from stopwords myStopwords <- c(stopwords('english'), "available", "via") idx <- which(myStopwords == "r") myStopwords <- myStopwords[-idx] myCorpus <- tm_map(myCorpus, removeWords, myStopwords) #---------------------------------------------------------------------------- #Stemming Words # In many cases, words need to be stemmed to retrieve their radicals. # For instance, "example" and "examples" are both stemmed to "exampl". # However, after that, one may want to complete the stems to their original # forms, so that the words would look "normal". dictCorpus <- myCorpus # stem words in a text document with the snowball stemmers, # which requires packages Snowball, RWeka, rJava, RWekajars myCorpus <- tm_map(myCorpus, stemDocument) #inspect(myCorpus[1:3]) # inspect the first three ``documents" #myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus) # stem completion # # #inspect(myCorpus[1:3]) #Print the first three documents in the built corpus. #---------------------------------------------------------------------------------------- #Building a Document-Term Matrix myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1)) # inspect(myDtm[266:270,31:40]) # # findFreqTerms(myDtm, lowfreq=10) #Frequent Terms and Associations # findAssocs(myDtm, 'analytics', 0.30) # which words are associated with "analytics"? #----------------------------------------------------------------------------------------- #Build the word cloud #After building a document-term matrix, we can show the importance of #words with a word cloud (also kown as a tag cloud) . m <- as.matrix(myDtm) # calculate the frequency of words v <- sort(rowSums(m), decreasing=TRUE) myNames <- names(v) d <- data.frame(word=myNames, freq=v) # Plot the word cloud pal <- brewer.pal(6,"Dark2") pal <- pal[-(1)] #random colors wordcloud(d$word,d$freq,c(4,1),2,,TRUE,TRUE,.15,pal)