
    #------------------------------------------------------------
    # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
    # February 14, 2013
    # Joseph B. Rickert
    # Technical Marketing Manager
    #
    # BIG DATA with RevoScaleR
    #
    # Copyright: Revolution Analytics
    # This script is licensed under the GPLv2 license
    # http://www.gnu.org/licenses/gpl-2.0.html
    # ----------------------------------------------------------------------
# LOOK AT THE MORTGAGE DEFAULT DATA
    #------------------------------------------------------------------------
    dataDir <- "C:/Users/Joseph/Documents/DATA/Mortgage Data/mortDefault"
    mdata <- file.path(dataDir,"mortDefault.xdf")
    rxGetInfo(mdata,getVarInfo=TRUE)

    #-----------------------------------------------------------------------------------
    ## Create a new data file having a variable with uniform random numbers
    # going from 1 to 10. This variable will be used to create the training and test
    # data sets.
    # A little note on how the random numbers are created:
    # A transform should work on an arbitrary chunk of data. Typically
    # RevoScaleR functions will test transforms on a small chunk before
    # fully processing. The internal variable (.rxNumRows) gives the size
    # of the chunk.

rxDataStep(inData = mdata, outFile = "mortDefault2",
           transforms = list(urns = as.integer(runif(.rxNumRows, 1, 11))),
           overwrite = TRUE)
    rxGetInfo("mortDefault2",getVarInfo=TRUE,numRows=3)

    #
    #------------------------------------------------------------
    # KMEANS ANALYSIS
    #------------------------------------------------------------
rxDataStep(inData = "mortDefault2", outFile = "mortDefault3",
           varsToDrop = "default",
           overwrite = TRUE)
    rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)

    form <- formula(~ creditScore + houseAge + yearsEmploy + ccDebt + year)
md.km <- rxKmeans(formula = form,
                  data = "mortDefault3",
                  numClusters = 3,
                  outFile = "mortDefault3",
                  algorithm = "lloyd",
                  overwrite = TRUE)
    rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)
    md.km
    # Build a data frame to do a plot
mdDf <- rxXdfToDataFrame(file = "mortDefault3",
                         rowSelection = urns == 5,
                         maxRowsByCols = 1000)

    plot(mdDf[,1:4],col=mdDf$.rxCluster)
    title(main="Clusters in Mortgage Default Data",line=3)

    ###### SCRIPT TO BUILD LOGISTIC REGRESSION MODEL TO PREDICT MORTGAGE DEFAULTS #####
    #---------------------------------------------------------------------------
# Some subsidiary functions
    #---------------------------------------------------------------------------
    # Function to compute a "long form" of the confusion matrix
Cmatrix <- function(df){
  df <- as.data.frame(df)
  df$Result <- c("True Negative","False Negative","False Positive","True Positive")
  df$PCT <- round(df$Counts/sum(df$Counts),2)*100
  df$Rates <- round(c(df$Counts[1]/(df$Counts[1]+df$Counts[3]),
                      df$Counts[2]/(df$Counts[2]+df$Counts[4]),
                      df$Counts[3]/(df$Counts[1]+df$Counts[3]),
                      df$Counts[4]/(df$Counts[2]+df$Counts[4])),2)
  names(df) <- c("Actual","Predicted","Counts","Results","Pct","Rates")
  return(df)
}
    #------------------------------------------------------------------------------
    ##### CREATE TRAINING AND TEST FILES
    #-----------------------------------
    #info <- rxGetInfo(mdata)
    #N <- info$numRows
    #

    #-------------------------------------------------------------------------------
    # BUILD THE TRAINING FILE
    #------------------------
rxDataStepXdf(inFile = "mortDefault2",
              outFile = "mdTrain",
              rowSelection = urns < 9,
              transforms = list(CS = creditScore,
                                YR = year,
                                yrE = yearsEmploy,
                                HA = houseAge,
                                ccD = ccDebt),
              blocksPerRead = 20,
              rowsPerRead = 500000,
              overwrite = TRUE)

    rxGetInfo("mdTrain",getVarInfo=TRUE,numRows=5)
    rxHistogram(~default,data="mdTrain")
    #-------------------------
    # BUILD THE TEST FILE
    #-------------------------
rxDataStepXdf(inFile = "mortDefault2",
              outFile = "mdTest",
              rowSelection = urns > 8,
              transforms = list(CS = creditScore,
                                YR = year,
                                yrE = yearsEmploy,
                                HA = houseAge,
                                ccD = ccDebt),
              blocksPerRead = 20,
              rowsPerRead = 500000,
              overwrite = TRUE)
    #
    rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
    rxHistogram(~default,data="mdTest")
    #---------------------------------------------------------------------------
    # BUILD A CLASSIFICATION MODEL USING LOGISTIC REGRESSION
    #---------------------------------------------------------------------------
system.time(
  model <- rxLogit(default ~ F(houseAge) + F(year) + creditScore + yearsEmploy + ccDebt,
                   data = "mdTrain",
                   reportProgress = rxGetOption("reportProgress"))
)
    #
    #Elapsed computation time: 21.533 secs.
    #user system elapsed
    #56.15 12.02 21.55


    #Elapsed computation time: 23.149 secs.
    #user system elapsed
    #56.81 10.58 23.17
    #Elapsed computation time: 24.384 secs.
    #user system elapsed
    #59.29 10.31 24.48

    summary(model)

    #----------------------------------------------------------------------
    # MAKE PREDICTIONS ON THE TEST DATA USING THE MODEL CREATED ABOVE
    #----------------------------------------------------------------------
    rxPredict(modelObject=model,data="mdTest",outData="mdTest",overwrite=TRUE,predVarNames="LogitPred")
    rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
    #rxSummary(~default_Pred,data="mdTest")
    # Add a new prediction variable
rxDataStep(inData = "mdTest", outFile = "mdTest",
           transforms = list(LogitPred.L = as.logical(round(LogitPred))),
           overwrite = TRUE)
    #
    rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)

    #-------------------------------------------------------------------------------
    # GENERATE THE CONFUSION MATRIX
    #-------------------------------
    conMc <- rxCube(~ F(default):F(LogitPred.L),data="mdTest")
    Cmatrix(conMc)

    # Examine the performance of the model
    total.pct.correct <- round(100*(conMc$Counts[1]+conMc$Counts[4]) / sum(conMc$Counts),2)
    total.pct.correct
    #-----------------------------------------------------------------------------------
    # Generate the ROC Curve
    #
    rxRocCurve(actualVarName="default",predVarName="LogitPred",data="mdTest")
    #
    #-------------------------------------------------------------------------------------

    # BUILD A TREE MODEL
system.time(
  model.tree <- rxDTree(default ~ HA + YR + CS + yrE + ccD,
                        data = "mdTrain",
                        blocksPerRead = 1,
                        maxDepth = 5,
                        reportProgress = rxGetOption("reportProgress"))
)
    ##

    #Elapsed time for RxDTreeBase: 89.545 secs.
    #
    #user system elapsed
    #245.13 12.50 89.57


    #Elapsed time for RxDTreeBase: 403.785 secs.
    # This was to fully build out the tree
    #user system elapsed
    #1092.37 75.89 403.83

    model.tree
    #
    #----------------------------------------------------------------
    # Plot the Tree
    plot(rxAddInheritance(model.tree),uniform=TRUE)
    text(rxAddInheritance(model.tree),digits=2)
    title(main="Classification Tree for Mortgage Data",
    sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
    #-------------------------------------------------------------------

    ###### - END DEMO HERE - ###########







    #
    #------------------------------------------------------------
    # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
    # February 14, 2013
    # Joseph B. Rickert
    # Technical Marketing Manager
    #
    # GETTING STARTED
    #
    # Copyright: Revolution Analytics
    # This script is licensed under the GPLv2 license
    # http://www.gnu.org/licenses/gpl-2.0.html

    #---------------------------------------------------------------------
    # Execute the following command to install all of the packages needed for the webinar
    #install.packages(c( "ada","boot","caret","corrplot","doParallel","ellipse",
    #"ISwR","partykit","pROC","rattle","RColorBrewer",
    #"rpart","Snowball","ROCR","tm","twitteR","wordcloud"))
    #
    #----------------------------------------------------------------------
    # A First look at R
    # A simple regression example from
    # Statistics and Computing, Introductory Statistics with R
    # Peter Dalgaard, Springer 2002
    ##
    library(ISwR) # Load a library
    data() # Have a look at what data sets are available
    data(thuesen) # Load thuesen into the environment
    thuesen # Have a look at it
    class(thuesen) # Find out what kind of object thuesen is
    sapply(thuesen,class) # See what kinds of animal the variables are
    #
    plot(short.velocity ~ blood.glucose, data=thuesen) #plot the data using the formula interface
    #
plot(thuesen$blood.glucose,thuesen$short.velocity) # plot the data by indexing into the data frame
    #
    model <- lm(short.velocity ~ blood.glucose, data=thuesen) # build a linear model
    summary(model) # Look at the results
    str(model) # Look at the structure of the model object
    # Build a fancier plot
plot(x = thuesen$blood.glucose,
     y = thuesen$short.velocity,
     xlab = "blood glucose (mmol / l)",
     ylab = "circumferential shortening velocity (%/s)",
     main = "Thuesen Data set",
     col = "blue",
     pch = 19)
    abline(model,col="red")
    #
    par(mfrow=c(2,2)) # Set up for multiple plots
    plot(model, col="blue") # look at some diagnostics

    #---------------------------------------------------------------------
    #
    # A FIRST LOOK AT FUNCTIONS
    #
    # Let's create a simple function
joe.stats <- function(data){
  min <- min(data)
  max <- max(data)
  q <- quantile(data, probs = seq(0, 1, 0.25))
  res <- list(min, max, q)
  return(res)
}

    attach(thuesen) # make the columns of thuesen available
    # in the global environment as variables
    joe.stats(blood.glucose) # Run our function

    summary(blood.glucose) # R does it better


    # Set up for later
    rm(list=ls())
    load("WEBINAR_2-14-13_Intro_R_DM_caret .RData")
    #--------------------------------------------------------------------------------
    #SOME ADDITIONAL ONLINE RESOURCES
    #An Introduction to R
    #Notes on R: A Programming Environment for Data Analysis and Graphics
    #Version 2.15.2 (2012-10-26)
    #http://cran.r-project.org/doc/manuals/R-intro.pdf
    #
    #Using R for Data Analysis and Graphics
    #Introduction, Code and Commentary
    #J H Maindonald
    #http://cran.r-project.org/doc/contrib/usingR.pdf
    #------------------------------------------------------------------------
    # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
    # February 14, 2013
    # Joseph B. Rickert
    # Technical Marketing Manager
    #
    #### BUILD A TREE MODEL WITH RPART AND EVALUATE #####
    #
    # Copyright: Revolution Analytics
    # This script is licensed under the GPLv2 license
    # http://www.gnu.org/licenses/gpl-2.0.html
    #-------------------------------------------------------------------------
# This script divides the data into training and test data sets,
# builds two different decision trees (rpart) using the training data, and
# evaluates their performance using the test data set.
# An ROC curve is produced for the better model.
    #------------------------------------------------------------------------
    library(rattle)
    library(rpart)
    library(ROCR)
    library(caret)
    # -----------------------------------------------------------------------
    # Read in the data from disk
    # name <- "weather.csv"
    # path <- file.path(getwd(),name)
    # weather <- read.csv(path,header=TRUE)
    # Show weather on the IDE editor
    data(weather)
    head(weather)
    #------------------------------------------------------------------------
    # Select variables for the model
    weather <- subset(weather,select=c(MinTemp:RainTomorrow))
    set.seed(42) # Set seed
    #-------------------------------------------------------------------------
# Determine the observations for the training and test data sets.
    N <- nrow(weather) # 366 observations
    train <- sample(N, 0.8*N) # 292 observations
test <- setdiff(seq_len(N),train) # 74 observations not in train
    #-------------------------------------------------------------------------
    # Build the model
    M <- ncol(weather)
    input <- names(weather)[1:(M-2)] # names of input variables
    target <- "RainTomorrow" # name of target variable
    form <- formula(RainTomorrow ~ .) # Describe the model to R
tree.m <- rpart(RainTomorrow ~ .,
                data = weather[train, c(input,target)],
                method = "class",
                parms = list(split="information"),
                control = rpart.control(usesurrogate=0, maxsurrogate=0))
    #---------------------------------------------------------------------------
    # Look at the textual description of the tree.
    tree.m # print the model
    printcp(tree.m) # print the table of optimal prunings based on the complexity parameter
    #----------------------------------------------------------------------------
    # Plot the tree
    drawTreeNodes(tree.m)
    title(main="Weather Data tree.m",
    sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
    #----------------------------------------------------------------------------
    # Evaluate performance
# Run the tree model on the test set
    pred <- predict(tree.m, weather[test, c(input,target)], type="class")
levels(pred) <- c("Yes","No") # change order of levels to match documentation for confusionMatrix
    # Generate the confusion matrix
    actual <- weather[test, c(input,target)]$RainTomorrow
levels(actual) <- c("Yes","No") # change order of levels to match documentation for confusionMatrix
    AP <- c("Predicted","Actual") # row names for CM
    CM <- table(pred,actual,dnn=AP) # CM counts
    confusionMatrix(CM) # from the caret package
    ?confusionMatrix # Look at meaning of confusionMatrix outputs

    # Notes
# The "no-information rate" shown on the output is the largest proportion of the observed classes
    # A one-sided hypothesis test is computed to evaluate whether the overall accuracy rate is greater
    # than the rate of the largest class. This is helpful for data sets where there is a large imbalance
    # between the classes.
    #
    # The kappa statistic yields a measure of how well the actual and predicted values agree
    # See http://www.chestx-ray.com/statistics/kappa.html or
    # http://en.wikipedia.org/wiki/Cohen%27s_kappa
    #
    # The null hypothesis for McNemar's chi squared test is that the actual and predicted
    # probabilities are the same
    # See http://en.wikipedia.org/wiki/McNemar%27s_test
    #
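# A small sketch (not part of the original webinar code) that recomputes two of
# these statistics by hand from the 2 x 2 table CM built above
# (rows = predicted, columns = actual), just to make the definitions concrete.
# The object names below (niRate, pObs, pExp, kappa.by.hand) are made up here.
niRate <- max(colSums(CM))/sum(CM)              # no-information rate: largest observed class proportion
pObs <- sum(diag(CM))/sum(CM)                   # observed agreement (overall accuracy)
pExp <- sum(rowSums(CM)*colSums(CM))/sum(CM)^2  # agreement expected by chance
kappa.by.hand <- (pObs - pExp)/(1 - pExp)       # Cohen's kappa
c(NoInformationRate = niRate, Kappa = kappa.by.hand)
#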
    #--------------------------------------------------------------------------------------------
    # Try another model using different variables
    form <- formula(RainTomorrow ~ Cloud9am + Pressure9am + WindDir9am + Temp9am + Humidity9am)
tree.m2 <- rpart(form,
                 data = weather[train, c(input,target)],
                 method = "class",
                 parms = list(split="information"),
                 control = rpart.control(usesurrogate=2,
                                         maxsurrogate=0,
                                         minsplit=30,
                                         maxdepth=20))
    #----------------------------------------------------------------------------------------------
    # Plot the new tree
    drawTreeNodes(tree.m2)
    title(main="Weather Data tree.m2",
    sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))

tree.mod.p <- as.party(tree.m2) # convert tree.m2 into a party object
    plot(tree.mod.p)

    #----------------------------------------------------------------------------
    # Evaluate performance of the new model on the test set
    pred2 <- predict(tree.m2, weather[test, c(input,target)], type="class")
    levels(pred2) <- c("Yes","No")
    CM2 <- table(pred2,actual,dnn=AP)
    confusionMatrix(CM2)
    # -----------------------------------------------------------------------------------
    #
    # GENERATE THE ROC CURVE FOR THE BEST MODEL
    prROC <- predict(tree.m, weather[test, c(input,target)])[,2]
    #
# Get the vector RainTomorrow from the test data set
    testRT <- weather[test, c(input,target)]$RainTomorrow
    pr <- prediction(prROC, testRT)
    #------------------------------------------------------------------------------------
    # Plot the ROC curve
    plot(performance(pr, "tpr", "fpr"), col="#CC0000FF", lty=1, lwd=2,add=FALSE)
    #fpr: False positive rate. P(Yhat = + | Y = -). Estimated as: FP/N.
    #tpr: True positive rate. P(Yhat = + | Y = +). Estimated as: TP/P.
    segments(0,0,1,1,col="blue",lwd=2)
    # Add a legend to the plot.
    legend("bottomright", c("tree.m"), col=rainbow(1, 1, .8), lty=1:1, title="Models", inset=c(0.05, 0.05))
    # Add decorations to the plot.
    title(main="ROC Curve weather.csv [test data]",
    sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
    #


    #------------------------------------------------------------------------------
    # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
    # February 14, 2013
    # Joseph B. Rickert
    # Technical Marketing Manager
    #
    # DATA MINING with CARET
    #
    # Copyright: Revolution Analytics
    # This script is licensed under the GPLv2 license
    # http://www.gnu.org/licenses/gpl-2.0.html
    #------------------------------------------------------------------------------
    # INTRODUCTION TO THE CARET PACKAGE
# caret is a feature-rich package for doing data mining in R.
    # This script explores caret's capabilities using data included in the
    # package that was described in the paper:
    # Hill et al "Impact of image segmentation on high-content
    # screening data quality for SK-BR-3 cells"
# BMC Bioinformatics (2007) vol 8 (1) pp. 340
    #
    # Background
# Well-segmented cells are cells for which location and size may be accurately determined
    # through optical measurements. Cells that are not Well-segmented (WS) are said to be
    # Poorly-segmented (PS).
    #
    # Problem
# Given a set of optical measurements, can we predict which cells will be PS?
    # This is a classic classification problem
    #---------------------------
    library(ada) # Boosting algorithms
    library(caret)
    library(rpart) # CART algorithm for decision trees
    library(partykit) # Plotting trees
    library(doParallel) # parallel processing
# by default doParallel uses
# multicore functionality on Unix (single machine only) and
# snow functionality on Windows (cluster)


    library(pROC) # plot the ROC curve
    library(corrplot) # plot correlations

    #---------------------------
    # data(package="caret")
    data(segmentationData) # Load the segmentation data set
dim(segmentationData)
#[1] 2019 61
head(segmentationData) # Have a look at the data
    trainIndex <- createDataPartition(segmentationData$Case,p=.5,list=FALSE)
    trainData <- segmentationData[trainIndex,]
    dim(trainData)
    #1010 61
    testData <- segmentationData[-trainIndex,]
    dim(testData)
    #1009 61
    #-------------------------------------------------------------------------------------
    # VISUALIZE CORRELATIONS
    trainV <- trainData[,4:61]
    corrplot(cor(trainV),order="hclust",tl.cex=.5,method="ellipse")

    #-----------------------------------------------------------------
    # BUILD AN ADABOOST MODEL WITH ADA
    form <- formula(Class ~ .)
control <- rpart.control(maxdepth = 30,  # the maximum depth of any node of the final tree
                         cp = 0.01,      # complexity parameter: any split that does not decrease the overall lack of fit by a factor of cp is not attempted
                         minsplit = 20,  # the minimum number of observations that must exist in a node in order for a split to be attempted
                         xval = 10)      # number of cross-validations

ada.model <- ada(formula = form,
                 data = trainData,
                 control = control,
                 nu = .01,      # shrinkage parameter for boosting
                 iter = 50)

    ada.model$model[[1]] # Look at the trees in the model
    ada.model # look at the model performance
    plot(ada.model,TRUE) # Plot error rate vs. iterations of the model
    varplot(ada.model) # Variable importance plot
    #----------------------------------------------------------------------
    # FIND THE "BEST" MODEL
    #
# This is an interesting model, but how do you select the best values
# for the three tuning parameters?
# nu
# iter
# maxdepth
    #---------------------------------------------------------------------------------
    # Algorithm for training the model:
    # for each resampled data set do
    # hold out some samples
    # for each combination of the three tuning parameters
    # do
    # Fit the model on the resampled data set
    # Predict the values of class on the hold out samples
    # end
    # Calculate AUC: the area under the ROC for each sample
# Select the combination of tuning parameters that yields the best AUC
    #
    # caret provides the "train" function to do all of this
    #
# The trainControl function sets the training method
    # Note the default method of picking the best model is accuracy and Cohen's Kappa
    #
    #-----------------------------------------------------------------------------------
    # Set up the parameters to run the boosting function
ctrl <- trainControl(method = "repeatedcv",             # use repeated k-fold cross validation
                     number = 5,                        # the number of folds
                     repeats = 2,                       # do 2 repetitions of 5-fold cv
                     summaryFunction = twoClassSummary, # use AUC to pick the best model
                     classProbs = TRUE)
    # Use the expand.grid to specify the search space
    # Note that the default search grid selects 3 values of each tuning parameter
    #
grid <- expand.grid(.nu = c(.1, 1),
                    .iter = c(20, 50),
                    .maxdepth = c(20, 30))
    #
    set.seed(1)
    #names(trainData)
    trainX <-trainData[,4:61]
    #-----------------------------------------------------------------
    # PARALLEL COMPUTING
    # vignette("gettingstartedParallel")

cl <- makeCluster(4)    # Use this to manually create a cluster.
                        # But, since I only have a single Windows machine,
                        # all I am really doing is passing the number of
                        # cores to use to registerDoParallel()
registerDoParallel(cl)  # Register a parallel backend for train
    getDoParWorkers()

system.time(ada.tune <- train(x = trainX, y = trainData$Class,
                              method = "ada",
                              metric = "ROC",
                              trControl = ctrl,
                              control = control,
                              tuneGrid = grid))
    #
    stopCluster(cl)

    #user system elapsed
    #14.33 0.02 206.25
    #-------------------------------------------------------------------------------
    # ADA RESULTS
    ada.tune # Look at the results for the training grid
    ada.tune$finalModel # Look at the performance of the final model
    plot(ada.tune) # Plot the performance of the training models
    #--------------------------------------------------------------------------------
    # ADA PREDICTIONS
    testX <- testData[,4:61]
    ada.pred <- predict(ada.tune,testX)
    #
    confusionMatrix(ada.pred,testData$Class)
    #-----------------------------------------------------------------
    # DRAW THE ROC CURVE
    # Use roc function from the pROC package
    ada.probs <- predict(ada.tune,testX,type="prob")
ada.ROC <- roc(predictor = ada.probs$PS,
               response = testData$Class,
               levels = rev(levels(testData$Class)))
    plot(ada.ROC,col=2)
    ada.ROC$auc # Get the area under the curve
    #------------------------------------------------------------------------------------
    #
    # SUPPORT VECTOR MACHINE MODEL
    #
    set.seed(1)
    registerDoParallel(4,cores=4)
    getDoParWorkers()
system.time(
  svm.tune <- train(x = trainX,
                    y = trainData$Class,
                    method = "svmRadial",
                    tuneLength = 5,               # 5 values of the cost function
                    preProc = c("center","scale"),
                    metric = "ROC",
                    trControl = ctrl)             # same as for ada above
)

    #user system elapsed
    #2.40 0.14 26.10


    #--------------------------------------------------------------
    # SVM RESULTS
    svm.tune # Look at the results for the training grid
svm.tune$finalModel # Look at the performance of the final model
plot(svm.tune,
     metric = "ROC",
     scales = list(x = list(log = 2)))
    #---------------------------------------------------------------
    # SVM PREDICTIONS
    svm.pred <- predict(svm.tune,testX)
    confusionMatrix(svm.pred,testData$Class)
    #
    #----------------------------------------------------------------
    # COMPARE THE SVM AND ADA MODELS USING RESAMPLING
    #
    # Because we set the same seed before running the models we can compare the models using resampling
# See Hothorn et al, "The design and analysis of benchmark experiments"
    # Journal of Computational and Graphical Statistics (2005) vol 14 (3) pp 675-699
    # for comparing models using resampling.
    #
    # The resamples function in caret collates the resampling results from the two models
    rValues <- resamples(list(svm=svm.tune,ada=ada.tune))
    rValues$values # Look at the resample values
    summary(rValues) # Summarize the resamples

    #---------------------------------------------
    xyplot(rValues,metric="ROC") # scatter plot
    bwplot(rValues,metric="ROC") # boxplot
    parallel(rValues,metric="ROC") # parallel plot
    dotplot(rValues,metric="ROC") # dotplot
    #
    ##############################################################################
    # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
    # February 14, 2013
    # Joseph B. Rickert
    # Technical Marketing Manager
    #
    # ROLL with RATTLE
    #
    # Copyright: Revolution Analytics
    # This script is licensed under the GPLv2 license
    # http://www.gnu.org/licenses/gpl-2.0.html

    #################################################################################
    #
    library(rattle) # load the rattle package
    rattle() # start the rattle user interface
    #
    #data() # see what data sets are available in all of the loaded packages
data(package="rattle") # see what data sets are available in rattle
    ls("package:rattle") # See what functions are in the Rattle package
    #lsf.str("package:rattle") # see what functions are in rattle
    #
    # THE FOLLOWING INSTRUCTIONS SHOULD BE HELPFUL FOR EXPLORING THE RATTLE GUI
    #
    # LOAD THE WEATHER DATA SET.
    # The weather data set consists of observations made at a weather monitoring station
# in Canberra, Australia. Each observation describes the weather conditions on a particular day.
# See page 25 of Graham Williams' Data Mining with Rattle and R: The Art of Excavating Data
# for Knowledge Discovery, Springer 2011
    #
    # Go to the Data Tab and click on Execute
    # Rattle will ask if you want to use the weather data as default. Click yes.
    #
    # SUMMARY STATISTICS
    # Go to the Explore Tab
    # Select summary and basics
    # Hit Execute
    #
    # SCATTER PLOTS
    # Go to the Explore Tab
    # Select Distributions
    # Click on Execute
    #
    # LOOK AT A SINGLE VARIABLE
    # Go to Explore Tab
    # Select RainTomorrow Bar Plot
    # Hit Execute
    # This produces a bar plot of the target variable RainTomorrow
    # 84% of the observations have no rain
    # A model that always predicts no rain should be about 84% accurate
    #
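# A quick sketch (not part of the Rattle GUI flow) to check that figure from the
# console; it assumes rattle's weather data has been loaded with data():
# data(weather, package="rattle")
# round(100*prop.table(table(weather$RainTomorrow)),1)  # % of "No" vs "Yes" days
#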
    # INVESTIGATE MULTIPLE VARIABLES
    # Go to the Explore Tab
    # In the upper panel select Box Plot and Histogram for both
    # MaxTemp
    # Sunshine
    # Click Execute
    #
    # Boxplots top left: Temperature generally higher day before it rains
    # Boxplots top right: Less sunshine day before it rains
    #
    # CORRELATIONS
    # Go to the Explore Tab
# Unselect any variables that may be selected
    # Select Correlation
    # Click on Execute
    #
    #
    # INTERACTIVELY EXPLORE DATA
# Select Interactive and then Latticist
    # In bottom center panel
    # Select MaxTemp for y axis and
    # Select Sunshine for x axis
    # Place crosshair on outlier and right click
    #
    # BUILD A TREE MODEL
    # Go to Model Tab
    # Select Tree
    # Click Execute
    # Click on Draw button to get the graph
    # Click on Rules button to see rules
    # Select Log Tab to look at R code
    #
    # EVALUATE THE MODEL
    # Go to the Evaluate tab
    # Select
    # Type = Error Matrix
    # Model = Tree
    # Data = Testing
    # Click on Execute
    #
    # Error matrix for the Decision Tree model on weather.csv [test] (counts):
    #
#            Predicted
# Actual      No  Yes
#   No        35    6   False positive rate = FP/N = 6/(35+6) = .146
#                         = negatives incorrectly classified / total negatives
#   Yes        5   10   True positive rate = TP/P = 10/(10+5) = .667
#                         = positives correctly classified / total positives
#                         = sensitivity = recall = hit rate
#                       True negative rate = TN/(FP + TN) = 1 - FP rate = .854
#                         = specificity
    #
    # False positives = 6 = Type I Error (Test rejects true null hypothesis)
    # False negatives = 5 = Type II Error (Test fails to reject false null hypothesis)

    #Error matrix for the Decision Tree model on weather.csv [test] (%):
    #
#            Predicted
# Actual      No  Yes
#   No        62   11   62% (35/56) of cases: the model predicts it won't rain and it didn't
#   Yes        9   18   18% (10/56) of cases: the model predicts it will rain and it did
# Accuracy of test = 62% + 18% = 80%
    #
    #Overall error: 0.1964286
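#
# A minimal sketch (not produced by Rattle) that recomputes the rates quoted
# above from the raw counts of the error matrix, to make the arithmetic explicit;
# the counts are the ones shown earlier (rows = Actual, columns = Predicted).
em <- matrix(c(35,  6,
                5, 10),
             nrow = 2, byrow = TRUE,
             dimnames = list(Actual = c("No","Yes"), Predicted = c("No","Yes")))
fpRate <- em["No","Yes"]/sum(em["No",])     # 6/41  = .146 (false positive rate)
tpRate <- em["Yes","Yes"]/sum(em["Yes",])   # 10/15 = .667 (sensitivity / recall)
tnRate <- em["No","No"]/sum(em["No",])      # 35/41 = .854 (specificity)
accuracy <- sum(diag(em))/sum(em)           # 45/56 = .804
overall.error <- 1 - accuracy               # .196
round(c(FPrate = fpRate, TPrate = tpRate, TNrate = tnRate,
        Accuracy = accuracy, OverallError = overall.error), 3)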

    #------------------------------------------------------------
    # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
    # February 14, 2013
    # Joseph B. Rickert
    # Technical Marketing Manager
    #
    # JUST FOR FUN - BUILD A WORD CLOUD
    #
    # Copyright: Revolution Analytics
    # This script is licensed under the GPLv2 license
    # http://www.gnu.org/licenses/gpl-2.0.html


    # From example at RDataMining
    # http://www.rdatamining.com/examples/text-mining
# That page shows an example of text mining Twitter data
    #-----------------------------------------------------------------------------
# Load the necessary libraries
    library(twitteR) # twitteR provides access to Twitter data
    library(tm) # tm provides functions for text mining
library(Snowball) # Wrappers for Weka Java stemming functions
    library(wordcloud) # wordcloud visualizes the result with a word cloud
    library(RColorBrewer) # provides the rainbow colors
    #------------------------------------------------------------------------------
# retrieve the first 100 tweets (or all tweets if fewer than 100)
# matching the #rstats hashtag
    #
    Tweets <- searchTwitter("#rstats",n=100)
    n <- length(Tweets)
    # Tweets[1:3]
    #
    #-------------------------------------------------------------------------------
    #Transforming Text
    #The tweets are first converted to a data frame and then to a corpus.
    df <- do.call("rbind", lapply(Tweets, as.data.frame))
    #dim(df)
    # Just in case twitter is off-line
    #df <-read.csv("UseRTweets.csv",header=TRUE,row.names=1)
    #head(df)
    #
    # Build a corpus, which is a collection of text documents
    # VectorSource specifies that the source is character vectors.
    myCorpus <- Corpus(VectorSource(df$text))

    #After that, the corpus needs a couple of transformations, including
    #changing letters to lower case,
    #removing punctuations/numbers and removing stop words.
    #The general English stop-word list is tailored by
    #adding "available" and "via" and removing "r".

    myCorpus <- tm_map(myCorpus, tolower) # lower case
    myCorpus <- tm_map(myCorpus, removePunctuation) # remove punctuation
    myCorpus <- tm_map(myCorpus, removeNumbers) # remove numbers
    # keep "r" by removing it from stopwords
    myStopwords <- c(stopwords('english'), "available", "via")
    idx <- which(myStopwords == "r")
    myStopwords <- myStopwords[-idx]
    myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
    #----------------------------------------------------------------------------
    #Stemming Words
    # In many cases, words need to be stemmed to retrieve their radicals.
    # For instance, "example" and "examples" are both stemmed to "exampl".
    # However, after that, one may want to complete the stems to their original
    # forms, so that the words would look "normal".
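#
# A quick check of the "exampl" claim above -- a sketch that assumes the
# SnowballC package is installed (it is not the Snowball wrapper loaded earlier):
# SnowballC::wordStem(c("example","examples"))   # both stem to "exampl"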

    dictCorpus <- myCorpus
    # stem words in a text document with the snowball stemmers,
    # which requires packages Snowball, RWeka, rJava, RWekajars
    myCorpus <- tm_map(myCorpus, stemDocument)

#inspect(myCorpus[1:3]) # inspect the first three "documents"
    #myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus) # stem completion
    #
    #
    #inspect(myCorpus[1:3]) #Print the first three documents in the built corpus.
    #----------------------------------------------------------------------------------------
    #Building a Document-Term Matrix
    myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
    # inspect(myDtm[266:270,31:40])
    #
    # findFreqTerms(myDtm, lowfreq=10) #Frequent Terms and Associations
    # findAssocs(myDtm, 'analytics', 0.30) # which words are associated with "analytics"?
    #-----------------------------------------------------------------------------------------
    #Build the word cloud
    #After building a document-term matrix, we can show the importance of
#words with a word cloud (also known as a tag cloud).
    m <- as.matrix(myDtm)
    # calculate the frequency of words
    v <- sort(rowSums(m), decreasing=TRUE)
    myNames <- names(v)
    d <- data.frame(word=myNames, freq=v)
    # Plot the word cloud
    pal <- brewer.pal(6,"Dark2")
    pal <- pal[-(1)]
    #random colors
wordcloud(words = d$word, freq = d$freq, scale = c(4,1), min.freq = 2,
          random.order = TRUE, random.color = TRUE, rot.per = .15, colors = pal)