# Forked from joseph-rickert/BIG DATA with RevoScaleR
#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# BIG DATA with RevoScaleR
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#----------------------------------------------------------------------
# LOOK AT THE MORTGAGE DEFAULT DATA
#------------------------------------------------------------------------
dataDir <- "C:/Users/Joseph/Documents/DATA/Mortgage Data/mortDefault"
mdata <- file.path(dataDir,"mortDefault.xdf")
rxGetInfo(mdata,getVarInfo=TRUE)
#-----------------------------------------------------------------------------------
## Create a new data file having a variable with uniform random numbers
# going from 1 to 10. This variable will be used to create the training and test
# data sets.
# A note on how the random numbers are created:
# a transform should work on an arbitrary chunk of data. Typically
# RevoScaleR functions will test transforms on a small chunk before
# fully processing. The internal variable .rxNumRows gives the size
# of the chunk.
rxDataStep(inData = mdata, outFile = "mortDefault2",
           transforms = list(urns = as.integer(runif(.rxNumRows,1,11))),
           overwrite = TRUE)
rxGetInfo("mortDefault2",getVarInfo=TRUE,numRows=3)
#
#------------------------------------------------------------
# KMEANS ANALYSIS
#------------------------------------------------------------
rxDataStep(inData="mortDefault2", outFile="mortDefault3",
           varsToDrop="default",
           overwrite=TRUE)
rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)

form <- formula(~ creditScore + houseAge + yearsEmploy + ccDebt + year)
md.km <- rxKmeans(formula=form,
                  data="mortDefault3",
                  numClusters=3,
                  outFile="mortDefault3",
                  algorithm="lloyd",
                  overwrite=TRUE)
rxGetInfo("mortDefault3",getVarInfo=TRUE,numRows=5)
md.km
# Build a data frame to do a plot
mdDf <- rxXdfToDataFrame(file="mortDefault3",
                         rowSelection=urns == 5,
                         maxRowsByCols=1000)
plot(mdDf[,1:4],col=mdDf$.rxCluster)
title(main="Clusters in Mortgage Default Data",line=3)

###### SCRIPT TO BUILD LOGISTIC REGRESSION MODEL TO PREDICT MORTGAGE DEFAULTS #####
#---------------------------------------------------------------------------
# Some subsidiary functions
#---------------------------------------------------------------------------
# Function to compute a "long form" of the confusion matrix
Cmatrix <- function(df){
  df <- as.data.frame(df)
  df$Result <- c("True Negative","False Negative","False Positive","True Positive")
  df$PCT <- round(df$Counts/sum(df$Counts),2)*100
  df$Rates <- round(c(df$Counts[1]/(df$Counts[1]+df$Counts[3]),
                      df$Counts[2]/(df$Counts[2]+df$Counts[4]),
                      df$Counts[3]/(df$Counts[1]+df$Counts[3]),
                      df$Counts[4]/(df$Counts[2]+df$Counts[4])),2)
  names(df) <- c("Actual","Predicted","Counts","Results","Pct","Rates")
  return(df)
}
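# Illustration added for this write-up (not part of the original demo):
# Cmatrix() expects a data frame in the column order produced by the
# rxCube call further below (actual, predicted, Counts), with the four
# counts ordered TN, FN, FP, TP. The numbers here are made up purely to
# show the shape of the output.
exampleCube <- data.frame(default = c(0,1,0,1),
                          pred    = c(FALSE,FALSE,TRUE,TRUE),
                          Counts  = c(880,15,40,65))
Cmatrix(exampleCube)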
#------------------------------------------------------------------------------
##### CREATE TRAINING AND TEST FILES
#-----------------------------------
#info <- rxGetInfo(mdata)
#N <- info$numRows
#
#-------------------------------------------------------------------------------
# BUILD THE TRAINING FILE
#------------------------
rxDataStepXdf(inFile = "mortDefault2",
              outFile = "mdTrain",
              rowSelection = urns < 9,
              transforms = list(CS = creditScore,
                                YR = year,
                                yrE = yearsEmploy,
                                HA = houseAge,
                                ccD = ccDebt),
              blocksPerRead=20,
              rowsPerRead=500000,
              overwrite=TRUE)
rxGetInfo("mdTrain",getVarInfo=TRUE,numRows=5)
rxHistogram(~default,data="mdTrain")
#-------------------------
# BUILD THE TEST FILE
#-------------------------
rxDataStepXdf(inFile = "mortDefault2",
              outFile = "mdTest",
              rowSelection = urns > 8,
              transforms = list(CS = creditScore,
                                YR = year,
                                yrE = yearsEmploy,
                                HA = houseAge,
                                ccD = ccDebt),
              blocksPerRead=20,
              rowsPerRead=500000,
              overwrite=TRUE)
#
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
rxHistogram(~default,data="mdTest")
#---------------------------------------------------------------------------
# BUILD A CLASSIFICATION MODEL USING LOGISTIC REGRESSION
#---------------------------------------------------------------------------
system.time(
  model <- rxLogit(default ~ F(houseAge) + F(year) + creditScore + yearsEmploy + ccDebt,
                   data="mdTrain",
                   reportProgress=rxGetOption("reportProgress"))
)
#
# Elapsed computation time: 21.533 secs.
#    user  system elapsed
#   56.15   12.02   21.55
# Elapsed computation time: 23.149 secs.
#    user  system elapsed
#   56.81   10.58   23.17
# Elapsed computation time: 24.384 secs.
#    user  system elapsed
#   59.29   10.31   24.48
summary(model)
#----------------------------------------------------------------------
# MAKE PREDICTIONS ON THE TEST DATA USING THE MODEL CREATED ABOVE
#----------------------------------------------------------------------
rxPredict(modelObject=model,data="mdTest",outData="mdTest",
          overwrite=TRUE,predVarNames="LogitPred")
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
#rxSummary(~default_Pred,data="mdTest")
# Add a new prediction variable
rxDataStep(inData="mdTest",outFile="mdTest",
           transforms=list(LogitPred.L = as.logical(round(LogitPred))),
           overwrite=TRUE)
#
rxGetInfo("mdTest",getVarInfo=TRUE,numRows=5)
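# Aside added for this write-up (not part of the original demo):
# as.logical(round(LogitPred)) classifies at a 0.5 probability cutoff.
# For an imbalanced outcome such as mortgage default a different cutoff
# may be preferable; the 0.3 used here is an arbitrary value chosen only
# for illustration, using the same rxDataStep transform pattern as above.
rxDataStep(inData="mdTest",outFile="mdTest",
           transforms=list(LogitPred.3 = LogitPred > 0.3),
           overwrite=TRUE)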
#-------------------------------------------------------------------------------
# GENERATE THE CONFUSION MATRIX
#-------------------------------
conMc <- rxCube(~ F(default):F(LogitPred.L),data="mdTest")
Cmatrix(conMc)
# Examine the performance of the model
total.pct.correct <- round(100*(conMc$Counts[1]+conMc$Counts[4])/sum(conMc$Counts),2)
total.pct.correct
#-----------------------------------------------------------------------------------
# Generate the ROC curve
#
rxRocCurve(actualVarName="default",predVarName="LogitPred",data="mdTest")
#
#-------------------------------------------------------------------------------------
# BUILD A TREE MODEL
system.time(
  model.tree <- rxDTree(default ~ HA + YR + CS + yrE + ccD,
                        data="mdTrain",
                        blocksPerRead=1,
                        maxDepth=5,
                        reportProgress=rxGetOption("reportProgress"))
)
##
# Elapsed time for RxDTreeBase: 89.545 secs.
#
#    user  system elapsed
#  245.13   12.50   89.57
# Elapsed time for RxDTreeBase: 403.785 secs.
# This was to fully build out the tree
#    user  system elapsed
# 1092.37   75.89  403.83
model.tree
#
#----------------------------------------------------------------
# Plot the tree
plot(rxAddInheritance(model.tree),uniform=TRUE)
text(rxAddInheritance(model.tree),digits=2)
title(main="Classification Tree for Mortgage Data",
      sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#-------------------------------------------------------------------
###### - END DEMO HERE - ###########


#
#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# GETTING STARTED
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#---------------------------------------------------------------------
# Execute the following command to install all of the packages needed for the webinar
#install.packages(c("ada","boot","caret","corrplot","doParallel","ellipse",
#                   "ISwR","partykit","pROC","rattle","RColorBrewer",
#                   "rpart","Snowball","ROCR","tm","twitteR","wordcloud"))
#
#----------------------------------------------------------------------
# A first look at R
# A simple regression example from
# Introductory Statistics with R (Statistics and Computing series),
# Peter Dalgaard, Springer 2002
##
library(ISwR)           # Load a library
data()                  # Have a look at what data sets are available
data(thuesen)           # Load thuesen into the environment
thuesen                 # Have a look at it
class(thuesen)          # Find out what kind of object thuesen is
sapply(thuesen,class)   # See what kind of animal each variable is
#
plot(short.velocity ~ blood.glucose, data=thuesen)  # plot the data using the formula interface
#
plot(thuesen$blood.glucose,thuesen$short.velocity)  # plot the data by indexing into the data frame
#
model <- lm(short.velocity ~ blood.glucose, data=thuesen)  # build a linear model
summary(model)  # Look at the results
str(model)      # Look at the structure of the model object
# Build a fancier plot
plot(x=thuesen$blood.glucose,
     y=thuesen$short.velocity,
     xlab="blood glucose (mmol / l)",
     ylab="circumferential shortening velocity (%/s)",
     main="Thuesen Data set",
     col="blue",
     pch=19)
abline(model,col="red")
#
par(mfrow=c(2,2))        # Set up for multiple plots
plot(model, col="blue")  # look at some diagnostics
#---------------------------------------------------------------------
#
# A FIRST LOOK AT FUNCTIONS
#
# Let's create a simple function
joe.stats <- function(data){
  min <- min(data)
  max <- max(data)
  q <- quantile(data,probs=seq(0,1,0.25))
  res <- list(min,max,q)
  return(res)
}
attach(thuesen)            # make the columns of thuesen available
                           # in the global environment as variables
joe.stats(blood.glucose)   # Run our function
summary(blood.glucose)     # R does it better
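# Aside added for this write-up (not part of the webinar script):
# returning a named list makes the result of a function like joe.stats
# easier to use downstream. A minimal sketch:
joe.stats2 <- function(data){
  list(min = min(data),
       max = max(data),
       quartiles = quantile(data, probs=seq(0,1,0.25)))
}
joe.stats2(blood.glucose)$quartiles   # pull out just the quartiles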
# Set up for later
rm(list=ls())
load("WEBINAR_2-14-13_Intro_R_DM_caret .RData")
#--------------------------------------------------------------------------------
# SOME ADDITIONAL ONLINE RESOURCES
#
# An Introduction to R
# Notes on R: A Programming Environment for Data Analysis and Graphics
# Version 2.15.2 (2012-10-26)
# http://cran.r-project.org/doc/manuals/R-intro.pdf
#
# Using R for Data Analysis and Graphics
# Introduction, Code and Commentary
# J H Maindonald
# http://cran.r-project.org/doc/contrib/usingR.pdf


#------------------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
#### BUILD A TREE MODEL WITH RPART AND EVALUATE #####
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#-------------------------------------------------------------------------
# This script divides the weather data into training and test sets,
# builds two different decision trees (rpart) using the training data and
# evaluates their performance on the test data set.
# An ROC curve is produced for the better model.
#------------------------------------------------------------------------
library(rattle)
library(rpart)
library(ROCR)
library(caret)
# -----------------------------------------------------------------------
# Read in the data from disk
# name <- "weather.csv"
# path <- file.path(getwd(),name)
# weather <- read.csv(path,header=TRUE)
# Show weather on the IDE editor
data(weather)   # load the weather data set shipped with rattle
head(weather)
#------------------------------------------------------------------------
# Select variables for the model
weather <- subset(weather,select=c(MinTemp:RainTomorrow))
set.seed(42)   # Set seed
#-------------------------------------------------------------------------
# Determine the observations for the training and test datasets
N <- nrow(weather)                  # 366 observations
train <- sample(N, 0.8*N)           # 292 observations
test <- setdiff(seq_len(N),train)   # 74 observations not in train
#-------------------------------------------------------------------------
# Build the model
M <- ncol(weather)
input <- names(weather)[1:(M-2)]    # names of input variables
target <- "RainTomorrow"            # name of target variable
form <- formula(RainTomorrow ~ .)   # Describe the model to R
tree.m <- rpart(RainTomorrow ~ .,
                data=weather[train, c(input,target)],
                method="class",
                parms=list(split="information"),
                control=rpart.control(usesurrogate=0, maxsurrogate=0))
#---------------------------------------------------------------------------
# Look at the textual description of the tree.
tree.m           # print the model
printcp(tree.m)  # print the table of optimal prunings based on the complexity parameter
#----------------------------------------------------------------------------
# Plot the tree
drawTreeNodes(tree.m)
title(main="Weather Data tree.m",
      sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#----------------------------------------------------------------------------
# Evaluate performance
# Run the tree model on the test set
pred <- predict(tree.m, weather[test, c(input,target)], type="class")
# Reorder the levels to match the documentation for confusionMatrix.
# (Note: assigning with levels(pred) <- c("Yes","No") would relabel the
# values in place and flip the predictions; factor() reorders safely.)
pred <- factor(pred, levels=c("Yes","No"))
# Generate the confusion matrix
actual <- weather[test, c(input,target)]$RainTomorrow
actual <- factor(actual, levels=c("Yes","No"))  # reorder levels the same way
AP <- c("Predicted","Actual")    # dimension names for CM
CM <- table(pred,actual,dnn=AP)  # CM counts
confusionMatrix(CM)              # from the caret package
?confusionMatrix                 # Look at the meaning of the confusionMatrix outputs
# Notes
# The "no-information rate" shown in the output is the largest proportion of the observed classes.
# A one-sided hypothesis test is computed to evaluate whether the overall accuracy rate is greater
# than the rate of the largest class. This is helpful for data sets where there is a large imbalance
# between the classes.
#
# The kappa statistic yields a measure of how well the actual and predicted values agree.
# See http://www.chestx-ray.com/statistics/kappa.html or
# http://en.wikipedia.org/wiki/Cohen%27s_kappa
#
# The null hypothesis for McNemar's chi-squared test is that the actual and predicted
# probabilities are the same.
# See http://en.wikipedia.org/wiki/McNemar%27s_test
#
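# Worked example added for this write-up (not in the original script):
# Cohen's kappa can be computed by hand from the confusion matrix CM,
# kappa = (observed accuracy - expected accuracy) / (1 - expected accuracy).
n <- sum(CM)
obsAcc <- sum(diag(CM))/n                   # observed agreement
expAcc <- sum(rowSums(CM)*colSums(CM))/n^2  # agreement expected by chance
kap <- (obsAcc - expAcc)/(1 - expAcc)
kap   # should match the Kappa value reported by confusionMatrix(CM)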
legend("bottomright", c("tree.m"), col=rainbow(1, 1, .8), lty=1:1, title="Models", inset=c(0.05, 0.05)) # Add decorations to the plot. title(main="ROC Curve weather.csv [test data]", sub=paste(format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"])) # This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,215 @@ #------------------------------------------------------------------------------ # REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING # February 14, 2013 # Joseph B. Rickert # Technical Marketing Manager # # DATA MINING with CARET # # Copyright: Revolution Analytics # This script is licensed under the GPLv2 license # http://www.gnu.org/licenses/gpl-2.0.html #------------------------------------------------------------------------------ # INTRODUCTION TO THE CARET PACKAGE # caret is a feature rich package for doing data mining in R. # This script explores caret's capabilities using data included in the # package that was described in the paper: # Hill et al "Impact of image segmentation on high-content # screening data quality for SK-BR-3 cells" # BMC fioinformatics (2007) vol 8 (1) pp. 340 # # Background # Well-segmented cells are cells for which location and size may be accurrately detremined # through optical measurements. Cells that are not Well-segmented (WS) are said to be # Poorly-segmented (PS). # # Problem # Given a set of optical measurements can we predict which cells will be PS? # This is a classic classification problem #--------------------------- library(ada) # Boosting algorithms library(caret) library(rpart) # CART algorithm for decision trees library(partykit) # Plotting trees library(doParallel) # parallel processing # by default # Multicore functionality on Unix (single machine only) # Snow functionality on Windows (cluster) library(pROC) # plot the ROC curve library(corrplot) # plot correlations #--------------------------- # data(package="caret") data(segmentationData) # Load the segmentation data set dim(segmentationData) head(segmentationData) # Have a look at the data #[1] 2019 61 trainIndex <- createDataPartition(segmentationData$Case,p=.5,list=FALSE) trainData <- segmentationData[trainIndex,] dim(trainData) #1010 61 testData <- segmentationData[-trainIndex,] dim(testData) #1009 61 #------------------------------------------------------------------------------------- # VISUALIZE CORRELATIONS trainV <- trainData[,4:61] corrplot(cor(trainV),order="hclust",tl.cex=.5,method="ellipse") #----------------------------------------------------------------- # BUILD AN ADABOOST MODEL WITH ADA form <- formula(Class ~ .) control <- rpart.control(maxdepth=30, # the maximum depth of any node of the final tree cp = 0.01, # complexity parameter. Any split that does not decrease the overall lack of fit by a factor of cp is not attempted. minsplit=20, # the minimum number of observations that must exist in a node in order for a split to be attempted xval=10) # number of cross-validations ada.model <- ada(formula=form, data=trainData, control=control, nu = .01, # shrinkage parameter for boosting iter=50) ada.model$model[[1]] # Look at the trees in the model ada.model # look at the model performance plot(ada.model,TRUE) # Plot error rate vs. 
#-----------------------------------------------------------------
# BUILD AN ADABOOST MODEL WITH ADA
form <- formula(Class ~ .)
control <- rpart.control(maxdepth=30,  # the maximum depth of any node of the final tree
                         cp=0.01,      # complexity parameter: any split that does not decrease
                                       # the overall lack of fit by a factor of cp is not attempted
                         minsplit=20,  # the minimum number of observations that must exist
                                       # in a node in order for a split to be attempted
                         xval=10)      # number of cross-validations
ada.model <- ada(formula=form,
                 data=trainData,
                 control=control,
                 nu=.01,               # shrinkage parameter for boosting
                 iter=50)
ada.model$model[[1]]   # Look at the first tree in the model
ada.model              # look at the model performance
plot(ada.model,TRUE)   # Plot error rate vs. iterations of the model
varplot(ada.model)     # Variable importance plot
#----------------------------------------------------------------------
# FIND THE "BEST" MODEL
#
# This is an interesting model, but how do you select the best values
# for the three tuning parameters?
#   nu
#   iter
#   maxdepth
#---------------------------------------------------------------------------------
# Algorithm for training the model:
#   for each resampled data set do
#     hold out some samples
#     for each combination of the three tuning parameters do
#       fit the model on the resampled data set
#       predict the values of Class on the hold-out samples
#     end
#   end
#   calculate the AUC (the area under the ROC curve) for each sample
#   select the combination of tuning parameters that yields the best AUC
#
# caret provides the "train" function to do all of this.
#
# The trainControl function sets the training method.
# Note that the default method of picking the best model uses accuracy and Cohen's kappa.
#
#-----------------------------------------------------------------------------------
# Set up the parameters to run the boosting function
ctrl <- trainControl(method="repeatedcv",             # use repeated cross validation
                     number=5,                        # the number of folds
                     repeats=2,                       # do 2 repetitions of 5-fold cv
                     summaryFunction=twoClassSummary, # Use AUC to pick the best model
                     classProbs=TRUE)
# Use expand.grid to specify the search space.
# Note that the default search grid selects 3 values of each tuning parameter.
grid <- expand.grid(.nu=c(.1,1),
                    .iter=c(20,50),
                    .maxdepth=c(20,30))
#
set.seed(1)
#names(trainData)
trainX <- trainData[,4:61]
#-----------------------------------------------------------------
# PARALLEL COMPUTING
# vignette("gettingstartedParallel")
cl <- makeCluster(4)   # Use this to manually create a cluster.
                       # But, since I only have a single Windows machine,
                       # all I am doing is passing the number of cores
                       # to use to registerDoParallel()
registerDoParallel(cl) # Register a parallel backend for train
getDoParWorkers()
system.time(ada.tune <- train(x=trainX, y=trainData$Class,
                              method="ada",
                              metric="ROC",
                              trControl=ctrl,
                              control=control,
                              tuneGrid=grid))
#
stopCluster(cl)
#    user  system elapsed
#   14.33    0.02  206.25
#-------------------------------------------------------------------------------
# ADA RESULTS
ada.tune             # Look at the results for the training grid
ada.tune$finalModel  # Look at the performance of the final model
plot(ada.tune)       # Plot the performance of the training models
#--------------------------------------------------------------------------------
# ADA PREDICTIONS
testX <- testData[,4:61]
ada.pred <- predict(ada.tune,testX)
#
confusionMatrix(ada.pred,testData$Class)
#-----------------------------------------------------------------
# DRAW THE ROC CURVE
# Use the roc function from the pROC package
ada.probs <- predict(ada.tune,testX,type="prob")
ada.ROC <- roc(predictor=ada.probs$PS,
               response=testData$Class,
               levels=rev(levels(testData$Class)))
plot(ada.ROC,col=2)
ada.ROC$auc   # Get the area under the curve
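# Aside added for this write-up (not in the original script): pROC can
# also attach a confidence interval to the AUC estimate above.
ci.auc(ada.ROC)   # 95% confidence interval for the area under the curve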
#------------------------------------------------------------------------------------
#
# SUPPORT VECTOR MACHINE MODEL
#
set.seed(1)
registerDoParallel(4,cores=4)
getDoParWorkers()
system.time(
  svm.tune <- train(x=trainX,
                    y=trainData$Class,
                    method="svmRadial",
                    tuneLength=5,    # 5 values of the cost function
                    preProc=c("center","scale"),
                    metric="ROC",
                    trControl=ctrl)  # same as for ada above
)
#    user  system elapsed
#    2.40    0.14   26.10
#--------------------------------------------------------------
# SVM RESULTS
svm.tune             # Look at the results for the training grid
svm.tune$finalModel  # Look at the performance of the final model
plot(svm.tune,
     metric="ROC",
     scales=list(x=list(log=2)))
#---------------------------------------------------------------
# SVM PREDICTIONS
svm.pred <- predict(svm.tune,testX)
confusionMatrix(svm.pred,testData$Class)
#
#----------------------------------------------------------------
# COMPARE THE SVM AND ADA MODELS USING RESAMPLING
#
# Because we set the same seed before running the models, we can compare the models using resampling.
# See Hothorn et al., "The design and analysis of benchmark experiments",
# Journal of Computational and Graphical Statistics (2005) vol 14 (3) pp 675-699,
# for comparing models using resampling.
#
# The resamples function in caret collates the resampling results from the two models
rValues <- resamples(list(svm=svm.tune,ada=ada.tune))
rValues$values    # Look at the resample values
summary(rValues)  # Summarize the resamples
#---------------------------------------------
xyplot(rValues,metric="ROC")    # scatter plot
bwplot(rValues,metric="ROC")    # boxplot
parallel(rValues,metric="ROC")  # parallel plot
dotplot(rValues,metric="ROC")   # dotplot
#
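# Aside added for this write-up (not in the original script): caret can
# also test the pairwise differences between the resampled ROC values.
diffValues <- diff(rValues)
summary(diffValues)   # paired comparison of the svm and ada models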

##############################################################################
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# ROLL with RATTLE
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#################################################################################
#
library(rattle)            # load the rattle package
rattle()                   # start the rattle user interface
#
#data()                    # see what data sets are available in all of the loaded packages
data(package="rattle")     # see what data sets are available in rattle
ls("package:rattle")       # see what functions are in the rattle package
#lsf.str("package:rattle") # see what functions are in rattle
#
# THE FOLLOWING INSTRUCTIONS SHOULD BE HELPFUL FOR EXPLORING THE RATTLE GUI
#
# LOAD THE WEATHER DATA SET
# The weather data set consists of observations made at a weather monitoring station
# in Canberra, Australia. Each observation describes the weather conditions on a particular day.
# See page 25 of Graham Williams' Data Mining with Rattle and R: The Art of Excavating
# Data for Knowledge Discovery, Springer 2011.
#
# Go to the Data tab and click on Execute.
# Rattle will ask if you want to use the weather data as default. Click yes.
#
# SUMMARY STATISTICS
# Go to the Explore tab
# Select Summary and Basics
# Hit Execute
#
# SCATTER PLOTS
# Go to the Explore tab
# Select Distributions
# Click on Execute
#
# LOOK AT A SINGLE VARIABLE
# Go to the Explore tab
# Select RainTomorrow Bar Plot
# Hit Execute
# This produces a bar plot of the target variable RainTomorrow:
# 84% of the observations have no rain, so
# a model that always predicts no rain should be about 84% accurate.
#
# INVESTIGATE MULTIPLE VARIABLES
# Go to the Explore tab
# In the upper panel select Box Plot and Histogram for both
#   MaxTemp
#   Sunshine
# Click Execute
#
# Boxplots top left: temperature is generally higher the day before it rains
# Boxplots top right: less sunshine the day before it rains
#
# CORRELATIONS
# Go to the Explore tab
# Unselect any variables that may be selected
# Select Correlation
# Click on Execute
#
#
# INTERACTIVELY EXPLORE DATA
# Select Interactive and then Latticist
# In the bottom center panel
#   select MaxTemp for the y axis and
#   select Sunshine for the x axis
# Place the crosshair on an outlier and right click
#
# BUILD A TREE MODEL
# Go to the Model tab
# Select Tree
# Click Execute
# Click on the Draw button to get the graph
# Click on the Rules button to see the rules
# Select the Log tab to look at the R code
#
# EVALUATE THE MODEL
# Go to the Evaluate tab
# Select
#   Type  = Error Matrix
#   Model = Tree
#   Data  = Testing
# Click on Execute
#
# Error matrix for the Decision Tree model on weather.csv [test] (counts):
#
#          Predicted
# Actual   No  Yes
# No       35    6    False positive rate = FP/N = 6/(35+6) = .146
#                     = negatives incorrectly classified / total negatives
# Yes       5   10    True positive rate = TP/P = 10/(10+5) = .667
#                     = positives correctly classified / total positives
#                     = sensitivity = recall = hit rate
#
# True negative rate = TN/(FP+TN) = 1 - FP rate = .854 = specificity
#
# False positives = 6 = Type I error  (test rejects a true null hypothesis)
# False negatives = 5 = Type II error (test fails to reject a false null hypothesis)
#
# Error matrix for the Decision Tree model on weather.csv [test] (%):
#
#          Predicted
# Actual   No  Yes
# No       62   11    In 62% (35/56) of the cases the model predicts no rain and it doesn't rain
# Yes       9   18    In 18% (10/56) of the cases the model predicts rain and it does rain
#
# Accuracy of the test = 62% + 18% = 80%
#
# Overall error: 0.1964286
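# Worked check added for this write-up: the rates quoted above can be
# recomputed in R from the counts in the error matrix.
EM <- matrix(c(35,6,5,10), nrow=2, byrow=TRUE,
             dimnames=list(Actual=c("No","Yes"), Predicted=c("No","Yes")))
fpr <- EM["No","Yes"]/sum(EM["No",])    # 6/41  = 0.146
tpr <- EM["Yes","Yes"]/sum(EM["Yes",])  # 10/15 = 0.667
acc <- sum(diag(EM))/sum(EM)            # 45/56 = 0.804, overall error = 0.196
c(fpr=fpr, tpr=tpr, accuracy=acc)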

#------------------------------------------------------------
# REVOLUTION ANALYTICS WEBINAR: INTRODUCTION TO R FOR DATA MINING
# February 14, 2013
# Joseph B. Rickert
# Technical Marketing Manager
#
# JUST FOR FUN - BUILD A WORD CLOUD
#
# Copyright: Revolution Analytics
# This script is licensed under the GPLv2 license
# http://www.gnu.org/licenses/gpl-2.0.html
#
# From the example at RDataMining,
# http://www.rdatamining.com/examples/text-mining
# which shows an example of text mining Twitter data.
#-----------------------------------------------------------------------------
# Load the necessary libraries
library(twitteR)       # twitteR provides access to Twitter data
library(tm)            # tm provides functions for text mining
library(Snowball)      # wrappers for the Weka Java stemming functions
library(wordcloud)     # wordcloud visualizes the result with a word cloud
library(RColorBrewer)  # provides the color palettes
#------------------------------------------------------------------------------
# Retrieve up to 100 tweets tagged #rstats.
# (The original RDataMining example pulled tweets from the @rdatamining user timeline.)
Tweets <- searchTwitter("#rstats",n=100)
n <- length(Tweets)
# Tweets[1:3]
#
#-------------------------------------------------------------------------------
# Transforming the text
# The tweets are first converted to a data frame and then to a corpus.
df <- do.call("rbind", lapply(Tweets, as.data.frame))
#dim(df)
# Just in case Twitter is off-line:
#df <- read.csv("UseRTweets.csv",header=TRUE,row.names=1)
#head(df)
#
# Build a corpus, which is a collection of text documents.
# VectorSource specifies that the source is character vectors.
myCorpus <- Corpus(VectorSource(df$text))
# After that, the corpus needs a couple of transformations, including
# changing letters to lower case and
# removing punctuation, numbers and stop words.
# The general English stop-word list is tailored by
# adding "available" and "via" and removing "r".
myCorpus <- tm_map(myCorpus, tolower)            # lower case
myCorpus <- tm_map(myCorpus, removePunctuation)  # remove punctuation
myCorpus <- tm_map(myCorpus, removeNumbers)      # remove numbers
# keep "r" by removing it from the stopwords
myStopwords <- c(stopwords('english'), "available", "via")
idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
#----------------------------------------------------------------------------
# Stemming words
# In many cases, words need to be stemmed to retrieve their radicals.
# For instance, "example" and "examples" are both stemmed to "exampl".
# However, after that, one may want to complete the stems to their original
# forms, so that the words would look "normal".
dictCorpus <- myCorpus
# Stem words in a text document with the snowball stemmers,
# which requires the packages Snowball, RWeka, rJava and RWekajars.
myCorpus <- tm_map(myCorpus, stemDocument)
#inspect(myCorpus[1:3])   # inspect the first three ``documents''
#myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)   # stem completion
#
#
#inspect(myCorpus[1:3])   # print the first three documents in the built corpus
#----------------------------------------------------------------------------------------
# Building a document-term matrix
myDtm <- TermDocumentMatrix(myCorpus, control=list(minWordLength=1))
# inspect(myDtm[266:270,31:40])
#
# Frequent terms and associations
# findFreqTerms(myDtm, lowfreq=10)
# findAssocs(myDtm, 'analytics', 0.30)   # which words are associated with "analytics"?
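# Aside added for this write-up (not in the original script): a large
# term-document matrix can be thinned before plotting. removeSparseTerms()
# drops terms absent from most documents; the 0.98 threshold here is an
# arbitrary value chosen for illustration.
myDtm.small <- removeSparseTerms(myDtm, sparse=0.98)
dim(myDtm.small)   # compare with dim(myDtm)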
#-----------------------------------------------------------------------------------------
# Build the word cloud
# After building a document-term matrix, we can show the importance of
# words with a word cloud (also known as a tag cloud).
m <- as.matrix(myDtm)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
# Plot the word cloud
pal <- brewer.pal(6,"Dark2")
pal <- pal[-(1)]   # drop the first color from the palette
wordcloud(words=d$word, freq=d$freq,
          scale=c(4,1),
          min.freq=2,
          random.order=TRUE,
          random.color=TRUE,   # random colors drawn from pal
          rot.per=.15,
          colors=pal)