Created
May 24, 2016 16:19
-
-
Save jilmun/d2a7e6d053106f9c951d2e970edcb8e9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create dummy data -------------------------------------------------------
# Fixed seed so the random draws are reproducible. Note: R 3.6.0 changed the
# default algorithm used by sample(), so a current R may print different
# values than the output recorded below.
set.seed(1)
d <- data.frame(
  col1 = sample(letters[1:3], 10, replace = TRUE),
  col2 = sample(letters[24:26], 10, replace = TRUE),
  col3 = runif(10) * 10,
  stringsAsFactors = FALSE
)
d$col1 <- as.factor(d$col1)   # col1 becomes a factor; col2 stays character
d$col4 <- d$col3 + runif(10)  # col4 is col3 plus uniform noise from runif()
d
str(d)
# col1 col2 col3 col4 | |
# 1 a x 9.3470523 9.8291324 | |
# 2 b x 2.1214252 2.7209910 | |
# 3 b z 6.5167377 7.0102790 | |
# 4 c y 1.2555510 1.4417686 | |
# 5 a z 2.6722067 3.4995800 | |
# 6 c y 3.8611409 4.5296077 | |
# 7 c z 0.1339033 0.9281432 | |
# 8 b z 3.8238796 3.9318232 | |
# 9 b y 8.6969085 9.4206194 | |
# 10 a z 3.4034900 3.8147644 | |
# 'data.frame': 10 obs. of 4 variables: | |
# $ col1: Factor w/ 3 levels "a","b","c": 1 2 2 3 1 3 3 2 2 1 | |
# $ col2: chr "x" "x" "z" "y" ... | |
# $ col3: num 9.35 2.12 6.52 1.26 2.67 ... | |
# $ col4: num 9.83 2.72 7.01 1.44 3.5 ... | |
# data.matrix -------------------------------------------------------------
# data.matrix converts a data frame into a numeric matrix.
# Factor columns are replaced by their integer codes. Character columns give
# NA in R < 4.0.0 (as in the output recorded below); from R 4.0.0 on they are
# first converted to factors and then to integer codes.
data.matrix(d)
# col1 col2 col3 col4 | |
# [1,] 1 NA 9.3470523 9.8291324 | |
# [2,] 2 NA 2.1214252 2.7209910 | |
# [3,] 2 NA 6.5167377 7.0102790 | |
# [4,] 3 NA 1.2555510 1.4417686 | |
# [5,] 1 NA 2.6722067 3.4995800 | |
# [6,] 3 NA 3.8611409 4.5296077 | |
# [7,] 3 NA 0.1339033 0.9281432 | |
# [8,] 2 NA 3.8238796 3.9318232 | |
# [9,] 2 NA 8.6969085 9.4206194 | |
# [10,] 1 NA 3.4034900 3.8147644 | |
# model.matrix ------------------------------------------------------------
# One-sided formula with a numeric predictor: an intercept column plus col3.
model.matrix(~ col3, data = d)
# (Intercept) col3 | |
# 1 1 9.3470523 | |
# 2 1 2.1214252 | |
# 3 1 6.5167377 | |
# 4 1 1.2555510 | |
# 5 1 2.6722067 | |
# 6 1 3.8611409 | |
# 7 1 0.1339033 | |
# 8 1 3.8238796 | |
# 9 1 8.6969085 | |
# 10 1 3.4034900 | |
# attr(,"assign") | |
# [1] 0 1 | |
model.matrix(col4 ~ col3, data = d) # left side is ignored
# (Intercept) col3 | |
# 1 1 9.3470523 | |
# 2 1 2.1214252 | |
# 3 1 6.5167377 | |
# 4 1 1.2555510 | |
# 5 1 2.6722067 | |
# 6 1 3.8611409 | |
# 7 1 0.1339033 | |
# 8 1 3.8238796 | |
# 9 1 8.6969085 | |
# 10 1 3.4034900 | |
# attr(,"assign") | |
# [1] 0 1 | |
# With an intercept, the first level of each factor (col1a, col2x) gets no
# column of its own; it is absorbed into the intercept.
model.matrix(~ col1 + col2, data = d)
# (Intercept) col1b col1c col2y col2z | |
# 1 1 0 0 0 0 | |
# 2 1 1 0 0 0 | |
# 3 1 1 0 0 1 | |
# 4 1 0 1 1 0 | |
# 5 1 0 0 0 1 | |
# 6 1 0 1 1 0 | |
# 7 1 0 1 0 1 | |
# 8 1 1 0 0 1 | |
# 9 1 1 0 1 0 | |
# 10 1 0 0 0 1 | |
# attr(,"assign") | |
# [1] 0 1 1 2 2 | |
# attr(,"contrasts") | |
# attr(,"contrasts")$col1 | |
# [1] "contr.treatment" | |
# | |
# attr(,"contrasts")$col2 | |
# [1] "contr.treatment" | |
# "+0" or "-1" drops the intercept: col1 then gets a column for every level,
# while col2 still omits its first level (col2x).
model.matrix(~ col1 + col2 - 1, data = d)
# col1a col1b col1c col2y col2z | |
# 1 1 0 0 0 0 | |
# 2 0 1 0 0 0 | |
# 3 0 1 0 0 1 | |
# 4 0 0 1 1 0 | |
# 5 1 0 0 0 1 | |
# 6 0 0 1 1 0 | |
# 7 0 0 1 0 1 | |
# 8 0 1 0 0 1 | |
# 9 0 1 0 1 0 | |
# 10 1 0 0 0 1 | |
# attr(,"assign") | |
# [1] 1 1 1 2 2 | |
# attr(,"contrasts") | |
# attr(,"contrasts")$col1 | |
# [1] "contr.treatment" | |
# | |
# attr(,"contrasts")$col2 | |
# [1] "contr.treatment" | |
# "." expands to every column of d except the response (col4); "- col3"
# then removes col3 and "+ 0" drops the intercept: same result as above.
model.matrix(col4 ~ . + 0 - col3, data = d)
# col1a col1b col1c col2y col2z | |
# 1 1 0 0 0 0 | |
# 2 0 1 0 0 0 | |
# 3 0 1 0 0 1 | |
# 4 0 0 1 1 0 | |
# 5 1 0 0 0 1 | |
# 6 0 0 1 1 0 | |
# 7 0 0 1 0 1 | |
# 8 0 1 0 0 1 | |
# 9 0 1 0 1 0 | |
# 10 1 0 0 0 1 | |
# attr(,"assign") | |
# [1] 1 1 1 2 2 | |
# attr(,"contrasts") | |
# attr(,"contrasts")$col1 | |
# [1] "contr.treatment" | |
# | |
# attr(,"contrasts")$col2 | |
# [1] "contr.treatment" | |
# one hot encoding --------------------------------------------------------
# Contrast function that always produces full indicator (one-hot) coding.
#
# n         number of levels, or a vector of level names.
# contrasts accepted so the signature matches the built-in contr.* functions,
#           but ignored: 'contrasts = FALSE' is always passed on.
# sparse    forwarded to contr.sum().
#
# Returns the identity matrix with one row and one column per level, which is
# what contr.sum() yields when called with 'contrasts = FALSE'.
contr.onehot <- function(n, contrasts, sparse = FALSE) {
  contr.sum(n = n, contrasts = FALSE, sparse = sparse)
}
# set options
# The two entries name the contrast function for unordered and for ordered
# factors. This is a session-wide setting that affects every later model
# call; R's defaults are c("contr.treatment", "contr.poly").
options(contrasts = c("contr.onehot", "contr.onehot"))
# use model.matrix function
model.matrix(~ col1 + col2 - 1, data = d)
# col1a col1b col1c col2x col2y col2z | |
# 1 1 0 0 1 0 0 | |
# 2 0 1 0 1 0 0 | |
# 3 0 1 0 0 0 1 | |
# 4 0 0 1 0 1 0 | |
# 5 1 0 0 0 0 1 | |
# 6 0 0 1 0 1 0 | |
# 7 0 0 1 0 0 1 | |
# 8 0 1 0 0 0 1 | |
# 9 0 1 0 0 1 0 | |
# 10 1 0 0 0 0 1 | |
# attr(,"assign") | |
# [1] 1 1 1 2 2 2 | |
# attr(,"contrasts") | |
# attr(,"contrasts")$col1 | |
# [1] "contr.onehot" | |
# | |
# attr(,"contrasts")$col2 | |
# [1] "contr.onehot" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment