Skip to content

Instantly share code, notes, and snippets.

@smc77
Created October 26, 2011 01:45

Revisions

  1. smc77 revised this gist Nov 1, 2011. 1 changed file with 5 additions and 5 deletions.
    10 changes: 5 additions & 5 deletions logistic_regression.R
    Original file line number Diff line number Diff line change
    @@ -8,11 +8,11 @@ sa.heart <- read.table("http://www-stat.stanford.edu/~tibs/ElemStatLearn/dataset
    # Pretty plot
    pairs(sa.heart[1:9],pch=21,bg=c("red","green")[factor(sa.heart$chd)])

    # Apply logistic regression to South African heart data from ESL
    sa.heart.fit <- glm(chd ~ ., family = binomial, data=sa.heart)
    summary(sa.heart.fit)

    # Use stepwise logistic regression to reduce the dimensions
    library(MASS)
    sa.heart.step <- stepAIC(sa.heart)
    sa.heart.step <- stepAIC(glm(chd ~ ., data=sa.heart))
    summary(sa.heart.step)

    # Apply logistic regression to South African heart data from ESL
    sa.heart.fit <- glm(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , family = binomial, data=sa.heart)
    summary(sa.heart.fit)
  2. smc77 revised this gist Oct 28, 2011. 1 changed file with 0 additions and 70 deletions.
    70 changes: 0 additions & 70 deletions logistic_regression.R
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,3 @@

    # Plot the sigmoid function
    library(ggplot2)
    qplot(-10:10, 1/(1 + exp(-(-10:10))), geom="line", xlab="z", ylab="sigmoid function")
    @@ -9,75 +8,6 @@ sa.heart <- read.table("http://www-stat.stanford.edu/~tibs/ElemStatLearn/dataset
    # Pretty plot
    pairs(sa.heart[1:9],pch=21,bg=c("red","green")[factor(sa.heart$chd)])

    num.iterations <- 1000

    # Download South African heart disease data
    sa.heart <- read.table("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data", sep=",",head=T,row.names=1)

    x <- sa.heart[,c("age", "ldl")]
    y <- sa.heart$chd
    plot(x, pch=21, bg=c("red","green")[factor(y)])

    # Function to standardize input values
    zscore <- function(x, mean.val=NA) {
    if(is.matrix(x)) return(apply(x, 2, zscore, mean.val=mean.val))
    if(is.data.frame(x)) return(data.frame(apply(x, 2, zscore, mean.val=mean.val)))
    if(is.na(mean.val)) mean.val <- mean(x)
    sd.val <- sd(x)
    if(all(sd.val == 0)) return(x) # if all the values are the same
    (x - mean.val) / sd.val
    }

    # Standardize the features
    x.scaled <- zscore(x)

    # Gradient descent function
    grad <- function(x, y, theta) {
    gradient <- (1 / nrow(y)) * (t(x) %*% (1/(1 + exp(-x %*% t(theta))) - y))
    return(t(gradient))
    }

    gradient.descent <- function(x, y, alpha=0.1, num.iterations=500, threshold=1e-5, output.path=FALSE) {

    # Add x_0 = 1 as the first column
    m <- if(is.vector(x)) length(x) else nrow(x)
    if(is.vector(x) || (!all(x[,1] == 1))) x <- cbind(rep(1, m), x)
    if(is.vector(y)) y <- matrix(y)
    x <- apply(x, 2, as.numeric)

    num.features <- ncol(x)

    # Initialize the parameters
    theta <- matrix(rep(0, num.features), nrow=1)

    # Look at the values over each iteration
    theta.path <- theta
    for (i in 1:num.iterations) {
    theta <- theta - alpha * grad(x, y, theta)
    if(all(is.na(theta))) break
    theta.path <- rbind(theta.path, theta)
    if(i > 2) if(all(abs(theta - theta.path[i-1,]) < threshold)) break
    }

    if(output.path) return(theta.path) else return(theta.path[nrow(theta.path),])
    }

    unscaled.theta <- gradient.descent(x=x, y=y, num.iterations=num.iterations, output.path=TRUE)
    scaled.theta <- gradient.descent(x=x.scaled, y=y, num.iterations=num.iterations, output.path=TRUE)

    summary(glm(chd ~ age + ldl, family = binomial, data=sa.heart))

    qplot(1:(nrow(scaled.theta)), scaled.theta[,1], geom=c("line"), xlab="iteration", ylab="theta_1")
    qplot(1:(nrow(scaled.theta)), scaled.theta[,2], geom=c("line"), xlab="iteration", ylab="theta_2")

    # Look at output for various different alpha values
    vary.alpha <- lapply(c(1e-12, 1e-9, 1e-7, 1e-3, 0.1, 0.9), function(alpha) gradient.descent(x=x.scaled, y=y, alpha=alpha, num.iterations=num.iterations, output.path=TRUE))

    par(mfrow = c(2, 3))
    for (j in 1:6) {
    plot(vary.alpha[[j]][,2], ylab="area (alpha=1e-9)", xlab="iteration", type="l")
    }

    # Use stepwise logistic regression to reduce the dimensions
    library(MASS)
    sa.heart.step <- stepAIC(sa.heart)
  3. smc77 revised this gist Oct 28, 2011. 1 changed file with 75 additions and 15 deletions.
    90 changes: 75 additions & 15 deletions logistic_regression.R
    Original file line number Diff line number Diff line change
    @@ -1,28 +1,88 @@

    # Plot the sigmoid function
    library(ggplot2)
    qplot(-10:10, 1/(1 + exp(-(-10:10))), geom="line", xlab="z", ylab="sigmoid function")

    # Apply logistic regression to South African heart data from ESL
    sa.heart <- glm(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , family = binomial , data=hr)
    summary(sa.heart)

    # Download South African heart disease data
    hr <- read.table("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data", sep=",",head=T,row.names=1)
    sa.heart <- read.table("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data", sep=",",head=T,row.names=1)

    # Pretty plot
    pairs(hr[1:9],pch=21,bg=c("red","green")[factor(hr$chd)])
    pairs(sa.heart[1:9],pch=21,bg=c("red","green")[factor(sa.heart$chd)])

    num.iterations <- 1000

    # Download South African heart disease data
    sa.heart <- read.table("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data", sep=",",head=T,row.names=1)

    x <- sa.heart[,c("age", "ldl")]
    y <- sa.heart$chd
    plot(x, pch=21, bg=c("red","green")[factor(y)])

    # Function to standardize input values
    zscore <- function(x, mean.val=NA) {
    if(is.matrix(x)) return(apply(x, 2, zscore, mean.val=mean.val))
    if(is.data.frame(x)) return(data.frame(apply(x, 2, zscore, mean.val=mean.val)))
    if(is.na(mean.val)) mean.val <- mean(x)
    sd.val <- sd(x)
    if(all(sd.val == 0)) return(x) # if all the values are the same
    (x - mean.val) / sd.val
    }

    # Standardize the features
    x.scaled <- zscore(x)

    # Gradient descent function
    grad <- function(x, y, theta) {
    gradient <- (1 / nrow(y)) * (t(x) %*% (1/(1 + exp(-x %*% t(theta))) - y))
    return(t(gradient))
    }

    # Scale data and create a random train / test split
    n <- nrow(hr)
    p <- ncol(hr)-1
    test.ratio <- .2 # ratio of test/train samples
    n.test <- ceiling(n*test.ratio)
    testi <- sample(1:n,n.test)
    traini <- setdiff(1:n,testi)
    gradient.descent <- function(x, y, alpha=0.1, num.iterations=500, threshold=1e-5, output.path=FALSE) {

    data.train <- hr[traini,]
    data.test <- hr[testi,]
    # Add x_0 = 1 as the first column
    m <- if(is.vector(x)) length(x) else nrow(x)
    if(is.vector(x) || (!all(x[,1] == 1))) x <- cbind(rep(1, m), x)
    if(is.vector(y)) y <- matrix(y)
    x <- apply(x, 2, as.numeric)

    num.features <- ncol(x)

    # Initialize the parameters
    theta <- matrix(rep(0, num.features), nrow=1)

    # Look at the values over each iteration
    theta.path <- theta
    for (i in 1:num.iterations) {
    theta <- theta - alpha * grad(x, y, theta)
    if(all(is.na(theta))) break
    theta.path <- rbind(theta.path, theta)
    if(i > 2) if(all(abs(theta - theta.path[i-1,]) < threshold)) break
    }

    if(output.path) return(theta.path) else return(theta.path[nrow(theta.path),])
    }

    unscaled.theta <- gradient.descent(x=x, y=y, num.iterations=num.iterations, output.path=TRUE)
    scaled.theta <- gradient.descent(x=x.scaled, y=y, num.iterations=num.iterations, output.path=TRUE)

    summary(glm(chd ~ age + ldl, family = binomial, data=sa.heart))

    qplot(1:(nrow(scaled.theta)), scaled.theta[,1], geom=c("line"), xlab="iteration", ylab="theta_1")
    qplot(1:(nrow(scaled.theta)), scaled.theta[,2], geom=c("line"), xlab="iteration", ylab="theta_2")

    # Look at output for various different alpha values
    vary.alpha <- lapply(c(1e-12, 1e-9, 1e-7, 1e-3, 0.1, 0.9), function(alpha) gradient.descent(x=x.scaled, y=y, alpha=alpha, num.iterations=num.iterations, output.path=TRUE))

    par(mfrow = c(2, 3))
    for (j in 1:6) {
    plot(vary.alpha[[j]][,2], ylab="area (alpha=1e-9)", xlab="iteration", type="l")
    }

    # Use stepwise logistic regression to reduce the dimensions
    library(MASS)
    sa.heart.step <- stepAIC(sa.heart)
    summary(sa.heart.step)

    # Apply logistic regression to South African heart data from ESL
    sa.heart.fit <- glm(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , family = binomial, data=sa.heart)
    summary(sa.heart.fit)
  4. smc77 revised this gist Oct 27, 2011. 1 changed file with 18 additions and 1 deletion.
    19 changes: 18 additions & 1 deletion logistic_regression.R
    Original file line number Diff line number Diff line change
    @@ -5,7 +5,24 @@ qplot(-10:10, 1/(1 + exp(-(-10:10))), geom="line", xlab="z", ylab="sigmoid funct
    sa.heart <- glm(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , family = binomial , data=hr)
    summary(sa.heart)

    # Download South African heart disease data
    hr <- read.table("http://www-stat.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data", sep=",",head=T,row.names=1)

    # Pretty plot
    pairs(hr[1:9],pch=21,bg=c("red","green")[factor(hr$chd)])

    # Scale data and create a random train / test split
    n <- nrow(hr)
    p <- ncol(hr)-1
    test.ratio <- .2 # ratio of test/train samples
    n.test <- ceiling(n*test.ratio)
    testi <- sample(1:n,n.test)
    traini <- setdiff(1:n,testi)

    data.train <- hr[traini,]
    data.test <- hr[testi,]

    # Use stepwise logistic regression to reduce the dimensions
    library(MASS)
    sa.heart.step <- stepAIC(sa.heart)
    summary(sa.heart.step)
    summary(sa.heart.step)
  5. smc77 revised this gist Oct 26, 2011. 1 changed file with 4 additions and 1 deletion.
    5 changes: 4 additions & 1 deletion logistic_regression.R
    Original file line number Diff line number Diff line change
    @@ -1,8 +1,11 @@
    # Plot the sigmoid function
    qplot(-10:10, 1/(1 + exp(-(-10:10))), geom="line", xlab="z", ylab="sigmoid function")

    # Apply logistic regression to South African heart data from ESL
    sa.heart <- glm(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , family = binomial , data=hr)

    summary(sa.heart)

    # Use stepwise logistic regression to reduce the dimensions
    library(MASS)
    sa.heart.step <- stepAIC(sa.heart)
    summary(sa.heart.step)
  6. smc77 revised this gist Oct 26, 2011. 1 changed file with 4 additions and 0 deletions.
    4 changes: 4 additions & 0 deletions logistic_regression.R
    Original file line number Diff line number Diff line change
    @@ -2,3 +2,7 @@
    sa.heart <- glm(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , family = binomial , data=hr)

    summary(sa.heart)

    library(MASS)
    sa.heart.step <- stepAIC(sa.heart)
    summary(sa.heart.step)
  7. smc77 created this gist Oct 26, 2011.
    4 changes: 4 additions & 0 deletions logistic_regression.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,4 @@
    # Apply logistic regression to South African heart data from ESL
    sa.heart <- glm(chd ~ sbp + tobacco + ldl + famhist + obesity + alcohol + age , family = binomial , data=hr)

    summary(sa.heart)