Created
July 10, 2024 19:01
-
-
Save DannyArends/5fb5258d3d524c3f0bb271c6603197c2 to your computer and use it in GitHub Desktop.
Code for the YouTube lecture "Principal Component Analysis from Scratch" (https://www.youtube.com/live/y9zS7Xm0iEU)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Unit-Variance Scaling or Autoscaling | |
autoscale <- function(X) { | |
return(apply(X,2, function(x){ (x- mean(x)) / sd(x) })) | |
} | |
# Compute the covariance matrix | |
covariance <- function(X) { | |
cM <- matrix(NA, ncol(X), ncol(X), dimnames = list(colnames(X), colnames(X))) | |
for(x in colnames(X)) { | |
for(y in colnames(X)) { | |
cM[x,y] <- (sum((X[,x] - mean(X[,x]))*(X[,y] - mean(X[,y])))) / (nrow(X)-1) | |
} | |
} | |
return(cM) | |
} | |
# Compute eigen vectors through QR decomposition | |
eigenvectors <- function(X, n.iter = 100) { | |
pQ <- diag(1, ncol(X)); # Set pQ as the Identity matrix | |
for (i in 1:n.iter) { | |
d <- qr(X); # QR decomposition | |
Q <- qr.Q(d); # Reconstruct the Q matrix | |
pQ <- pQ %*% Q; # Update the product of Q | |
X <- qr.R(d) %*% Q; # Reconstruct X from the R matrix multiplied by Q | |
} | |
return(list("values" = diag(X), "vectors" = pQ)) | |
} | |
data(iris) | |
values <- as.matrix(iris[,1:4]) | |
labels <- as.factor(iris[,5]) | |
# Compute Unit-Variance Scaling | |
std <- autoscale(values) | |
# Compute covariance matrix (from scratch) | |
cov <- covariance(std) | |
# Compute the eigen values and vectors | |
evs <- eigenvectors(cov) | |
# Compute (cumulative) variance explained | |
ve <- round((evs$values / sum(evs$values)) * 100, 1) | |
cve <- cumsum(ve) | |
plot(cve, main = "Cumulative variance explained", xlab = "Component", ylab = "%", t = "b") | |
# Compute the projection (P) | |
W <- evs$vectors | |
P <- std %*% W | |
colnames(P) <- paste0("PC", 1:ncol(P)) | |
# Plot the computed principal components compared to the R default function | |
op <- par(mfrow = c(1,2)) | |
plot(P[,c("PC1","PC2")], col = labels, main = "From Scratch", pch=19) | |
legend("bottomright", levels(labels), col = unique(labels), pch=19) | |
plot(prcomp(values, scale = TRUE)$x[,c("PC1","PC2")], col = labels, main ="Function prcomp()", pch=19) | |
legend("topright", levels(labels), col = unique(labels), pch=19) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment