Created
November 9, 2012 13:42
-
-
Save nicebread/4045717 to your computer and use it in GitHub Desktop.
Ruscio - Code for generating correlating variables with arbitrary distributions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## This code snippet is taken from Ruscio, J., & Kaczetow, W. (2008). Simulating Multivariate Nonnormal Data Using an Iterative Algorithm. Multivariate Behavioral Research, 43(3), 355–381. | |
## In the original code were some errors; this is a cleaned version. | |
## 2012 Felix Schönbrodt | |
#' @param Pop List with empirical samples | |
#' @param rho The correlation to be induced | |
#' @examples | |
#' Pop <- list(X=rnorm(1000), Y=rlnorm(3000)) | |
#' r1 <- GenData(Pop, rho=.4) | |
#' plot(r1) | |
#' cor(r1, method="p") | |
################################################################################################################ | |
GenData <- function(Pop, rho, N=1000, N.Factors = 0, Max.Trials = 5, Initial.Multiplier = 1, seed = NA) | |
{ | |
# Initialize variables and (if applicable) set random number seed (step 1) ------------------------------------- | |
k <- length(Pop) | |
Data <- matrix(0, nrow = N, ncol = k) # Matrix to store the simulated data | |
Iteration <- 0 # Iteration counter | |
Best.RMSR <- 1 # Lowest RMSR correlation | |
Trials.Without.Improvement <- 0 # Trial counter | |
if (!is.na(seed)) set.seed(seed) # If user specified a nonzero seed, set it | |
Distributions <- matrix(NA, nrow=N, ncol=k) | |
Target.Corr <- matrix(c(1, rho, rho, 1), nrow=2) | |
# Generate distribution for each variable (step 2) ------------------------------------------------------------- | |
for (i in 1:k) { | |
Distributions[, i] <- sort(sample(Pop[[i]], N, replace = TRUE)) | |
} | |
# This implementation of GenData bootstraps each variable's score distribution from a supplied data set. | |
# Users should modify this block of the program, as needed, to generate the desired distribution(s). | |
# | |
# For example, to sample from chi-square distributions with 2 df, replace the 2nd line in this block with: | |
# Distributions[,i] <- sort(rchisq(N, df = 2)) | |
# | |
# Or, one can drop the loop and use a series of commands that samples variables from specified populations: | |
# Distributions[,1] <- sort(rnorm(N, 0, 1)) # Standard normal distribution | |
# Distributions[,2] <- sort(runif(N, 0, 1)) # Uniform distribution ranging from 0 - 1 | |
# Distributions[,3] <- sort(rlnorm(N, 0, 1)) # Log-normal distribution, log scale M = 0, SD = 1 | |
# Distributions[,4] <- sort(rexp(N, rate = 1)) # Exponential distribution with rate = 1 | |
# Distributions[,5] <- sort(rpois(N, lambda = 4)) # Poisson distribution with lambda = 4 | |
# Distributions[,6] <- sort(rbinom(N, 10, .25) # Binominal distribution, size = 10 and p = .25 | |
# Distributions[,7] <- sort(rbinom(N, 2, .25) # Binary distribution with p = .25 | |
# | |
# All of the commands shown above draw random samples from specified population distributions. As an | |
# alternative, one can reproduce distributions without sampling error. For example, working with a | |
# supplied data set, one can replace the 2nd line in this block with: | |
# Disrributions[,i] <- Supplied.Data[,i] | |
# Alternatively, idealized distributions can be reproduced. For example, uniform quantiles can be | |
# created and used to generate data from common distributions: | |
# Uniform.Quantiles <- seq(from = 0, to = 1, length = (N + 2))[2:(N + 1)] # quantiles 0, 1 dropped | |
# Distributions[,1] <- qnorm(Uniform.Quantiles, 0, 1) # Standard normal distribution | |
# Distributions[,2] <- qunif(Uniform.Quantiles, 0, 1) # Uniform distribution ranging from 0 to 1 | |
# Distributions[,3] <- qchisq(Uniform.Quantiles, df = 2) # Chi-square distribution with 2 df | |
# | |
# Note that when score distributions are generated from specified populations rather than bootstrapped from | |
# a supplied data set, the user must provide the target correlation matrix (see the next block). This is | |
# true regardless of whether the distributions incorporate sampling error. | |
# Calculate and store a copy of the target correlation matrix (step 3) ----------------------------------------- | |
#Target.Corr <- cor(Supplied.Data) | |
Intermediate.Corr <- Target.Corr | |
# This implementation of GenData calculates the target correlation matrix from a supplied data set. | |
# Alternatively, the user can modify the program to generate data with user-defined sample size, number of | |
# variables, and target correlation matrix by redefining the function as follows: | |
# GenData <- function(N, k, Target.Corr, N.Factors = 0, Max.Trials = 5, Initial.Multiplier = 1, seed = 0) | |
# In this case, one would also remove the program lines that calculate N, k, and Target.Corr. | |
# To generate data in which variables are uncorrelated, one would remove the “sort” function from step 2 | |
# and terminate the program before step 3 begins by returning the Distributions object as the data set. | |
# If number of latent factors was not specified, determine it through parallel analysis (step 4) --------------- | |
if (N.Factors == 0) | |
{ | |
Eigenvalues.Observed <- eigen(Intermediate.Corr)$values | |
Eigenvalues.Random <- matrix(0, nrow = 100, ncol = k) | |
Random.Data <- matrix(0, nrow = N, ncol = k) | |
for (i in 1:100) | |
{ | |
for (j in 1:k) | |
Random.Data[,j] <- sample(Distributions[,j], size = N, replace = TRUE) | |
Eigenvalues.Random[i,] <- eigen(cor(Random.Data))$values | |
} | |
Eigenvalues.Random <- apply(Eigenvalues.Random, 2, mean) # calculate mean eigenvalue for each factor | |
N.Factors <- max(1, sum(Eigenvalues.Observed > Eigenvalues.Random)) | |
} | |
# Generate random normal data for shared and unique components, initialize factor loadings (steps 5, 6) -------- | |
Shared.Comp <- matrix(rnorm(N * N.Factors, 0, 1), nrow = N, ncol = N.Factors) | |
Unique.Comp <- matrix(rnorm(N * k, 0, 1), nrow = N, ncol = k) | |
Shared.Load <- matrix(0, nrow = k, ncol = N.Factors) | |
Unique.Load <- matrix(0, nrow = k, ncol = 1) | |
# Begin loop that ends when specified number of iterations pass without improvement in RMSR correlation -------- | |
while (Trials.Without.Improvement < Max.Trials) | |
{ | |
Iteration <- Iteration + 1 | |
# Calculate factor loadings and apply to reproduce desired correlations (steps 7, 8) --------------------------- | |
Fact.Anal <- Factor.Analysis(Intermediate.Corr, Corr.Matrix = TRUE, N.Factors = N.Factors) | |
if (N.Factors == 1) { | |
Shared.Load[,1] <- Fact.Anal$loadings | |
} else { | |
Shared.Load <- Fact.Anal$loadings | |
} | |
Shared.Load[Shared.Load > 1] <- 1 | |
Shared.Load[Shared.Load < -1] <- -1 | |
if (Shared.Load[1,1] < 0) Shared.Load <- Shared.Load * -1 | |
Shared.Load.sq <- Shared.Load * Shared.Load | |
for (i in 1:k) | |
if (sum(Shared.Load.sq[i,]) < 1) { | |
Unique.Load[i,1] <- (1 - sum(Shared.Load.sq[i,])) | |
} else { | |
Unique.Load[i,1] <- 0 | |
} | |
Unique.Load <- sqrt(Unique.Load) | |
for (i in 1:k) { | |
Data[,i] <- (Shared.Comp %*% t(Shared.Load))[,i] + Unique.Comp[,i] * Unique.Load[i,1] | |
} | |
# the %*% operator = matrix multiplication, and the t() function = transpose (both used again in step 13) | |
# Replace normal with nonnormal distributions (step 9) --------------------------------------------------------- | |
for (i in 1:k) | |
{ | |
Data <- Data[sort.list(Data[,i]),] | |
Data[,i] <- Distributions[,i] | |
} | |
# Calculate RMSR correlation, compare to lowest value, take appropriate action (steps 10, 11, 12) -------------- | |
Reproduced.Corr <- cor(Data) | |
Residual.Corr <- Target.Corr - Reproduced.Corr | |
RMSR <- sqrt(sum(Residual.Corr[lower.tri(Residual.Corr)] * Residual.Corr[lower.tri(Residual.Corr)]) / | |
(.5 * (k * k - k))) | |
if (RMSR < Best.RMSR) { | |
Best.RMSR <- RMSR | |
Best.Corr <- Intermediate.Corr | |
Best.Res <- Residual.Corr | |
Intermediate.Corr <- Intermediate.Corr + Initial.Multiplier * Residual.Corr | |
Trials.Without.Improvement <- 0 | |
} else { | |
Trials.Without.Improvement <- Trials.Without.Improvement + 1 | |
Current.Multiplier <- Initial.Multiplier * .5 ^ Trials.Without.Improvement | |
Intermediate.Corr <- Best.Corr + Current.Multiplier * Best.Res | |
} | |
} # end of the while loop | |
# Construct the data set with the lowest RMSR correlation (step 13) -------------------------------------------- | |
Fact.Anal <- Factor.Analysis(Best.Corr, Corr.Matrix = TRUE, N.Factors = N.Factors) | |
if (N.Factors == 1) { | |
Shared.Load[,1] <- Fact.Anal$loadings | |
} else { | |
Shared.Load <- Fact.Anal$loadings | |
} | |
Shared.Load[Shared.Load > 1] <- 1 | |
Shared.Load[Shared.Load < -1] <- -1 | |
if (Shared.Load[1,1] < 0) {Shared.Load <- Shared.Load * -1} | |
Shared.Load.sq <- Shared.Load * Shared.Load | |
for (i in 1:k) { | |
if (sum(Shared.Load.sq[i,]) < 1) { | |
Unique.Load[i,1] <- (1 - sum(Shared.Load.sq[i,])) | |
} else { | |
Unique.Load[i,1] <- 0 | |
} | |
} | |
Unique.Load <- sqrt(Unique.Load) | |
for (i in 1:k) { | |
Data[,i] <- (Shared.Comp %*% t(Shared.Load))[,i] + Unique.Comp[,i] * Unique.Load[i,1] | |
} | |
Data <- apply(Data, 2, scale) # standardizes each variable in the matrix | |
for (i in 1:k) | |
{ | |
Data <- Data[sort.list(Data[,i]),] | |
Data[,i] <- Distributions[,i] | |
} | |
Data <- Data[sample(1:N, N, replace = FALSE), ] # randomize order of cases | |
# Report the results and return the simulated data set (step 14) ----------------------------------------------- | |
Iteration <- Iteration - Max.Trials | |
cat("\nN =",N,", k =",k,",",Iteration,"Iterations,",N.Factors,"Factors, RMSR r =",round(Best.RMSR,3),"\n") | |
return(Data) | |
} | |
################################################################################################################ | |
Factor.Analysis <- function(Data, Corr.Matrix = FALSE, Max.Iter = 50, N.Factors = 0) | |
{ | |
Data <- as.matrix(Data) | |
k <- dim(Data)[2] | |
if (N.Factors == 0) N.Factors <- k | |
if (!Corr.Matrix) Cor.Matrix <- cor(Data) | |
else Cor.Matrix <- Data | |
Criterion <- .001 | |
Old.H2 <- rep(99, k) | |
H2 <- rep(0, k) | |
Change <- 1 | |
Iter <- 0 | |
Factor.Loadings <- matrix(nrow = k, ncol = N.Factors) | |
while ((Change >= Criterion) & (Iter < Max.Iter)) | |
{ | |
Iter <- Iter + 1 | |
Eig <- eigen(Cor.Matrix) | |
L <- sqrt(Eig$values[1:N.Factors]) | |
for (i in 1:N.Factors) | |
Factor.Loadings[,i] <- Eig$vectors[,i] * L[i] | |
for (i in 1:k) | |
H2[i] <- sum(Factor.Loadings[i,] * Factor.Loadings[i,]) | |
Change <- max(abs(Old.H2 - H2)) | |
Old.H2 <- H2 | |
diag(Cor.Matrix) <- H2 | |
} | |
if (N.Factors == k) N.Factors <- sum(Eig$values > 1) | |
return(list(loadings = Factor.Loadings[,1:N.Factors], factors = N.Factors)) | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment