Created
December 12, 2020 21:08
-
-
Save jschoeley/45a0e05b54512744e85d3eb00eeeae43 to your computer and use it in GitHub Desktop.
Disaggregating counts from Lexis surface with irregular grouping via PCLMM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Disaggregation of 2D histogram of Poisson-counts with irregular bins | |
# based upon the penalized composite link mixed model | |
# | |
# Jonas Schöley, based on code from | |
# Deirdre I.R. Douma: The Penalized Composite Link Mixed Model. | |
# see further references in code | |
# Given is a matrix of observed counts over 2 dimensions (here | |
# year and age, i.e. a so-called Lexis surface). | |
# Each entry in the matrix corresponds to a rectangular region on the | |
# 2D surface. The regions do not overlap, but may be of irregular | |
# dimensions. The regions completely fill the rectangle given by the | |
# range of the age and year coordinates. | |
# | |
# It is known that the counts have been aggregated into the regions | |
# from counts observed on a smaller grid. The goal is to estimate those | |
# original counts on a smaller grid. This is achieved by the penalized | |
# composite link (mixed) model. Part of this model is a matrix <C> which | |
# defines the aggregation pattern, i.e. the association between regions on | |
# the original, fine grid and regions on the aggregation grid. | |
# Init ------------------------------------------------------------ | |
set.seed(1987) | |
library(Matrix) | |
cnst <- list( | |
# number of year-cells on the fine grid | |
years = 120, | |
# number of age-cells on the fine grid | |
ages = 100, | |
# two different age groupings | |
age_grouping_a = | |
rep(1:5, c(20, 10, 10, 10, 50)), | |
age_grouping_b = | |
rep(1:4, c(20, 20, 10, 50)) | |
) | |
# Construct composition matrix ------------------------------------ | |
# number of cells in ungrouped lexis grid | |
J = cnst$years*cnst$ages | |
# Lexis grid cells are sequentially enumerated in column-major order | |
lexis_grid_ungrouped <- | |
matrix(1:J, nrow = cnst$ages, ncol = cnst$years) | |
# superimpose the aggregation pattern on top of the ungrouped lexis grid | |
lexis_grid_aggregation_pattern <- matrix( | |
c(rep(cnst$age_grouping_a, 100), rep(cnst$age_grouping_b, 20)), | |
nrow = cnst$ages, ncol = cnst$years | |
) | |
# plot the aggregation pattern | |
image(t(lexis_grid_aggregation_pattern)) | |
# give a unique label to each aggregation region on the ungrouped lexis grid | |
lexis_grid_aggregation_key <- lexis_grid_aggregation_pattern | |
for (j in 2:cnst$years) { | |
lexis_grid_aggregation_key[,j] <- | |
lexis_grid_aggregation_key[cnst$ages,j-1] + | |
lexis_grid_aggregation_key[,j] | |
} | |
# number of cells in grouped lexis surface | |
I = max(lexis_grid_aggregation_key) | |
# composition matrix, associates ungrouped Lexis grid cells (columns) | |
# with Lexis aggregation regions (rows) | |
C = matrix(0, nrow = I, ncol = J) | |
for (i in 1:I) { | |
C[i,] <- c(lexis_grid_aggregation_key)==i | |
} | |
# Simulate aggregated data ---------------------------------------- | |
# simulate ungrouped (true) counts from Poisson distribution with | |
# linearly increasing trend with age | |
lexis_grid_ungrouped_counts <- | |
matrix(rpois(J, 1:cnst$ages), nrow = cnst$ages, ncol = cnst$years) | |
# aggregate ungrouped counts into vector according to the aggregation | |
# pattern defined in the composition matrix | |
# the indices of the vector correspond to the labels of the | |
# aggregation regions | |
y = C%*%c(lexis_grid_ungrouped_counts) | |
#sum(y) == sum(lexis_grid_ungrouped_counts) | |
# plot ungrouped counts | |
image(t(lexis_grid_ungrouped_counts)) | |
# Construct B-spline basis over ungrouped Lexis grid -------------- | |
# All code in this section copied and slightly changed by Jonas Schöley from | |
# Deirdre I.R. Douma: The Penalized Composite Link Mixed Model. Master Thesis. | |
# https://www.universiteitleiden.nl/binaries/content/assets/science/mi/scripties/statscience/douma-d-master-thesiss1577921.pdf | |
rowwisekr <- function (X1, X2) { | |
one.1 <- matrix(1, 1, ncol(X1)) | |
one.2 <- matrix(1, 1, ncol(X2)) | |
kronecker(X1, one.2) * kronecker(one.1, X2) | |
} | |
# 2 column matrix giving the year and age coordinates for the centroids | |
# of the cells on the fine Lexis grid. Row numbers correspond to cell_id. | |
coord <- cbind( | |
rep(1:cnst$years, each = cnst$ages), | |
rep(1:cnst$ages, cnst$years) | |
) | |
k = 15; deg = 3; ord_D = 2 | |
# b-spline basis for year coordinates | |
coord1.min <- min(coord[, 1]) | |
coord1.max <- max(coord[, 1]) | |
d1 <- (coord1.max - coord1.min)/(k - deg) | |
knots1 <- seq(coord1.min - deg * d1, coord1.max + deg * d1, by = d1) | |
B1 <- splines::splineDesign( | |
x = coord[, 1], knots = knots1, ord = deg + 1, | |
outer.ok = TRUE, sparse = TRUE | |
) | |
# b-spline basis for age coordinates | |
coord2.min <- min(coord[, 2]) | |
coord2.max <- max(coord[, 2]) | |
d2 <- (coord2.max - coord2.min)/(k - deg) | |
knots2 <- seq(coord2.min - deg * d2, coord2.max + deg * d2, by = d2) | |
B2 <- splines::splineDesign( | |
x = coord[, 2], knots = knots2, ord = deg + 1, | |
outer.ok = TRUE, sparse = TRUE | |
) | |
# difference matrix | |
D <- diff(diag(k), diff = ord_D) | |
# reparametrisation - Mixed model decomposition | |
# SVD of P and constructing X and Z for two dimensions | |
# This is really clever as the mixed model formulation gets rid of | |
# the need to do an expensive grid search for the optimal lambda | |
# smoothing parameter. Instead one assumes the parameters of the | |
# b-spline to originate from a multivariate normal distribution, thereby | |
# performing the regularization. | |
# - Jonas Schöley | |
P <- t(D)%*%D #kxk | |
eigen.P <- eigen(P) | |
U <- eigen.P$vectors[, 1:(k - ord_D)] | |
Sigma <- eigen.P$values[1:(k - ord_D)] | |
L <- U %*% diag(sqrt(Sigma)) | |
Z1 <- B1 %*% L %*% solve(t(L) %*% L) | |
Z2 <- B2 %*% L %*% solve(t(L) %*% L) | |
X_mat <- cbind(rep(1, k), 1:k) #kxp | |
X1 <- B1 %*% X_mat #Jxp | |
X2 <- B2 %*% X_mat #Jxp | |
# Constructing X and Z with row-wise Kronecker product | |
X <- rowwisekr(X2, X1) | |
Z <- cbind(rowwisekr(Z2, X1), rowwisekr(X2, Z1), rowwisekr(Z2, Z1)) | |
# Fit PCLM -------------------------------------------------------- | |
# All code in this section copied and slightly changed from | |
# Deirdre I.R. Douma: The Penalized Composite Link Mixed Model. Master Thesis. | |
# https://www.universiteitleiden.nl/binaries/content/assets/science/mi/scripties/statscience/douma-d-master-thesiss1577921.pdf | |
I <- nrow(y) | |
J <- nrow(X) | |
q <- ncol(Z) | |
y <- as.matrix(y, ncol = 1) | |
# Obtaining initial estimates | |
C <- Matrix(C, sparse = TRUE) | |
row_C <- rowSums(C) # cells contributing to aggregate count | |
mean_row <- y / row_C # counts per contributing cell if contributions to aggregate are uniform | |
y_long <- rep(mean_row, row_C) # estimated cell counts under uniform contributions to aggregate | |
fit <- glm(floor(y_long) ~ 0 + as.matrix(X), family = 'poisson') | |
# Setting up preliminaries for loop | |
dif <- 10 | |
theta <- c(0) | |
beta_old <- fit$coefficients #Vector of p | |
u_old <- c(rep(0, q)) #Vector of q | |
# I changed this to log-fitted values as seemed to be intended in the | |
# original code. otherwise the exp in the next step does not make sense. | |
# I also changed the glm step above to poisson, mainly due to the log-link | |
# - Jonas Schöley | |
eta_init <- log(fit$fitted.values) #Jx1 | |
gamm <- exp(eta_init) #Jx1 | |
Gamm <- .sparseDiagonal(n = J, x = as.vector(gamm)) #JxJ | |
H <- numeric(1) | |
s <- numeric(1) | |
mu <- C %*% gamm #Ix1 | |
mu <- Matrix(mu, sparse = TRUE) | |
W <- .sparseDiagonal(n = I, x = as.vector(mu)) #IxI | |
Winv <- .sparseDiagonal(n = I, x = (1 / as.vector(mu))) #IxI | |
X_breve <- Winv %*% C %*% Gamm %*% X #Ixp | |
Z_breve <- Winv %*% C %*% Gamm %*% Z #Ixq | |
z <- X_breve %*% beta_old + Z_breve %*% u_old + Winv %*% (y - mu) #Ix1 | |
G <- .sparseDiagonal(n = q, x = theta) #qxq | |
ZGZ <- Z_breve %*% G %*% t(Z_breve) #IxI | |
V <- Winv + ZGZ #IxI | |
V <- Matrix(V, sparse = TRUE) | |
V_inv <- solve(V) #IxI | |
while(any(dif > 1e-6)){ | |
#Estimating Coefficients | |
beta <- solve(t(X_breve) %*% V_inv %*% X_breve) %*% t(X_breve) %*% V_inv %*% z #p | |
u <- G %*% t(Z_breve) %*% V_inv %*% (z - X_breve %*% beta_old) #q | |
eta <- X %*% beta + Z %*% u #Jx1 | |
# Applying Newton-Raphson to calculate theta | |
P <- V_inv - V_inv %*% X_breve %*% solve(t(X_breve) %*% V_inv %*% X_breve) %*% | |
t(X_breve) %*% V_inv | |
# Hessian matrix | |
H <- (1/2) %*% sum(diag((t(Z_breve) %*% P %*% Z_breve %*% t(Z_breve)%*% P %*% Z_breve))) - | |
t(z) %*% P %*% Z_breve %*% t(Z_breve) %*% P%*% Z_breve %*% t(Z_breve) %*% P %*% z | |
s <- -(1/2) %*% sum(diag((t(Z_breve) %*% P %*% Z_breve))) + | |
(1/2) %*%(t(z) %*% P %*% Z_breve %*% t(Z_breve) %*% P %*% z) | |
theta_new <- as.numeric(theta - solve(H) * s) | |
gamm <- as.numeric(exp(eta)) | |
Gamm <- .sparseDiagonal(n = J, x = as.vector(gamm)) #JxJ | |
mu <- C %*% gamm #Ix1 | |
mu <- Matrix(mu, sparse = TRUE) | |
W <- .sparseDiagonal(n = I, x = as.vector(mu)) #IxI | |
Winv <- .sparseDiagonal(n = I, x = (1 / as.vector(mu))) #IxI | |
X_breve <- Winv %*% C %*% Gamm %*% X #Ixp | |
Z_breve <- Winv %*% C %*% Gamm %*% Z #Ixq | |
z <- X_breve %*% beta + Z_breve %*% u + Winv %*% (y - mu) #Ix1 | |
G <- .sparseDiagonal(n = q, x = theta_new) #qxq | |
ZGZ <- Z_breve %*% G %*% t(Z_breve) #IxI | |
V <- Winv + ZGZ #IxI | |
V <- Matrix(V, sparse = TRUE) | |
V_inv <- solve(V) #IxI | |
# Updating Parameters | |
dif <- abs(beta - beta_old) | |
beta_old <- beta | |
theta <- theta_new | |
} | |
# Show ungrouped counts ------------------------------------------- | |
plot(mu-y) | |
abs(sum(gamm) - sum(y)) | |
lexis_grid_ungrouped_counts_predicted <- | |
matrix(gamm, nrow = cnst$ages, ncol = cnst$years) | |
# true ungrouped (points) vs. predicted ungrouped (surface) | |
rgl::persp3d( | |
x = 1:cnst$ages, y = 1:cnst$years, lexis_grid_ungrouped_counts_predicted, | |
color = 'grey' | |
) | |
rgl::points3d( | |
x = coord[,2], y = coord[,1], z = c(lexis_grid_ungrouped_counts), | |
col = 'black', alpha = 0.2 | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment