This vignette introduces the usage of CMGFM for the analysis of high-dimensional multimodality data with additional covariates, by comparison with other methods.
The package can be loaded with the command:
First, we generate the data simulated data.
pveclist <- list('gaussian'=c(50, 150),'poisson'=c(50, 150),
'binomial'=c(100,60))
q <- 6
sigmavec <- rep(1,3)
pvec <- unlist(pveclist)
methodNames <- c("CMGFM", "GFM", "MRRR", "LFR" , "GPCA", "COAP")
datlist <- gendata_cmgfm(pveclist = pveclist, seed = 1, n = 300,d = 3,
q = q, rho = rep(1,length(pveclist)), rho_z=0.2,
sigmavec=sigmavec, sigma_eps=1)
str(datlist)
XList <- datlist$XList
Z <- datlist$Z
numvarmat <- datlist$numvarmat
Uplist <- list()
k <- 1
for(id in 1:length(datlist$Uplist)){
for(im in 1:length(datlist$Uplist[[id]])){
Uplist[[k]] <- datlist$Uplist[[id]][[im]]
k <- k + 1
}
}
types <- datlist$types
Fit the CMGFM model using the function CMGFM()
in the R
package CMGFM
. Users can use ?CMGFM
to see the
details about this function
system.time({
tic <- proc.time()
rlist <- CMGFM(XList, Z, types=types, q=q, numvarmat=numvarmat)
toc <- proc.time()
time_smgfm <- toc[3] - tic[3]
})
Check the increased property of the evidence lower bound function.
library(ggplot2)
dat_iter <- data.frame(iter=1:length(rlist$ELBO_seq), ELBO=rlist$ELBO_seq)
ggplot(data=dat_iter, aes(x=iter, y=ELBO)) + geom_line() + geom_point() + theme_bw(base_size = 20)
We calculate the metrics to measure the estimation accuracy, where
the trace statistic is used to measure the estimation accuracy of
loading matrix and prediction accuracy of factor matrix, which is
evaluated by the function measurefun()
in the R package
GFM
, and the root of mean square error is adopted to
measure the estimation error of bbeta.
library(GFM)
hUplist <- lapply(seq_along(rlist$Bf), function(m) cbind(rlist$muf[[m]], rlist$Bf[[m]]))
metricList <- list()
metricList$CMGFM <- list()
meanTr <- function(hBlist, Blist, type='trace_statistic'){
###It is noted that the trace statistics is not symmetric, the true value must be in last
trvec <- sapply(1:length(Blist), function(j) measurefun(hBlist[[j]], Blist[[j]], type = type))
return(mean(trvec))
}
normvec <- function(x) sqrt(sum(x^2/ length(x)))
metricList$CMGFM$Tr_F <- measurefun(rlist$M, datlist$F0)
metricList$CMGFM$Tr_Upsilon <- meanTr(hUplist, Uplist)
metricList$CMGFM$Tr_U <- measurefun(Reduce(cbind,rlist$Xif), Reduce(cbind, datlist$U0))
metricList$CMGFM$beta_norm <- normvec(as.vector(Reduce(cbind,rlist$betaf)- Reduce(cbind,datlist$beta0List)))
metricList$CMGFM$Time <- rlist$time_use
We compare CMGFM with various prominent methods in the literature. They are (1) High-dimensional LFM (Bai and Ng 2002) implemented in the R package GFM; (2) PoissonPCA (Kenney et al. 2021) implemented in the R package PoissonPCA; (3) Zero-inflated Poisson factor model (ZIPFA, Xu et al. 2021) implemented in the R package ZIPFA; (4) Generalized factor model (Liu et al. 2023) implemented in the R package GFM; (5) PLNPCA (Chiquet et al. 2018) implemented in the R package PLNmodels; (6) Generalized Linear Latent Variable Models (GLLVM, Hui et al. 2017) implemented in the R package gllvm. (7) Poisson regression model for each xij, (j = 1, · · ·, p), implemented in stats R package; (8) Multi-response reduced-rank Poisson regression model (MMMR, Luo et al. 2018) implemented in rrpack R package.
(1). First, we implemented the generalized factor model (LFM) and record the metrics that measure the estimation accuracy and computational cost.
metricList$GFM <- list()
tic <- proc.time()
res_gfm <- gfm(XList, types=types, q=q)
toc <- proc.time()
time_gfm <- toc[3] - tic[3]
mat2list <- function(B, pvec, by_row=TRUE){
Blist <- list()
pcum = c(0, cumsum(pvec))
for(i in 1:length(pvec)){
if(by_row){
Blist[[i]] <- B[(pcum[i]+1):pcum[i+1],]
}else{
Blist[[i]] <- B[, (pcum[i]+1):pcum[i+1]]
}
}
return(Blist)
}
metricList$GFM$Tr_F <- measurefun(res_gfm$hH, datlist$F0)
metricList$GFM$Tr_Upsilon <- meanTr(mat2list(cbind(res_gfm$hmu,res_gfm$hB), pvec), Uplist)
metricList$GFM$Tr_U <- NA
metricList$GFM$beta_norm <- NA
metricList$GFM$Time <- time_gfm
mrrr_run <- function(Y, Z, numvarmat, rank0,family=list(poisson()),
familygroup, epsilon = 1e-4, sv.tol = 1e-2,
lambdaSVD=0.1, maxIter = 2000, trace=TRUE, trunc=500){
# epsilon = 1e-4; sv.tol = 1e-2; maxIter = 30; trace=TRUE,lambdaSVD=0.1
Diag <- function(vec){
q <- length(vec)
if(q > 1){
y <- diag(vec)
}else{
y <- matrix(vec, 1,1)
}
return(y)
}
require(rrpack)
q <- rank0
n <- nrow(Y); p <- ncol(Y)
X <- cbind(cbind(1, Z), diag(n))
d <- ncol(Z)
## Trunction
Y[Y>trunc] <- trunc
Y[Y< -trunc] <- -trunc
tic <- proc.time()
pvec <- as.vector(t(numvarmat))
pvec <- pvec[pvec>0]
pcums <- cumsum(pvec)
idxlist <- list()
idxlist[[1]] <- 1:pcums[1]
if(length(pvec)>1){
for(i in 2:length(pvec)){
idxlist[[i]] <- (pcums[i-1]+1):pcums[i]
}
}
svdX0d1 <- svd(X)$d[1]
init1 = list(kappaC0 = svdX0d1 * 5)
offset = NULL
control = list(epsilon = epsilon, sv.tol = sv.tol, maxit = maxIter,
trace = trace, gammaC0 = 1.1, plot.cv = TRUE,
conv.obj = TRUE)
res_mrrr <- mrrr(Y=Y, X=X[,-1], family = family, familygroup = familygroup,
penstr = list(penaltySVD = "rankCon", lambdaSVD = lambdaSVD),
control = control, init = init1, maxrank = rank0+d) #
hmu <- res_mrrr$coef[1,]
hbeta <- t(res_mrrr$coef[2:(d+1),])
hTheta <- res_mrrr$coef[-c(1:(d+1)),]
hbeta_rf <- NULL
for(i in seq_along(pvec)){
hbeta_rf <- cbind(hbeta_rf, colMeans(hbeta[idxlist[[i]],]))
}
# Matrix::rankMatrix(hTheta)
svd_Theta <- svd(hTheta, nu=q, nv=q)
hH <- svd_Theta$u
hB <- svd_Theta$v %*% Diag(svd_Theta$d[1:q])
toc <- proc.time()
time_mrrr <- toc[3] - tic[3]
return(list(hH=hH, hB=hB, hmu= hmu, beta=hbeta_rf, time_use=time_mrrr))
}
family_use <- list(gaussian(), poisson(), binomial())
familygroup <- sapply(seq_along(datlist$XList), function(j) rep(j, ncol(datlist$XList[[j]])))
res_mrrr <- mrrr_run(Reduce(cbind, datlist$XList), Z = Z, numvarmat, rank0=q, family=family_use,
familygroup = unlist(familygroup), maxIter=100) #
metricList$MRRR <- list()
metricList$MRRR$Tr_F <- measurefun(res_mrrr$hH, datlist$F0)
metricList$MRRR$Tr_Upsilon <-meanTr(mat2list(cbind(res_mrrr$hmu,res_mrrr$hB), pvec), Uplist)
metricList$MRRR$Tr_U <-NA
metricList$MRRR$beta_norm <- normvec(as.vector(res_mrrr$beta- Reduce(cbind,datlist$beta0List)))
metricList$MRRR$Time <- res_mrrr$time_use
Next, we summarized the metrics for COAP and other compared methods in a dataframe object.
list2vec <- function(xlist){
nn <- length(xlist)
me <- rep(NA, nn)
idx_noNA <- which(sapply(xlist, function(x) !is.null(x)))
for(r in idx_noNA) me[r] <- xlist[[r]]
return(me)
}
dat_metric <- data.frame(Tr_F = sapply(metricList, function(x) x$Tr_F),
Tr_Upsilon = sapply(metricList, function(x) x$Tr_Upsilon),
Tr_U = sapply(metricList, function(x) x$Tr_U),
beta_norm =sapply(metricList, function(x) x$beta_norm),
Time = sapply(metricList, function(x) x$Time),
Method = names(metricList))
dat_metric
Plot the results for COAP and other methods, which suggests that CMGFM achieves better estimation accuracy for the quantities of interest.
library(cowplot)
p1 <- ggplot(data=subset(dat_metric, !is.na(Tr_F)), aes(x= Method, y=Tr_F, fill=Method)) + geom_bar(stat="identity") + xlab(NULL) + scale_x_discrete(breaks=NULL) + theme_bw(base_size = 16)
p2 <- ggplot(data=subset(dat_metric, !is.na(Tr_Upsilon)), aes(x= Method, y=Tr_Upsilon, fill=Method)) + geom_bar(stat="identity") + xlab(NULL) + scale_x_discrete(breaks=NULL)+ theme_bw(base_size = 16)
p3 <- ggplot(data=subset(dat_metric, !is.na(beta_norm)), aes(x= Method, y=beta_norm, fill=Method)) + geom_bar(stat="identity") + xlab(NULL) + scale_x_discrete(breaks=NULL)+ theme_bw(base_size = 16)
p4 <- ggplot(data=subset(dat_metric, !is.na(Time)), aes(x= Method, y=Time, fill=Method)) + geom_bar(stat="identity") + xlab(NULL) + scale_x_discrete(breaks=NULL)+ theme_bw(base_size = 16)
plot_grid(p1,p2,p3, p4, nrow=2, ncol=2)
We applied the maximum singular value ratio based method to select the number of factors. The results showed that the maximum SVR method has the potential to identify the true values.
sessionInfo()
#> R version 4.4.1 (2024-06-14)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.1 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] rmarkdown_2.28
#>
#> loaded via a namespace (and not attached):
#> [1] digest_0.6.37 R6_2.5.1 fastmap_1.2.0 xfun_0.48
#> [5] maketools_1.3.1 cachem_1.1.0 knitr_1.48 htmltools_0.5.8.1
#> [9] buildtools_1.0.0 lifecycle_1.0.4 cli_3.6.3 sass_0.4.9
#> [13] jquerylib_0.1.4 compiler_4.4.1 sys_3.4.3 tools_4.4.1
#> [17] evaluate_1.0.1 bslib_0.8.0 yaml_2.3.10 jsonlite_1.8.9
#> [21] rlang_1.1.4