MMGFM: simulation1

This vignette introduces the usage of MMGFM for the analysis of high-dimensional multi-study multi-modality data with additional covariates, by comparison with other methods.

Generate the simulated data

First, we generate the simulated data from three data sources and three modalities, with each modality consisting of count variables.

N <- 100
q <- 3
qsvec  <- rep(2,3)
sigma_eps <- 1
datlist <- gendata_mmgfm(seed = 1,  nvec = c(300, 200, 100),  
                        pveclist = list('poisson'=c(50, 150, 200)),
                        q = q,  d= 3,qs = qsvec,  rho = 2, rho_z=0.5,
                        sigmavec=1,  sigma_eps=sigma_eps)
XList <- datlist$XList
max(unlist(XList))
print(str(XList))
ZList <- datlist$ZList # covariates
print(head(ZList[[1]]))
tauList <- datlist$tauList # offset term
numvarmat <- datlist$numvarmat

Fit the MMGFM model using the function MMGFM() in the R package MMGFM. Users can use ?MMGFM to see the details about this function

system.time({
  tic <- proc.time()
  reslist <- MMGFM(XList, ZList=ZList, numvarmat, q=q, qsvec = qsvec, init='MSFRVI')
  toc <- proc.time()
  time_MMGFM <- toc[3] - tic[3]
})

Check the increased property of the envidence lower bound function.

library(ggplot2)
library(scales)
dat_iter <- data.frame(iter=1:length(reslist$ELBO_seq), ELBO=reslist$ELBO_seq)
ggplot(data=dat_iter, aes(x=iter, y=ELBO)) + geom_line() + geom_point() + theme_bw(base_size = 20) +
  scale_y_continuous(labels = label_scientific(digits = 8))

We calculate the metrics to measure the estimation accuracy, where the mean trace statistic is used to measure the estimation accuracy of loading matrix and prediction accuracy of factor matrix, which is evaluated by the function measurefun() in the R package GFM, and the root of mean absolute error is adopted to measure the estimation error of beta.


methodNames <- c("MMGFM", "GFM", "MRRR", "MSFR", "MultiCOAP")
n_methods <- length(methodNames)
metricList <- list(F_tr = rep(NA, n_methods), 
                   H_tr = rep(NA,  n_methods), 
                   V_tr = rep(NA,  n_methods), 
                   A_tr = rep(NA, n_methods),
                   B_tr = rep(NA,  n_methods),
                   beta_norm=rep(NA,  n_methods),
                   time = rep(NA,  n_methods))
for(ii in seq_along(metricList)) names(metricList[[ii]]) <- methodNames
metricList$F_tr[1] <- meanTr(reslist$hF, datlist$F0List)
metricList$H_tr[1] <-meanTr(reslist$hH, datlist$H0List)
metricList$V_tr[1] <-meanTr(lapply(reslist$hv, function(x) Reduce(cbind,x) ), datlist$VList)
metricList$A_tr[1] <-metric_mean(AList=reslist$hA, datlist$A0List, align='unaligned', numvarmat = numvarmat)
metricList$B_tr[1] <- mean(ms_metric_mean(reslist$hB, datlist$B0List, align='unaligned', numvarmat = numvarmat))
metricList$beta_norm[1] <-normvec(Reduce(cbind, reslist$hbeta)- Reduce(cbind,datlist$betaList))
metricList$time[1] <- reslist$time.use

Compare with other methods

We compare MMGFM with various prominent methods in the literature. They are (1) Generalized factor model (Liu et al. 2023) implemented in the R package GFM; (2) Multi-response reduced-rank Poisson regression model (MMMR, Luo et al. 2018) implemented in rrpack R package; and (3) the multi-study covariate-augmented overdispersed Poisson factor (MultiCOAP) model.

(1). First, we implemented the generalized factor model (GFM) and record the metrics that measure the estimation accuracy and computational cost.


res_gfm <- gfm_run(XList, numvarmat, q=q)
metricList$F_tr[2] <- meanTr(res_gfm$hF, datlist$F0List)
metricList$A_tr[2] <-metric_mean(AList=res_gfm$hA, datlist$A0List, align='unaligned', numvarmat = numvarmat)
metricList$time[2] <- res_gfm$time.use

(2). Then, we implemented Multi-response reduced-rank Poisson regression model (MMMR) and recorded the metrics. Here, we truncate values to ensure normal working of this function. Otherwise, it will produce error.

res_mrrr <- mrrr_run(XList, ZList, numvarmat, q, truncflag=TRUE, trunc=500)
metricList$F_tr[3] <- meanTr(res_mrrr$hF, datlist$F0List)
metricList$A_tr[3] <- metric_mean(AList=res_mrrr$hA, datlist$A0List, align='unaligned', numvarmat = numvarmat)
metricList$beta_norm[3] <-normvec(res_mrrr$hbeta - Reduce(cbind,datlist$betaList))
metricList$time[3] <- res_mrrr$time.use

Thirdly, we implemented the zero-inflated Poisson factor model:

source("https://raw.githubusercontent.com/feiyoung/MMGFM/refs/heads/main/simu_code/MSFR_main_R_MSFR_V1.R")
## To produce results in limited time, here we set maxIter=5. Even Set maxIter=1e4, the result is also not good.
res_msfr <- MSFR_run(XList, ZList, numvarmat, q, qs=qsvec, maxIter=5, load.source=TRUE, log.transform=TRUE)
metricList$F_tr[4] <- meanTr(res_msfr$hF, datlist$F0List)
metricList$H_tr[4] <- meanTr(res_msfr$hH, datlist$H0List)
metricList$A_tr[4] <- metric_mean(AList=res_msfr$hA, datlist$A0List, align='unaligned', numvarmat = numvarmat)
metricList$B_tr[4] <- mean(ms_metric_mean(res_msfr$hB, datlist$B0List, align='unaligned', numvarmat = numvarmat))
metricList$beta_norm[4] <- normvec(t(res_msfr$hbeta)- Reduce(cbind,datlist$betaList))
metricList$time[4] <- res_msfr$time.use

Fourthly, we also applied the the multi-study covariate-augmented overdispersed Poisson factor (MultiCOAP) model.

res_mcoap <- multicoap_run(XcList=XList, ZList,numvarmat, q, qsvec)
metricList$F_tr[5] <- meanTr(res_mcoap$hF, datlist$F0List)
metricList$H_tr[5] <-meanTr(res_mcoap$hH, datlist$H0List)
metricList$A_tr[5] <- metric_mean(AList=res_mcoap$hA, datlist$A0List, align='unaligned', numvarmat = numvarmat)
metricList$B_tr[5] <- mean(ms_metric_mean(res_mcoap$hB, datlist$B0List, align='unaligned', numvarmat = numvarmat))
metricList$beta_norm[5] <- normvec(t(res_mcoap$hbeta)- Reduce(cbind,datlist$betaList))
metricList$time[5] <- res_mcoap$time.use

Comparison of performance

Next, we summarized the metrics for MMGFM and other compared methods in a data.frame object. We observed that MMGFM achieved much better estimation accuracy for the quantities of interest.

mat.metric <- round(Reduce(rbind, metricList),3)
row.names(mat.metric) <- names(metricList)

dat_metric <- as.data.frame(mat.metric)
DT::datatable(dat_metric)

Session Info

sessionInfo()
#> R version 4.4.1 (2024-06-14 ucrt)
#> Platform: x86_64-w64-mingw32/x64
#> Running under: Windows 11 x64 (build 26100)
#> 
#> Matrix products: default
#> 
#> 
#> locale:
#> [1] LC_COLLATE=C                               
#> [2] LC_CTYPE=Chinese (Simplified)_China.utf8   
#> [3] LC_MONETARY=Chinese (Simplified)_China.utf8
#> [4] LC_NUMERIC=C                               
#> [5] LC_TIME=Chinese (Simplified)_China.utf8    
#> 
#> time zone: Asia/Shanghai
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> loaded via a namespace (and not attached):
#>  [1] digest_0.6.37     R6_2.5.1          fastmap_1.2.0     xfun_0.47        
#>  [5] cachem_1.1.0      knitr_1.48        htmltools_0.5.8.1 rmarkdown_2.28   
#>  [9] lifecycle_1.0.4   cli_3.6.3         sass_0.4.9        jquerylib_0.1.4  
#> [13] compiler_4.4.1    rstudioapi_0.16.0 tools_4.4.1       evaluate_1.0.0   
#> [17] bslib_0.8.0       yaml_2.3.10       rlang_1.1.4       jsonlite_1.8.9

MMGFM: simulation1

Wei Liu

2025-07-29

Generate the simulated data

Compare with other methods

Comparison of performance