This vignette introduces the usage of SpaCOAP for the analysis of high-dimensional count data with additional high-dimensional covariates, and compares it with other related methods.
The package can be loaded with the command:
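library(SpaCOAP)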
First, we generate the simulated data.
width <- 20; height <- 30       # dimensions of the spatial grid
n <- width * height             # number of spatial locations
p <- 500                        # number of count variables
q <- 5; d <- 40; k <- 3; r <- 3 # number of factors (q), columns of H (d), columns of Z (k), rank of the coefficient matrix (r)
bandwidth <- 1
rho <- c(8, 0.6)
sigma2_eps <- 1
datlist <- gendata_spacoap(seed = 1, width = width, height = height,
                           p = p, q = q, d = d, k = k, rank0 = r, bandwidth = bandwidth,
                           eta0 = 0.5, rho = rho, sigma2_eps = sigma2_eps)
X_count <- datlist$X; H <- datlist$H; Z <- datlist$Z
F0 <- datlist$F0; B0 <- datlist$B0
bbeta0 <- datlist$bbeta0; alpha0 <- datlist$alpha0
# Build the spatial weight (adjacency) matrix from the spatial coordinates
Adj_sp <- SpaCOAP:::getneighbor_weightmat(datlist$pos, 1.1, bandwidth)
Fit the SpaCOAP model using the function SpaCOAP() in the R package SpaCOAP. Users can run ?SpaCOAP to see the details of this function.
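The call below is a minimal sketch rather than a definitive specification: we assume the fitting function takes the count matrix, the spatial weight matrix, and the covariate matrices H and Z in the same order as chooseParams() used later, with rank_use and q giving the rank of the coefficient matrix and the number of factors; consult ?SpaCOAP for the authoritative argument list. The fitted object reslist is used throughout the rest of the vignette and contains the factors F, the loadings B, the coefficient matrices bbeta and alpha, the ELBO sequence ELBO_seq, and the elapsed time time_use.
# Assumed interface; see ?SpaCOAP for the exact arguments
reslist <- SpaCOAP(X_count, Adj_sp, H, Z, rank_use = r, q = q)
str(reslist, max.level = 1)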
Check the monotonically increasing property of the evidence lower bound (ELBO).
library(ggplot2)
dat_iter <- data.frame(iter=1:length(reslist$ELBO_seq[-1]), ELBO=reslist$ELBO_seq[-1])
ggplot(data=dat_iter, aes(x=iter, y=ELBO)) + geom_line() + geom_point() + theme_bw(base_size = 20)
We calculate metrics to measure the estimation accuracy: the trace statistic is used to measure the estimation accuracy of the loading matrix and the prediction accuracy of the factor matrix, while the mean absolute error is adopted to measure the estimation error of bbeta and alpha.
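Concretely, for an estimate $\hat{\mathbf{H}}$ of a true factor or loading matrix $\mathbf{H}_0$, the trace statistic implemented below is
$$
\mathrm{Tr}(\hat{\mathbf{H}}, \mathbf{H}_0) = \frac{\operatorname{tr}\left\{\mathbf{H}_0^\top \hat{\mathbf{H}} (\hat{\mathbf{H}}^\top \hat{\mathbf{H}})^{-1} \hat{\mathbf{H}}^\top \mathbf{H}_0\right\}}{\operatorname{tr}(\mathbf{H}_0^\top \mathbf{H}_0)},
$$
which lies in $[0, 1]$, with values closer to one indicating better recovery of the column space of $\mathbf{H}_0$.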
First, we define the metric functions:
# Mean absolute error of a vector
norm1_vec <- function(x) mean(abs(x))
# Trace statistic: values closer to one indicate better recovery of the column space of H0
trace_statistic_fun <- function(H, H0){
  tr_fun <- function(x) sum(diag(x))
  mat1 <- t(H0) %*% H %*% qr.solve(t(H) %*% H) %*% t(H) %*% H0
  tr_fun(mat1) / tr_fun(t(H0) %*% H0)
}
Then, we calculate the quantities for SpaCOAP.
metricList <- list()
metricList$SpaCOAP <- list()
metricList$SpaCOAP$F_tr <- trace_statistic_fun(reslist$F, F0)
metricList$SpaCOAP$B_tr <- trace_statistic_fun(reslist$B, B0)
metricList$SpaCOAP$alpha_norm1 <- norm1_vec(reslist$alpha - alpha0) / mean(abs(alpha0))
metricList$SpaCOAP$beta_norm1 <- norm1_vec(reslist$bbeta - bbeta0) / mean(abs(bbeta0))
metricList$SpaCOAP$time <- reslist$time_use
We compare SpaCOAP with several prominent methods in the literature. (1) COAP fits a covariate-augmented Poisson factor model that accounts for the low-rank structure of the coefficient matrix but does not account for the spatial dependence among units; it is implemented in the R package COAP. (2) PLNPCA performs covariate-augmented PCA without considering either the low-rank structure of the coefficient matrix or the spatial dependence among units; it is implemented in the R package PLNmodels. (3) The multi-response reduced-rank Poisson regression model (MRRR) estimates the low-rank coefficient matrix and the factor part by treating them as a single entity and fails to account for spatial dependence among units; it is implemented in the R package rrpack. (4) The spatial Poisson factor model FAST accounts for spatial dependence among units but is unable to account for additional covariates; it is implemented in the R package ProFAST.
First, we implement COAP method:
library(COAP)
tic <- proc.time()
res_coap <- RR_COAP(X_count, Z = cbind(Z, H), rank_use = k + r, q = q, epsELBO = 1e-9)
toc <- proc.time()
time_coap <- toc[3] - tic[3]
metricList$COAP$F_tr <- trace_statistic_fun(res_coap$H, F0)
metricList$COAP$B_tr <- trace_statistic_fun(res_coap$B, B0)
alpha_coap <- res_coap$bbeta[,1:k]
beta_coap <- res_coap$bbeta[,(k+1):(k+d)]
metricList$COAP$alpha_norm1 <- norm1_vec(alpha_coap - alpha0) / mean(abs(alpha0))
metricList$COAP$beta_norm1 <- norm1_vec(beta_coap - bbeta0) / mean(abs(bbeta0))
metricList$COAP$time <- time_coap
# Wrapper to fit PLNPCA from the PLNmodels package with covariates and an offset
PLNPCA_run <- function(X_count, covariates, q, Offset = rep(1, nrow(X_count)),
                       workers = NULL, maxIter = 10000, ftol_rel = 1e-8, xtol_rel = 1e-4){
  require(PLNmodels)
  if(!is.null(workers)){
    future::plan("multisession", workers = workers)
  }
  if(!is.character(Offset)){
    dat_plnpca <- prepare_data(X_count, covariates)
    dat_plnpca$Offset <- Offset
  }else{
    dat_plnpca <- prepare_data(X_count, covariates, offset = Offset)
  }
  d <- ncol(covariates)
  formu <- paste0("Abundance ~ 1 + offset(log(Offset)) + ",
                  paste(paste0("V", 1:d), collapse = '+'))
  control_use <- list(maxeval = maxIter, ftol_rel = ftol_rel, xtol_rel = xtol_rel)
  control_main <- PLNPCA_param(
    backend = "nlopt",
    trace = 1,
    config_optim = control_use,
    inception = NULL
  )
  myPCA <- PLNPCA(as.formula(formu), data = dat_plnpca, ranks = q, control = control_main)
  myPCA1 <- getBestModel(myPCA)
  res_plnpca <- list(PCs = myPCA1$scores, bbeta = myPCA1$model_par$B,
                     loadings = myPCA1$model_par$C)
  return(res_plnpca)
}
tic <- proc.time()
res_plnpca <- PLNPCA_run(X_count, cbind(Z[,-1],H), q=q)
toc <- proc.time()
time_plnpca <- toc[3] - tic[3]
metricList$PLNPCA$F_tr <- trace_statistic_fun(res_plnpca$PCs, F0)
metricList$PLNPCA$B_tr <- trace_statistic_fun(res_plnpca$loadings, B0)
alpha_plnpca <- t(res_plnpca$bbeta[1:k,])
beta_plnpca <- t(res_plnpca$bbeta[(k+1):(k+d),])
metricList$PLNPCA$alpha_norm1 <- norm1_vec(alpha_plnpca - alpha0) / mean(abs(alpha0))
metricList$PLNPCA$beta_norm1 <- norm1_vec(beta_plnpca - bbeta0) / mean(abs(bbeta0))
metricList$PLNPCA$time <- time_plnpca
## MRRR
mrrr_run <- function(Y, X, rank0, q = NULL, family = list(poisson()),
                     familygroup = rep(1, ncol(Y)), epsilon = 1e-4, sv.tol = 1e-2,
                     maxIter = 2000, trace = TRUE, truncflag = FALSE, trunc = 500){
  require(rrpack)
  n <- nrow(Y); p <- ncol(Y)
  if(!is.null(q)){
    ## Absorb the factor part into the coefficient matrix via unit-specific dummy columns
    rank0 <- rank0 + q
    X <- cbind(X, diag(n))
  }
  if(truncflag){
    ## Truncate extremely large counts to stabilize the fit
    Y[Y > trunc] <- trunc
  }
  svdX0d1 <- svd(X)$d[1]
  init1 <- list(kappaC0 = svdX0d1 * 5)
  control <- list(epsilon = epsilon, sv.tol = sv.tol, maxit = maxIter,
                  trace = trace, gammaC0 = 1.1, plot.cv = TRUE,
                  conv.obj = TRUE)
  fit.mrrr <- mrrr(Y = Y, X = X[, -1], family = family, familygroup = familygroup,
                   penstr = list(penaltySVD = "rankCon", lambdaSVD = 1),
                   control = control, init = init1, maxrank = rank0)
  return(fit.mrrr)
}
tic <- proc.time()
res_mrrr <- mrrr_run(X_count, cbind(Z, H), r + ncol(Z), q = q, truncflag = TRUE, trunc = 1e4)
toc <- proc.time()
time_mrrr <- toc[3] - tic[3]
Calculate the metrics for MRRR.
hbbeta_mrrr <- t(res_mrrr$coef[1:ncol(cbind(Z, H)), ])
Theta_hb <- res_mrrr$coef[(ncol(cbind(Z, H)) + 1):(nrow(cbind(Z, H)) + ncol(cbind(Z, H))), ]
svdTheta <- svd(Theta_hb, nu=q, nv=q)
metricList$MRRR$F_tr <- trace_statistic_fun(svdTheta$u, F0)
metricList$MRRR$B_tr <- trace_statistic_fun(svdTheta$v, B0)
alpha_mrrr <- hbbeta_mrrr[,1:k]
beta_mrrr <- hbbeta_mrrr[,(k+1):(k+d)]
metricList$MRRR$alpha_norm1 <- norm1_vec(alpha_mrrr - alpha0) / mean(abs(alpha0))
metricList$MRRR$beta_norm1 <- norm1_vec(beta_mrrr - bbeta0) / mean(abs(bbeta0))
metricList$MRRR$time <- time_mrrr
## FAST
# Wrapper around ProFAST::FAST_run for fitting the spatial Poisson factor model
fast_run <- function(X_count, Adj_sp, q, verbose = TRUE, epsELBO = 1e-8){
  require(ProFAST)
  reslist <- FAST_run(XList = list(X_count),
                      AdjList = list(Adj_sp), q = q, fit.model = 'poisson',
                      verbose = verbose, epsLogLik = epsELBO)
  reslist$hV <- reslist$hV[[1]]
  return(reslist)
}
tic <- proc.time()
res_fast <- fast_run(X_count, Adj_sp, q=q, verbose=TRUE, epsELBO=1e-8)
toc <- proc.time()
time_fast <- toc[3] - tic[3]
metricList$FAST$F_tr <- trace_statistic_fun(res_fast$hV, F0)
metricList$FAST$B_tr <- trace_statistic_fun(res_fast$W, B0)
metricList$FAST$time <- time_fast
Next, we summarize the metrics for SpaCOAP and the other compared methods in a data.frame object.
# Convert a list with possible NULL entries into a numeric vector (NULL -> NA)
list2vec <- function(xlist){
  nn <- length(xlist)
  me <- rep(NA, nn)
  idx_noNA <- which(sapply(xlist, function(x) !is.null(x)))
  for(r in idx_noNA) me[r] <- xlist[[r]]
  return(me)
}
dat_metric <- data.frame(Tr_F = sapply(metricList, function(x) x$F_tr),
                         Tr_B = sapply(metricList, function(x) x$B_tr),
                         err_alpha = list2vec(lapply(metricList, function(x) x$alpha_norm1)),
                         err_beta = list2vec(lapply(metricList, function(x) x$beta_norm1)),
                         time = sapply(metricList, function(x) x$time),
                         Method = names(metricList))
dat_metric$Method <- factor(dat_metric$Method, levels=dat_metric$Method)
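Before plotting, the assembled metrics can also be inspected as a table; a minimal sketch using knitr::kable() (assuming the knitr package is installed):
knitr::kable(dat_metric[, c("Method", "Tr_F", "Tr_B", "err_alpha", "err_beta", "time")], digits = 3, row.names = FALSE)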
Plot the results for SpaCOAP and the other methods, which suggests that SpaCOAP achieves better estimation accuracy for the quantities of interest.
library(cowplot)
p1 <- ggplot(data=subset(dat_metric, !is.na(Tr_B)), aes(x= Method, y=Tr_B, fill=Method)) + geom_bar(stat="identity") + xlab(NULL) + scale_x_discrete(breaks=NULL) + theme_bw(base_size = 16)
p2 <- ggplot(data=subset(dat_metric, !is.na(Tr_F)), aes(x= Method, y=Tr_F, fill=Method)) + geom_bar(stat="identity") + xlab(NULL) + scale_x_discrete(breaks=NULL)+ theme_bw(base_size = 16)
p3 <- ggplot(data=subset(dat_metric, !is.na(err_alpha)), aes(x= Method, y=err_alpha, fill=Method)) + geom_bar(stat="identity") + xlab(NULL) + scale_x_discrete(breaks=NULL)+ theme_bw(base_size = 16)
p4 <- ggplot(data=subset(dat_metric, !is.na(err_beta)), aes(x= Method, y=err_beta, fill=Method)) + geom_bar(stat="identity") + xlab(NULL) + scale_x_discrete(breaks=NULL)+ theme_bw(base_size = 16)
plot_grid(p1,p2,p3, p4, nrow=2, ncol=2)
We apply the singular value ratio (SVR) based method to select the number of factors and the rank of the coefficient matrix. The results show that the SVR method has the potential to identify the true values.
res1 <- chooseParams(X_count, Adj_sp, H, Z, verbose=FALSE)
print(c(q_true=q, q_est=res1['hq']))
print(c(r_true=r, r_est=res1['hr']))
Session Info
sessionInfo()
#> R version 4.1.2 (2021-11-01)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 22631)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=C
#> [2] LC_CTYPE=Chinese (Simplified)_China.936
#> [3] LC_MONETARY=Chinese (Simplified)_China.936
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=Chinese (Simplified)_China.936
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> loaded via a namespace (and not attached):
#> [1] digest_0.6.29 R6_2.5.1 jsonlite_1.8.0 magrittr_2.0.3
#> [5] evaluate_0.15 stringi_1.7.6 rlang_1.1.0 cli_3.2.0
#> [9] rstudioapi_0.13 jquerylib_0.1.4 bslib_0.3.1 rmarkdown_2.11
#> [13] tools_4.1.2 stringr_1.4.0 xfun_0.29 yaml_2.3.6
#> [17] fastmap_1.1.0 compiler_4.1.2 htmltools_0.5.2 knitr_1.37
#> [21] sass_0.4.1