An Example of Irregular Data Analysis

library(DGP4LCF)

In this example, we demonstrate the usage of BSFA-DGP model to irregular data, which means there exist subjects in the study who have incomplete gene expression. The complete data is the same as that used in the other vignette, and it is saved in ‘sim_fcs_truth.rda’. We generated simulated data, where even-numbered people have complete observations of gene expression, while poeple whose index is odd number have two missing observations at the third and sixth time point.

First, the user has to prepare the initial values for latent factor scores \(y\), regression coefficients of factor loadings \(a\), binary variables of factor loadings \(z\) and also the variance for residuals \(\phi^2\). One possible way to obtain initial values is to use BFRM program to conduct a simple Bayesian Sparse Factor Analysis for cross-sectional data. We have already saved initial values for use in the file named ‘sim_fcs_init.rda’. Note that to save the time for knitting rmarkdown, we have also saved results of this simulated data in the data file named ‘sim_fcs_results_irregular_6_8.rda’, and users should be able to reproduce all results following the instructions in this vignette.

Firstly, we use the function ‘mcem_parameters_setup’ to generate R objects storing parameters in the model that are needed in the following Monte Carlo Expectation Maximization (MCEM) algorithm.

# for reproducibility purpose
set.seed(456)

mcem_parameter_setup_irregular_time_result<-
  mcem_parameter_setup(p = 100, k = 4, n = 17, q = 8,
                                      obs_time_num = sim_fcs_truth$obs_time_num,
                                      obs_time_index = sim_fcs_truth$obs_time_index,
                                      a_person = sim_fcs_truth$a_person,
                                      col_person_index = sim_fcs_truth$col_person_index,
                                      y_init = sim_fcs_init$y_init_irregular,
                                      a_init = sim_fcs_init$a_init_2,
                                      z_init = sim_fcs_init$z_init_2,
                                      phi_init = sim_fcs_init$phi_init_irregular,
                                      a_full = sim_fcs_truth$a_full,
                                      train_index = (1:8),
                                      x = sim_fcs_truth$observed_x_train_irregular)

Next, these generated R objects, along with other model setups created by the user, can be input to the function ‘mcem_algorithm_irregular_time’ for the Maximum Likelihood Estimate (MLE) of Dependent Gaussian Process (DGP) parameters. We choose a model with an intercept term.

mcem_algorithm_irregular_time_result<-
  mcem_algorithm(ind_x = TRUE,
                 x = sim_fcs_truth$observed_x_train_irregular,
                 mcem_parameter_setup_result = mcem_parameter_setup_irregular_time_result)

The function ‘mcem_cov_plot’ visualizes the updating process of cross-correlations. The true cross-correlation is also displayed for comparison.

old_par <- par(no.readonly = TRUE)

par(mfrow = c(2,4))
for (em_index in 1:sim_fcs_results_irregular_6_8$mcem_algorithm_irregular_time_result$index_used){
  mcem_cov_plot(sim_fcs_results_irregular_6_8$mcem_algorithm_irregular_time_result$sigmay_record[em_index,,], k = 4, q = 8, title = paste0("MCEM Iteration ", em_index))
}

mcem_cov_plot(sim_fcs_truth$gp_sigmay_truth, k = 4, q = 10, title = "Truth: Correlated Factors")

par(old_par)

Under the MLE of DGP parameters, we run the final Gibbs sampler for the posterior summary of other parameters in the model, and also of the predicted gene expression. We run multiple chains, and use the function ‘gibbs_after_mcem_diff_initials’ to generate different initial values for different chains.

gibbs_after_mcem_diff_initials_irregular_time_result<-
  gibbs_after_mcem_diff_initials(ind_x = TRUE,
                                 tot_chain = 5, 
                                 mcem_parameter_setup_result = mcem_parameter_setup_irregular_time_result,
                                 mcem_algorithm_result = mcem_algorithm_irregular_time_result)

After initialization, we can run multiple chains, and posterior samples (after burnin and thinning, as designated by the user) will be saved as csv files to the specified location. One csv file contains samples of one parameter, and each row within the csv file contains samples from one Gibbs iteration.

tot_chain<- 5

for (chain_index in 1:tot_chain){
  
  gibbs_after_mcem_algorithm(chain_index = chain_index,
                                            mc_num = 10000,
                                            burnin = 3000,
                                            thin_step = 10 ,
                                            pathname = "path",
                                            pred_indicator = TRUE,
                                            pred_time_index = (9:10),
                                            x = sim_fcs_truth$observed_x_train_irregular,
                                            gibbs_after_mcem_diff_initials_result = gibbs_after_mcem_diff_initials_irregular_time_result,
                                            mcem_algorithm_result = mcem_algorithm_irregular_time_result,
                                            mcem_parameter_setup_result =  mcem_parameter_setup_irregular_time_result)
}

To combine results from different chains for the final summary, run:

constant_list<- list(num_time_test = 2,
                     mc_num = 10000,
                     thin_step = 10,
                     burnin = 3000,
                     pathname = "path",
                     p = 100,
                     k = 4,
                     n = 17,
                     q = 8,
                     ind_x = TRUE,
                     pred_indicator = TRUE)

for (chain_index in 1:tot_chain){

  gibbs_after_mcem_load_chains_result<- gibbs_after_mcem_load_chains(chain_index = chain_index,
                                                                     gibbs_after_mcem_algorithm_result = constant_list)

  save(gibbs_after_mcem_load_chains_result,
       file = paste0("path/chain_", chain_index,"_result.RData"))
  
}

gibbs_after_mcem_combine_chains_irregular_time_result<- gibbs_after_mcem_combine_chains(tot_chain = 5,
                                                                                        gibbs_after_mcem_algorithm_result = constant_list)

For the posterior analysis, we first focus on the continuous variables that do not need to be aligned (due to the identifiability issue), including variables pred_x and phi (also including subject-gene mean and variance_g if the chosen model includes an intercept term). The function ‘numerics_summary_do_not_need_alignment’ not only assesses the convergence of these variables, but also returns the posterior result of pred_x:

 numerics_summary_do_not_need_alignment_irregular_time_result<- 
  numerics_summary_do_not_need_alignment(pred_x_truth =  sim_fcs_truth$observed_x_pred_reformat,
                                             pred_x_truth_indicator = TRUE,
                                         gibbs_after_mcem_combine_chains_result =  gibbs_after_mcem_combine_chains_irregular_time_result)

To provide an overview of the model’s performance in predicting gene expression:

pred_result_overview<- matrix(c(sim_fcs_results_irregular_6_8$numerics_summary_do_not_need_alignment_irregular_time_result$pred_x_result$mae_using_median_est,
                                sim_fcs_results_irregular_6_8$numerics_summary_do_not_need_alignment_irregular_time_result$pred_x_result$mean_width_interval,        
                                sim_fcs_results_irregular_6_8$numerics_summary_do_not_need_alignment_irregular_time_result$pred_x_result$proportion_of_within_interval_biomarkers), 
                                nrow = 3, ncol = 1)

rownames(pred_result_overview)<- c("Mean Absolute Error", "Mean Width of Intervals", "Proportion of Coverage")
colnames(pred_result_overview)<- "Value"

pred_result_overview
##                             Value
## Mean Absolute Error     0.5407061
## Mean Width of Intervals 2.6766196
## Proportion of Coverage  0.9438235

For factor loadings and factor scores that need alignment before further analysis, we implement the function ‘numerics_summary_need_alignment’.

numerics_summary_need_alignment_irregular_time_result<-
  numerics_summary_need_alignment(gibbs_after_mcem_combine_chains_result =  gibbs_after_mcem_combine_chains_irregular_time_result)

An overview of the model’s performance in estimating true factor scores can be calculated as:

q<- 8
k<- 4
n<- 17

a_train<-  sim_fcs_truth$a_full[(1:q)]

h3n2_data<- list()

list_temp <- vector("list", k)
for (list_index in 1:k){
  list_temp[[list_index]]<- a_train
}
h3n2_data$input<- list_temp

fcs_sigma_y_init_truth_for_train_data<- GPFDA::mgpCovMat(Data=h3n2_data, hp=sim_fcs_truth$gp_hp_truth)

# rescale truth
d_matrix<- diag(sqrt(diag(fcs_sigma_y_init_truth_for_train_data)))
d_matrix_inv<- solve(d_matrix)

fcs_real_y_rescaled<- array(0, dim = c(q,k,n))

for (person_index in 1:n){
  fcs_real_y_rescaled[,,person_index]<- matrix(d_matrix_inv%*%as.numeric(sim_fcs_truth$real_y[1:q,,person_index]),
                                               nrow = q,
                                               ncol = k)
}

# compare
mae_irregular_6_8<-  mean(abs(sim_fcs_results_irregular_6_8$numerics_summary_need_alignment_irregular_time_result$reordered_summary$latent_y[(1:q),,] - fcs_real_y_rescaled)) 

mae_irregular_6_8
## [1] 0.2661175