Load the package and generate a dataset.
library(DPBBM)
## Warning: replacing previous import by 'splines::splineDesign' when loading
## 'VGAM'
set.seed(123455)
S <- 4
G <- 100
K <- 3
nb_mu <- 100
nb_size <- 0.8
prob <- c(1,1,1)
mat <- bbm_data_generate(S=S,G=G,K=K,prob=prob,alpha_band=c(2,6),beta_band=c(2,6),
nb_mu=nb_mu,nb_size=nb_size, plotf = TRUE, max_cor=0.5)
## [1] "after tried 42 times"
## [1] "sample data generated with correlation smaller than: 0.5"
check the generated data. The color on the left shows the true clustering IDs of the site.
id <- order(mat$gamma);
c <- mat$gamma[id]
mat_ratio <- (mat$k+1)/(mat$n+1);
heatmap(mat_ratio[id,], Rowv = NA, Colv = NA, scale="none", RowSideColors=as.character(c),
xlab = "4 samples", ylab="100 RNA methylation sites")
Run the DPBBM result. This step takes a really long time.
cluster_label <- dpbbm_mc_iterations(mat$k, mat$n)
## [1] "Gibbs sampling started. It will take a long time."
## [1] "Shown only the clustering information in the first 20 iterations."
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [1] 2 1 1 1 13 1 1 2 4 11 2 1 1 1 4 2 1 1 1 1 1 1 3
## [24] 6 1 4 1 3 1 1 1 1 2 1 1 1 15 1 1 4
## [1] 3 1 2 1 28 1 2 2 1 12 1 1 2 1 1 1 6 2 29 1 3 1 1
## [1] 4 1 4 31 33 4 6 2 4 9 1 1 3 1
## [1] 5 1 9 31 37 4 2 2 1 2 9 2
## [1] 6 14 4 31 42 7 1 1
## [1] 7 16 1 30 30 17 1 2 3
## [1] 8 13 1 30 26 27 2 1
## [1] 9 14 1 31 34 20
## [1] 10 23 1 30 25 15 4 2
## [1] 11 12 3 32 24 25 1 1 1 1
## [1] 12 9 1 30 26 30 2 1 1
## [1] 13 4 2 31 12 34 1 12 2 2
## [1] 14 2 1 30 10 40 3 11 2 1
## [1] 15 1 1 30 9 35 1 22 1
## [1] 16 20 1 32 1 45 1
## [1] 17 21 1 32 1 45
## [1] 18 20 1 31 1 46 1
## [1] 19 17 3 30 1 47 2
## [1] 20 25 1 32 1 41
Show the cluster sizes.
table(cluster_label)
## cluster_label
## 1 2 3
## 32 58 10
table(mat$gamma)
##
## 1 2 3
## 35 34 31
Compare the clustering result with the true clustering IDs.
id <- order(mat$gamma);
c <- cluster_label;
mat_ratio <- (mat$k+1)/(mat$n+1);
heatmap(mat_ratio[id,], Rowv = NA, Colv = NA, scale="none",
RowSideColors = as.character(cluster_label[id]),
xlab = "4 samples", ylab="100 RNA methylation sites")
As is shown, clustering results are consistent for most of the sites, but there exist a few misclassied sites as well.