usarrests-NIW-vignette

# Import the TIP library 
library(tip)

# Import the US Arrests dataset
data(USArrests)

# Convert the data to a matrix 
X <- data.matrix(USArrests)

# Compute the distance matrix
distance_matrix <- data.matrix(dist(X))

# Compute the temperature parameter estiamte
temperature <- 1/median(distance_matrix[upper.tri(distance_matrix)])

# For each subject, compute the point estimate for the number of similar 
# subjects using  univariate multiple change point detection (i.e.)
init_num_neighbors = get_cpt_neighbors(.distance_matrix = distance_matrix)
#> Warning in BINSEG(sumstat, pen = pen.value, cost_func = costfunc, minseglen
#> = minseglen, : The number of changepoints identified is Q, it is advised to
#> increase Q to make sure changepoints have not been missed.

# Set the number of burn-in iterations in the Gibbs samlper
# RECOMENDATION: *** burn >= 1000 ***
burn <- 10

# Set the number of sampling iterations in the Gibbs sampler
# RECOMENDATION: *** samples >= 1000 ***
samples <- 10

# Extract the state names 
names_subjects <- rownames(USArrests)

# Run TIP clustering using only the prior
# --> That is, the likelihood function is constant
tip1 <- tip(.data = data.matrix(X),
            .burn = burn,
            .samples = samples,
            .similarity_matrix = exp(-1.0*temperature*distance_matrix),
            .init_num_neighbors = init_num_neighbors,
            .likelihood_model = "NIW",
            .subject_names = names_subjects,
            .num_cores = 1)
#> Bayesian Clustering: Table Invitation Prior Gibbs Sampler
#> burn-in: 10
#> samples: 10
#> Likelihood Model: NIW
#> 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |===========================================                           |  61%
  |                                                                            
  |===============================================                       |  67%
  |                                                                            
  |===================================================                   |  72%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |==========================================================            |  83%
  |                                                                            
  |==============================================================        |  89%
  |                                                                            
  |==================================================================    |  94%
  |                                                                            
  |======================================================================| 100%

# Produce plots for the Bayesian Clustering Model
tip_plots <- plot(tip1)

# View the posterior distribution of the number of clusters
tip_plots$trace_plot_posterior_number_of_clusters

# View the trace plot with respect to the posterior number of clusters
tip_plots$trace_plot_posterior_number_of_clusters

# Extract posterior cluster assignments using the Posterior Expected Adjusted Rand (PEAR) index
cluster_assignments <- mcclust::maxpear(psm = tip1@posterior_similarity_matrix)$cl

# Create a list where each element stores the cluster assignments
cluster_assignment_list <- list()
for(k in 1:length(unique(cluster_assignments))){
  cluster_assignment_list[[k]] <- names_subjects[which(cluster_assignments == k)]
}
cluster_assignment_list
#> [[1]]
#>  [1] "Alabama"     "Alaska"      "California"  "Colorado"    "Illinois"   
#>  [6] "Louisiana"   "Michigan"    "Mississippi" "Missouri"    "Nevada"     
#> [11] "New Jersey"  "Oregon"      "Texas"       "Washington" 
#> 
#> [[2]]
#>  [1] "Arizona"        "Connecticut"    "Delaware"       "Hawaii"        
#>  [5] "Maryland"       "New Mexico"     "North Carolina" "Ohio"          
#>  [9] "Pennsylvania"   "South Carolina" "Tennessee"     
#> 
#> [[3]]
#>  [1] "Arkansas"      "Florida"       "Idaho"         "Iowa"         
#>  [5] "Kentucky"      "Maine"         "Montana"       "New Hampshire"
#>  [9] "New York"      "North Dakota"  "Rhode Island"  "South Dakota" 
#> [13] "Utah"          "Vermont"       "West Virginia" "Wisconsin"    
#> [17] "Wyoming"      
#> 
#> [[4]]
#> [1] "Georgia"       "Massachusetts"
#> 
#> [[5]]
#> [1] "Indiana"  "Virginia"
#> 
#> [[6]]
#> [1] "Kansas"   "Nebraska"
#> 
#> [[7]]
#> [1] "Minnesota" "Oklahoma"

# Create the one component graph with minimum entropy
partition_list <- partition_undirected_graph(.graph_matrix = tip1@posterior_similarity_matrix,
                                             .num_components = 1,
                                             .step_size = 0.001)

# View the state names
# names_subjects

# Create a vector of true region labels to see if there is a pattern
true_region <- c("Southeast", "West", "Southwest", "Southeast", "West", "West",
                 "Northeast", "Northeast", "Southeast", "Southeast", "West", "West",
                 "Midwest", "Midwest", "Midwest", "Midwest", "Southeast", "Southeast",
                 "Northeast", "Northeast", "Northeast", "Midwest", "Midwest", "Southeast",
                 "Midwest", "West", "Midwest", "West", "Northeast", "Northeast",
                 "Southwest", "Northeast", "Southeast", "Midwest", "Midwest", "Southwest",
                 "West", "Northeast", "Northeast", "Southeast", "Midwest", "Southeast",
                 "Southwest", "West", "Northeast", "Southeast", "West", "Southeast",
                 "Midwest", "West")

names_subjects
#>  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
#>  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
#>  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
#> [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
#> [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
#> [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
#> [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
#> [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
#> [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
#> [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
#> [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
#> [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
#> [49] "Wisconsin"      "Wyoming"

# Associate class labels and colors for the plot
class_palette_colors <- c("Northeast" = "blue",
                          "Southeast" = "red",
                          "Midwest" = "purple",
                          "Southwest" = "orange",
                          "West" = "green")

# Associate class labels and shapes for the plot
class_palette_shapes <- c("Northeast" = 15,
                          "Southeast" = 16,
                          "Midwest" = 17,
                          "Southwest" = 18,
                          "West" = 19)

# Visualize the posterior similarity matrix by constructing a graph plot of 
# the one-cluster graph. The true labels are used here (below they are not).
ggnet2_network_plot(.matrix_graph = partition_list$partitioned_graph_matrix,
                .subject_names = names_subjects,
                .subject_class_names = true_region,
                .class_colors = class_palette_colors,
                .class_shapes = class_palette_shapes,
                .node_size = 2,
                .add_node_labels = TRUE)
#> Warning: Duplicated override.aes is ignored.

# Visualize the posterior similarity matrix by constructing a graph plot of 
# the one-cluster graph. The true labels are used here (below they are not).
# Remove the subject names with .add_node_labels = FALSE
ggnet2_network_plot(.matrix_graph = partition_list$partitioned_graph_matrix,
                .subject_names = names_subjects,
                .subject_class_names = true_region,
                .class_colors = class_palette_colors,
                .class_shapes = class_palette_shapes,
                .node_size = 2,
                .add_node_labels = FALSE)
#> Warning: Duplicated override.aes is ignored.