# Import the TIP library
library(tip)
# Import the US Arrests dataset
data(USArrests)
# Convert the data to a matrix
<- data.matrix(USArrests)
X
# Compute the distance matrix
<- data.matrix(dist(X))
distance_matrix
# Compute the temperature parameter estiamte
<- 1/median(distance_matrix[upper.tri(distance_matrix)])
temperature
# For each subject, compute the point estimate for the number of similar
# subjects using univariate multiple change point detection (i.e.)
= get_cpt_neighbors(.distance_matrix = distance_matrix)
init_num_neighbors #> Warning in BINSEG(sumstat, pen = pen.value, cost_func = costfunc, minseglen
#> = minseglen, : The number of changepoints identified is Q, it is advised to
#> increase Q to make sure changepoints have not been missed.
# Set the number of burn-in iterations in the Gibbs samlper
# RECOMENDATION: *** burn >= 1000 ***
<- 10
burn
# Set the number of sampling iterations in the Gibbs sampler
# RECOMENDATION: *** samples >= 1000 ***
<- 10
samples
# Extract the state names
<- rownames(USArrests)
names_subjects
# Run TIP clustering using only the prior
# --> That is, the likelihood function is constant
<- tip(.data = data.matrix(X),
tip1 .burn = burn,
.samples = samples,
.similarity_matrix = exp(-1.0*temperature*distance_matrix),
.init_num_neighbors = init_num_neighbors,
.likelihood_model = "NIW",
.subject_names = names_subjects,
.num_cores = 1)
#> Bayesian Clustering: Table Invitation Prior Gibbs Sampler
#> burn-in: 10
#> samples: 10
#> Likelihood Model: NIW
#>
|
| | 0%
|
|==== | 6%
|
|======== | 11%
|
|============ | 17%
|
|================ | 22%
|
|=================== | 28%
|
|======================= | 33%
|
|=========================== | 39%
|
|=============================== | 44%
|
|=================================== | 50%
|
|======================================= | 56%
|
|=========================================== | 61%
|
|=============================================== | 67%
|
|=================================================== | 72%
|
|====================================================== | 78%
|
|========================================================== | 83%
|
|============================================================== | 89%
|
|================================================================== | 94%
|
|======================================================================| 100%
# Produce plots for the Bayesian Clustering Model
<- plot(tip1) tip_plots
# View the posterior distribution of the number of clusters
$trace_plot_posterior_number_of_clusters tip_plots
# View the trace plot with respect to the posterior number of clusters
$trace_plot_posterior_number_of_clusters tip_plots
# Extract posterior cluster assignments using the Posterior Expected Adjusted Rand (PEAR) index
<- mcclust::maxpear(psm = tip1@posterior_similarity_matrix)$cl
cluster_assignments
# Create a list where each element stores the cluster assignments
<- list()
cluster_assignment_list for(k in 1:length(unique(cluster_assignments))){
<- names_subjects[which(cluster_assignments == k)]
cluster_assignment_list[[k]]
}
cluster_assignment_list#> [[1]]
#> [1] "Alabama" "Alaska" "California" "Colorado" "Illinois"
#> [6] "Louisiana" "Michigan" "Mississippi" "Missouri" "Nevada"
#> [11] "New Jersey" "Oregon" "Texas" "Washington"
#>
#> [[2]]
#> [1] "Arizona" "Connecticut" "Delaware" "Hawaii"
#> [5] "Maryland" "New Mexico" "North Carolina" "Ohio"
#> [9] "Pennsylvania" "South Carolina" "Tennessee"
#>
#> [[3]]
#> [1] "Arkansas" "Florida" "Idaho" "Iowa"
#> [5] "Kentucky" "Maine" "Montana" "New Hampshire"
#> [9] "New York" "North Dakota" "Rhode Island" "South Dakota"
#> [13] "Utah" "Vermont" "West Virginia" "Wisconsin"
#> [17] "Wyoming"
#>
#> [[4]]
#> [1] "Georgia" "Massachusetts"
#>
#> [[5]]
#> [1] "Indiana" "Virginia"
#>
#> [[6]]
#> [1] "Kansas" "Nebraska"
#>
#> [[7]]
#> [1] "Minnesota" "Oklahoma"
# Create the one component graph with minimum entropy
<- partition_undirected_graph(.graph_matrix = tip1@posterior_similarity_matrix,
partition_list .num_components = 1,
.step_size = 0.001)
# View the state names
# names_subjects
# Create a vector of true region labels to see if there is a pattern
<- c("Southeast", "West", "Southwest", "Southeast", "West", "West",
true_region "Northeast", "Northeast", "Southeast", "Southeast", "West", "West",
"Midwest", "Midwest", "Midwest", "Midwest", "Southeast", "Southeast",
"Northeast", "Northeast", "Northeast", "Midwest", "Midwest", "Southeast",
"Midwest", "West", "Midwest", "West", "Northeast", "Northeast",
"Southwest", "Northeast", "Southeast", "Midwest", "Midwest", "Southwest",
"West", "Northeast", "Northeast", "Southeast", "Midwest", "Southeast",
"Southwest", "West", "Northeast", "Southeast", "West", "Southeast",
"Midwest", "West")
names_subjects#> [1] "Alabama" "Alaska" "Arizona" "Arkansas"
#> [5] "California" "Colorado" "Connecticut" "Delaware"
#> [9] "Florida" "Georgia" "Hawaii" "Idaho"
#> [13] "Illinois" "Indiana" "Iowa" "Kansas"
#> [17] "Kentucky" "Louisiana" "Maine" "Maryland"
#> [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
#> [25] "Missouri" "Montana" "Nebraska" "Nevada"
#> [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
#> [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
#> [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
#> [41] "South Dakota" "Tennessee" "Texas" "Utah"
#> [45] "Vermont" "Virginia" "Washington" "West Virginia"
#> [49] "Wisconsin" "Wyoming"
# Associate class labels and colors for the plot
<- c("Northeast" = "blue",
class_palette_colors "Southeast" = "red",
"Midwest" = "purple",
"Southwest" = "orange",
"West" = "green")
# Associate class labels and shapes for the plot
<- c("Northeast" = 15,
class_palette_shapes "Southeast" = 16,
"Midwest" = 17,
"Southwest" = 18,
"West" = 19)
# Visualize the posterior similarity matrix by constructing a graph plot of
# the one-cluster graph. The true labels are used here (below they are not).
ggnet2_network_plot(.matrix_graph = partition_list$partitioned_graph_matrix,
.subject_names = names_subjects,
.subject_class_names = true_region,
.class_colors = class_palette_colors,
.class_shapes = class_palette_shapes,
.node_size = 2,
.add_node_labels = TRUE)
#> Warning: Duplicated override.aes is ignored.
# Visualize the posterior similarity matrix by constructing a graph plot of
# the one-cluster graph. The true labels are used here (below they are not).
# Remove the subject names with .add_node_labels = FALSE
ggnet2_network_plot(.matrix_graph = partition_list$partitioned_graph_matrix,
.subject_names = names_subjects,
.subject_class_names = true_region,
.class_colors = class_palette_colors,
.class_shapes = class_palette_shapes,
.node_size = 2,
.add_node_labels = FALSE)
#> Warning: Duplicated override.aes is ignored.
# Construct a network plot without class labels
# Note: Subject labels may be suppressed using .add_node_labels = FALSE.
ggnet2_network_plot(.matrix_graph = partition_list$partitioned_graph_matrix,
.subject_names = names_subjects,
.node_size = 2,
.add_node_labels = TRUE)