scirs2_metrics/clustering/evaluation/mod.rs
1//! Clustering evaluation utilities
2//!
3//! This module provides utilities for evaluating clustering results, including
4//! metrics like Dunn index and elbow method for determining optimal number of clusters.
5
6use scirs2_core::ndarray::{ArrayBase, Data, Ix2};
7use scirs2_core::numeric::{Float, NumCast};
8
9use super::{calculate_distance, group_by_labels, pairwise_distances};
10use crate::error::{MetricsError, Result};
11
12/// Calculates the enhanced Dunn index for a clustering (evaluation version)
13///
14/// The Dunn index is defined as the ratio of the smallest distance between clusters
15/// to the largest intra-cluster distance (cluster diameter). Higher values indicate
16/// better clustering with dense, well-separated clusters.
17///
18/// This is an alternative implementation for the evaluation module.
19///
20/// # Arguments
21///
22/// * `x` - Array of shape (n_samples, n_features) - The data
23/// * `labels` - Array of shape (n_samples,) - Predicted labels for each sample
24///
25/// # Returns
26///
27/// * The Dunn index
28///
29/// # Examples
30///
31/// ```
32/// use scirs2_core::ndarray::{array, Array2};
33/// // This function is not re-exported at the top level
34/// use scirs2_metrics::clustering::evaluation::dunn_index_enhanced;
35///
36/// // Create a small dataset with 2 clusters
37/// let x = Array2::from_shape_vec((6, 2), vec![
38/// 1.0, 2.0,
39/// 1.5, 1.8,
40/// 1.2, 2.2,
41/// 5.0, 6.0,
42/// 5.2, 5.8,
43/// 5.5, 6.2,
44/// ]).unwrap();
45///
46/// let labels = array![0, 0, 0, 1, 1, 1];
47///
48/// let score = dunn_index_enhanced(&x, &labels).unwrap();
49/// assert!(score > 0.5); // High score for well-separated clusters
50/// ```
51#[allow(dead_code)]
52pub fn dunn_index_enhanced<F, S1, S2, D>(
53 x: &ArrayBase<S1, Ix2>,
54 labels: &ArrayBase<S2, D>,
55) -> Result<F>
56where
57 F: Float + NumCast + std::fmt::Debug,
58 S1: Data<Elem = F>,
59 S2: Data<Elem = usize>,
60 D: scirs2_core::ndarray::Dimension,
61{
62 // Check dimensions
63 let n_samples = x.shape()[0];
64 if labels.len() != n_samples {
65 return Err(MetricsError::InvalidInput(format!(
66 "Number of samples in x ({}) does not match number of labels ({})",
67 n_samples,
68 labels.len()
69 )));
70 }
71
72 // Group data points by cluster
73 let clusters = group_by_labels(x, labels)?;
74 let n_clusters = clusters.len();
75
76 if n_clusters <= 1 {
77 return Err(MetricsError::InvalidInput(
78 "Dunn index is only defined for more than one cluster".to_string(),
79 ));
80 }
81
82 // Calculate pairwise distances within the dataset
83 let distances = pairwise_distances::<F, S1>(x, "euclidean")?;
84
85 // Find minimum inter-cluster distance
86 let mut min_inter_distance = F::infinity();
87
88 for (i, indices_i) in clusters.iter() {
89 for (j, indices_j) in clusters.iter() {
90 if i != j {
91 // Calculate minimum distance between points in different clusters
92 for &idx_i in indices_i {
93 for &idx_j in indices_j {
94 min_inter_distance = min_inter_distance.min(distances[[idx_i, idx_j]]);
95 }
96 }
97 }
98 }
99 }
100
101 // Find maximum intra-cluster distance (cluster diameter)
102 let mut max_intra_distance = F::zero();
103
104 for (_, indices) in clusters.iter() {
105 for (idx, &i) in indices.iter().enumerate() {
106 for &j in indices.iter().skip(idx + 1) {
107 max_intra_distance = max_intra_distance.max(distances[[i, j]]);
108 }
109 }
110 }
111
112 // Calculate Dunn index
113 if max_intra_distance == F::zero() {
114 // Avoid division by zero
115 Ok(F::infinity())
116 } else {
117 Ok(min_inter_distance / max_intra_distance)
118 }
119}
120
121/// Implements the elbow method to determine the optimal number of clusters
122///
123/// The elbow method computes the sum of squared distances for a range of k values
124/// and helps identify where the rate of decrease sharply changes (the "elbow").
125///
126/// # Arguments
127///
128/// * `x` - Data matrix, shape (n_samples, n_features)
129/// * `k_range` - Range of k values to evaluate
130/// * `kmeans_fn` - Function that runs k-means clustering and returns (centroids, labels, inertia)
131///
132/// # Returns
133///
134/// * Vector of inertia values for each k in k_range
135///
136/// # Examples
137///
138/// ```no_run
139/// use scirs2_core::ndarray::{Array1, Array2};
140/// use scirs2_metrics::clustering::elbow_method;
141///
142/// // Create a dataset
143/// let x = Array2::<f64>::zeros((100, 2));
144///
145/// // Define a function that runs k-means
146/// let kmeans_fn = |data: &Array2<f64>, k: usize| {
147/// // In a real example, you would call your actual k-means implementation
148/// // This is just a placeholder
149/// let inertia = k as f64; // Placeholder value
150/// inertia
151/// };
152///
153/// // Run elbow method for k from 1 to 10
154/// let inertias = elbow_method(&x, 1..=10, kmeans_fn).unwrap();
155///
156/// // Now you can plot inertias against k to find the "elbow"
157/// ```
158#[allow(dead_code)]
159pub fn elbow_method<F, S>(
160 x: &ArrayBase<S, Ix2>,
161 k_range: std::ops::RangeInclusive<usize>,
162 kmeans_fn: impl Fn(&ArrayBase<S, Ix2>, usize) -> F,
163) -> Result<Vec<F>>
164where
165 F: Float + NumCast + std::fmt::Debug,
166 S: Data<Elem = F>,
167{
168 // Check if k_range is valid
169 let start = *k_range.start();
170 let end = *k_range.end();
171
172 if start < 1 {
173 return Err(MetricsError::InvalidInput(
174 "k_range must start at 1 or greater".to_string(),
175 ));
176 }
177
178 if end < start {
179 return Err(MetricsError::InvalidInput(
180 "k_range end must be greater than or equal to start".to_string(),
181 ));
182 }
183
184 // Check data dimensions
185 let (n_samples, n_features) = x.dim();
186 if n_samples == 0 || n_features == 0 {
187 return Err(MetricsError::InvalidInput(
188 "Input data is empty".to_string(),
189 ));
190 }
191
192 // Run k-means for each value of k and collect inertias
193 let mut inertias = Vec::with_capacity(end - start + 1);
194 for k in k_range {
195 let inertia = kmeans_fn(x, k);
196 inertias.push(inertia);
197 }
198
199 Ok(inertias)
200}
201
202#[cfg(test)]
203mod tests {
204 use super::*;
205 use crate::clustering::dunn_index;
206 use scirs2_core::ndarray::{array, Array2};
207
208 #[test]
209 fn test_dunn_index() {
210 // Create a dataset with 2 well-separated clusters
211 let x = Array2::from_shape_vec(
212 (6, 2),
213 vec![1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 5.0, 6.0, 5.2, 5.8, 5.5, 6.2],
214 )
215 .unwrap();
216
217 let labels = array![0, 0, 0, 1, 1, 1];
218
219 let dunn = dunn_index(&x, &labels).unwrap();
220 // The clusters are well separated, so we expect a high Dunn index
221 assert!(dunn > 0.5);
222
223 // Create a dataset with overlapping clusters
224 let x_overlap = Array2::from_shape_vec(
225 (6, 2),
226 vec![
227 1.0, 2.0, 1.5, 1.8, 3.2, 3.2, // Overlap point
228 3.0, 3.0, // Overlap point
229 5.2, 5.8, 5.5, 6.2,
230 ],
231 )
232 .unwrap();
233
234 let labels_overlap = array![0, 0, 0, 1, 1, 1];
235
236 let dunn_overlap = dunn_index(&x_overlap, &labels_overlap).unwrap();
237 // Clusters are less separated, so we expect a lower Dunn index
238 assert!(dunn_overlap < dunn);
239 }
240
241 #[test]
242 fn test_elbow_method() {
243 // Create a small dataset
244 let x = Array2::<f64>::zeros((10, 2));
245
246 // Simple mock k-means function that returns decreasing inertia values
247 let kmeans_mock = |_: &Array2<f64>, k: usize| {
248 // Simulate decreasing inertia with increasing k
249 // with an "elbow" around k=3
250 let base = 100.0;
251 match k {
252 1 => base,
253 2 => base / 2.0,
254 3 => base / 3.0, // Elbow point
255 4 => base / 3.2, // Small decrease after elbow
256 5 => base / 3.4,
257 _ => base / (3.5 + (k as f64 - 5.0) * 0.1),
258 }
259 };
260
261 let inertias = elbow_method(&x, 1..=6, kmeans_mock).unwrap();
262
263 // Check expected properties
264 assert_eq!(inertias.len(), 6);
265 for i in 1..inertias.len() {
266 // Inertia should always decrease
267 assert!(inertias[i] < inertias[i - 1]);
268 }
269
270 // Check for elbow - the largest drop should be between k=1 and k=2,
271 // with a smaller drop between k=2 and k=3, and minimal changes after
272 let drop_1_to_2 = inertias[0] - inertias[1];
273 let drop_2_to_3 = inertias[1] - inertias[2];
274 let drop_3_to_4 = inertias[2] - inertias[3];
275
276 assert!(drop_1_to_2 > drop_2_to_3);
277 assert!(drop_2_to_3 > drop_3_to_4);
278 }
279}