scirs2_metrics/clustering/evaluation/
mod.rs

1//! Clustering evaluation utilities
2//!
3//! This module provides utilities for evaluating clustering results, including
4//! metrics like Dunn index and elbow method for determining optimal number of clusters.
5
6use scirs2_core::ndarray::{ArrayBase, Data, Ix2};
7use scirs2_core::numeric::{Float, NumCast};
8
9use super::{calculate_distance, group_by_labels, pairwise_distances};
10use crate::error::{MetricsError, Result};
11
12/// Calculates the enhanced Dunn index for a clustering (evaluation version)
13///
14/// The Dunn index is defined as the ratio of the smallest distance between clusters
15/// to the largest intra-cluster distance (cluster diameter). Higher values indicate
16/// better clustering with dense, well-separated clusters.
17///
18/// This is an alternative implementation for the evaluation module.
19///
20/// # Arguments
21///
22/// * `x` - Array of shape (n_samples, n_features) - The data
23/// * `labels` - Array of shape (n_samples,) - Predicted labels for each sample
24///
25/// # Returns
26///
27/// * The Dunn index
28///
29/// # Examples
30///
31/// ```
32/// use scirs2_core::ndarray::{array, Array2};
33/// // This function is not re-exported at the top level
34/// use scirs2_metrics::clustering::evaluation::dunn_index_enhanced;
35///
36/// // Create a small dataset with 2 clusters
37/// let x = Array2::from_shape_vec((6, 2), vec![
38///     1.0, 2.0,
39///     1.5, 1.8,
40///     1.2, 2.2,
41///     5.0, 6.0,
42///     5.2, 5.8,
43///     5.5, 6.2,
44/// ]).unwrap();
45///
46/// let labels = array![0, 0, 0, 1, 1, 1];
47///
48/// let score = dunn_index_enhanced(&x, &labels).unwrap();
49/// assert!(score > 0.5); // High score for well-separated clusters
50/// ```
51#[allow(dead_code)]
52pub fn dunn_index_enhanced<F, S1, S2, D>(
53    x: &ArrayBase<S1, Ix2>,
54    labels: &ArrayBase<S2, D>,
55) -> Result<F>
56where
57    F: Float + NumCast + std::fmt::Debug,
58    S1: Data<Elem = F>,
59    S2: Data<Elem = usize>,
60    D: scirs2_core::ndarray::Dimension,
61{
62    // Check dimensions
63    let n_samples = x.shape()[0];
64    if labels.len() != n_samples {
65        return Err(MetricsError::InvalidInput(format!(
66            "Number of samples in x ({}) does not match number of labels ({})",
67            n_samples,
68            labels.len()
69        )));
70    }
71
72    // Group data points by cluster
73    let clusters = group_by_labels(x, labels)?;
74    let n_clusters = clusters.len();
75
76    if n_clusters <= 1 {
77        return Err(MetricsError::InvalidInput(
78            "Dunn index is only defined for more than one cluster".to_string(),
79        ));
80    }
81
82    // Calculate pairwise distances within the dataset
83    let distances = pairwise_distances::<F, S1>(x, "euclidean")?;
84
85    // Find minimum inter-cluster distance
86    let mut min_inter_distance = F::infinity();
87
88    for (i, indices_i) in clusters.iter() {
89        for (j, indices_j) in clusters.iter() {
90            if i != j {
91                // Calculate minimum distance between points in different clusters
92                for &idx_i in indices_i {
93                    for &idx_j in indices_j {
94                        min_inter_distance = min_inter_distance.min(distances[[idx_i, idx_j]]);
95                    }
96                }
97            }
98        }
99    }
100
101    // Find maximum intra-cluster distance (cluster diameter)
102    let mut max_intra_distance = F::zero();
103
104    for (_, indices) in clusters.iter() {
105        for (idx, &i) in indices.iter().enumerate() {
106            for &j in indices.iter().skip(idx + 1) {
107                max_intra_distance = max_intra_distance.max(distances[[i, j]]);
108            }
109        }
110    }
111
112    // Calculate Dunn index
113    if max_intra_distance == F::zero() {
114        // Avoid division by zero
115        Ok(F::infinity())
116    } else {
117        Ok(min_inter_distance / max_intra_distance)
118    }
119}
120
121/// Implements the elbow method to determine the optimal number of clusters
122///
123/// The elbow method computes the sum of squared distances for a range of k values
124/// and helps identify where the rate of decrease sharply changes (the "elbow").
125///
126/// # Arguments
127///
128/// * `x` - Data matrix, shape (n_samples, n_features)
129/// * `k_range` - Range of k values to evaluate
130/// * `kmeans_fn` - Function that runs k-means clustering and returns (centroids, labels, inertia)
131///
132/// # Returns
133///
134/// * Vector of inertia values for each k in k_range
135///
136/// # Examples
137///
138/// ```no_run
139/// use scirs2_core::ndarray::{Array1, Array2};
140/// use scirs2_metrics::clustering::elbow_method;
141///
142/// // Create a dataset
143/// let x = Array2::<f64>::zeros((100, 2));
144///
145/// // Define a function that runs k-means
146/// let kmeans_fn = |data: &Array2<f64>, k: usize| {
147///     // In a real example, you would call your actual k-means implementation
148///     // This is just a placeholder
149///     let inertia = k as f64; // Placeholder value
150///     inertia
151/// };
152///
153/// // Run elbow method for k from 1 to 10
154/// let inertias = elbow_method(&x, 1..=10, kmeans_fn).unwrap();
155///
156/// // Now you can plot inertias against k to find the "elbow"
157/// ```
158#[allow(dead_code)]
159pub fn elbow_method<F, S>(
160    x: &ArrayBase<S, Ix2>,
161    k_range: std::ops::RangeInclusive<usize>,
162    kmeans_fn: impl Fn(&ArrayBase<S, Ix2>, usize) -> F,
163) -> Result<Vec<F>>
164where
165    F: Float + NumCast + std::fmt::Debug,
166    S: Data<Elem = F>,
167{
168    // Check if k_range is valid
169    let start = *k_range.start();
170    let end = *k_range.end();
171
172    if start < 1 {
173        return Err(MetricsError::InvalidInput(
174            "k_range must start at 1 or greater".to_string(),
175        ));
176    }
177
178    if end < start {
179        return Err(MetricsError::InvalidInput(
180            "k_range end must be greater than or equal to start".to_string(),
181        ));
182    }
183
184    // Check data dimensions
185    let (n_samples, n_features) = x.dim();
186    if n_samples == 0 || n_features == 0 {
187        return Err(MetricsError::InvalidInput(
188            "Input data is empty".to_string(),
189        ));
190    }
191
192    // Run k-means for each value of k and collect inertias
193    let mut inertias = Vec::with_capacity(end - start + 1);
194    for k in k_range {
195        let inertia = kmeans_fn(x, k);
196        inertias.push(inertia);
197    }
198
199    Ok(inertias)
200}
201
202#[cfg(test)]
203mod tests {
204    use super::*;
205    use crate::clustering::dunn_index;
206    use scirs2_core::ndarray::{array, Array2};
207
208    #[test]
209    fn test_dunn_index() {
210        // Create a dataset with 2 well-separated clusters
211        let x = Array2::from_shape_vec(
212            (6, 2),
213            vec![1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 5.0, 6.0, 5.2, 5.8, 5.5, 6.2],
214        )
215        .unwrap();
216
217        let labels = array![0, 0, 0, 1, 1, 1];
218
219        let dunn = dunn_index(&x, &labels).unwrap();
220        // The clusters are well separated, so we expect a high Dunn index
221        assert!(dunn > 0.5);
222
223        // Create a dataset with overlapping clusters
224        let x_overlap = Array2::from_shape_vec(
225            (6, 2),
226            vec![
227                1.0, 2.0, 1.5, 1.8, 3.2, 3.2, // Overlap point
228                3.0, 3.0, // Overlap point
229                5.2, 5.8, 5.5, 6.2,
230            ],
231        )
232        .unwrap();
233
234        let labels_overlap = array![0, 0, 0, 1, 1, 1];
235
236        let dunn_overlap = dunn_index(&x_overlap, &labels_overlap).unwrap();
237        // Clusters are less separated, so we expect a lower Dunn index
238        assert!(dunn_overlap < dunn);
239    }
240
241    #[test]
242    fn test_elbow_method() {
243        // Create a small dataset
244        let x = Array2::<f64>::zeros((10, 2));
245
246        // Simple mock k-means function that returns decreasing inertia values
247        let kmeans_mock = |_: &Array2<f64>, k: usize| {
248            // Simulate decreasing inertia with increasing k
249            // with an "elbow" around k=3
250            let base = 100.0;
251            match k {
252                1 => base,
253                2 => base / 2.0,
254                3 => base / 3.0, // Elbow point
255                4 => base / 3.2, // Small decrease after elbow
256                5 => base / 3.4,
257                _ => base / (3.5 + (k as f64 - 5.0) * 0.1),
258            }
259        };
260
261        let inertias = elbow_method(&x, 1..=6, kmeans_mock).unwrap();
262
263        // Check expected properties
264        assert_eq!(inertias.len(), 6);
265        for i in 1..inertias.len() {
266            // Inertia should always decrease
267            assert!(inertias[i] < inertias[i - 1]);
268        }
269
270        // Check for elbow - the largest drop should be between k=1 and k=2,
271        // with a smaller drop between k=2 and k=3, and minimal changes after
272        let drop_1_to_2 = inertias[0] - inertias[1];
273        let drop_2_to_3 = inertias[1] - inertias[2];
274        let drop_3_to_4 = inertias[2] - inertias[3];
275
276        assert!(drop_1_to_2 > drop_2_to_3);
277        assert!(drop_2_to_3 > drop_3_to_4);
278    }
279}