scirs2_metrics/clustering/
mod.rs

1//! Clustering metrics module
2//!
3//! This module provides functions for evaluating clustering algorithms, including
4//! silhouette score, Davies-Bouldin index, Calinski-Harabasz index, and Adjusted Rand index.
5//!
6//! ## Internal Metrics
7//!
8//! Internal metrics assess clustering without external ground truth:
9//! - Silhouette score
10//! - Davies-Bouldin index
11//! - Calinski-Harabasz index
12//! - Dunn index
13//! - Inter-cluster and intra-cluster distance metrics
14//!
15//! ## External Metrics
16//!
17//! External metrics assess clustering compared to ground truth:
18//! - Adjusted Rand index
19//! - Normalized Mutual Information
20//! - Adjusted Mutual Information
21//! - Homogeneity, Completeness, V-measure
22//! - Fowlkes-Mallows score
23
24pub mod evaluation;
25pub mod external_metrics;
26mod internal_metrics;
27
28// Existing submodules
29pub mod density;
30pub mod distance;
31pub mod validation;
32
33// Re-export all public items from submodules - be specific with the evaluation module
34pub use self::density::*;
35pub use self::distance::*;
36pub use self::evaluation::{dunn_index_enhanced, elbow_method};
37pub use self::external_metrics::*;
38pub use self::internal_metrics::*;
39pub use self::validation::*;
40
41// Common utility functions that might be used across multiple submodules
42use scirs2_core::ndarray::{Array1, Array2, ArrayBase, Data, Dimension, Ix2};
43use scirs2_core::numeric::{Float, NumCast};
44use std::collections::HashMap;
45
46use crate::error::{MetricsError, Result};
47
48/// Calculate distance between two points
49///
50/// # Arguments
51///
52/// * `a` - First point
53/// * `b` - Second point
54/// * `metric` - Distance metric name ("euclidean", "manhattan", "cosine", etc.)
55///
56/// # Returns
57///
58/// * The distance between the points
59pub(crate) fn calculate_distance<F>(a: &[F], b: &[F], metric: &str) -> Result<F>
60where
61    F: Float + NumCast + std::fmt::Debug,
62{
63    if a.len() != b.len() {
64        return Err(MetricsError::InvalidInput(format!(
65            "Points must have the same dimension, got {} and {}",
66            a.len(),
67            b.len()
68        )));
69    }
70
71    match metric.to_lowercase().as_str() {
72        "euclidean" => {
73            let mut sum = F::zero();
74            for (ai, bi) in a.iter().zip(b.iter()) {
75                let diff = *ai - *bi;
76                sum = sum + diff * diff;
77            }
78            Ok(sum.sqrt())
79        }
80        "manhattan" => {
81            let mut sum = F::zero();
82            for (ai, bi) in a.iter().zip(b.iter()) {
83                sum = sum + (*ai - *bi).abs();
84            }
85            Ok(sum)
86        }
87        "cosine" => {
88            let mut dot = F::zero();
89            let mut norm_a = F::zero();
90            let mut norm_b = F::zero();
91
92            for (ai, bi) in a.iter().zip(b.iter()) {
93                dot = dot + *ai * *bi;
94                norm_a = norm_a + *ai * *ai;
95                norm_b = norm_b + *bi * *bi;
96            }
97
98            if norm_a < F::epsilon() || norm_b < F::epsilon() {
99                return Err(MetricsError::InvalidInput(
100                    "Cannot compute cosine distance for zero vectors".to_string(),
101                ));
102            }
103
104            let cosine_similarity = dot / (norm_a.sqrt() * norm_b.sqrt());
105
106            // Cosine distance is 1 - cosine similarity
107            Ok(F::one() - cosine_similarity)
108        }
109        _ => Err(MetricsError::InvalidInput(format!(
110            "Unknown distance metric: {metric}"
111        ))),
112    }
113}
114
115/// Compute pairwise distances between all points in a matrix
116///
117/// # Arguments
118///
119/// * `x` - Data matrix, shape (n_samples, n_features)
120/// * `metric` - Distance metric name
121///
122/// # Returns
123///
124/// * Distance matrix, shape (n_samples, n_samples)
125pub(crate) fn pairwise_distances<F, S>(x: &ArrayBase<S, Ix2>, metric: &str) -> Result<Array2<F>>
126where
127    F: Float + NumCast + std::fmt::Debug,
128    S: Data<Elem = F>,
129{
130    let (n_samples, _n_features) = x.dim();
131    let mut distances = Array2::zeros((n_samples, n_samples));
132
133    for i in 0..n_samples {
134        for j in i..n_samples {
135            let row_i = x.row(i).to_vec();
136            let row_j = x.row(j).to_vec();
137
138            let dist = calculate_distance(&row_i, &row_j, metric)?;
139
140            distances[[i, j]] = dist;
141            if i != j {
142                distances[[j, i]] = dist; // Distance matrix is symmetric
143            }
144        }
145    }
146
147    Ok(distances)
148}
149
150/// Group data points by cluster labels
151///
152/// # Arguments
153///
154/// * `x` - Data matrix, shape (n_samples, n_features)
155/// * `labels` - Cluster labels, shape (n_samples,)
156///
157/// # Returns
158///
159/// * HashMap mapping cluster labels to indices of points in that cluster
160pub(crate) fn group_by_labels<F, T, S1, S2, D>(
161    x: &ArrayBase<S1, Ix2>,
162    labels: &ArrayBase<S2, D>,
163) -> Result<HashMap<T, Vec<usize>>>
164where
165    F: Float + NumCast + std::fmt::Debug,
166    T: std::hash::Hash + std::cmp::Eq + Copy,
167    S1: Data<Elem = F>,
168    S2: Data<Elem = T>,
169    D: Dimension,
170{
171    let n_samples = x.shape()[0];
172
173    if labels.len() != n_samples {
174        return Err(MetricsError::InvalidInput(format!(
175            "Number of labels ({}) does not match number of samples ({})",
176            labels.len(),
177            n_samples
178        )));
179    }
180
181    let mut clusters: HashMap<T, Vec<usize>> = HashMap::new();
182
183    for (i, &label) in labels.iter().enumerate() {
184        clusters.entry(label).or_default().push(i);
185    }
186
187    Ok(clusters)
188}