scirs2_metrics/clustering/
density.rs

1//! Density-based metrics for clustering evaluation
2//!
3//! This module contains metrics for evaluating clustering results based on
4//! density characteristics of the clusters. These metrics help assess how
5//! well clusters represent dense regions in the data space.
6
7use scirs2_core::ndarray::{ArrayBase, Data, Dimension, Ix1, Ix2};
8use scirs2_core::numeric::{Float, NumCast};
9use std::collections::HashMap;
10use std::ops::{AddAssign, DivAssign};
11
12use crate::error::{MetricsError, Result};
13
14/// Calculate the Local Density Factor (LDF) for all clusters
15///
16/// The Local Density Factor measures how dense clusters are compared to their
17/// surrounding space. Higher values indicate better clustering of dense regions.
18///
19/// # Arguments
20///
21/// * `x` - Array of shape (n_samples, n_features) - The data
22/// * `labels` - Array of shape (n_samples,) - Predicted labels for each sample
23/// * `k` - Number of neighbors to consider for density calculation (default: 5)
24///
25/// # Returns
26///
27/// * HashMap mapping each cluster label to its local density factor
28///
29/// # Examples
30///
31/// ```
32/// use scirs2_core::ndarray::{array, Array2};
33/// use scirs2_metrics::clustering::density::local_density_factor;
34///
35/// // Create a simple dataset with 2 clusters
36/// let x = Array2::from_shape_vec((6, 2), vec![
37///     1.0, 2.0, 1.5, 1.8, 1.2, 2.2,
38///     5.0, 6.0, 5.2, 5.8, 5.5, 6.2,
39/// ]).unwrap();
40///
41/// let labels = array![0, 0, 0, 1, 1, 1];
42///
43/// let factors = local_density_factor(&x, &labels, None).unwrap();
44/// ```
45#[allow(dead_code)]
46pub fn local_density_factor<F, S1, S2, D>(
47    x: &ArrayBase<S1, Ix2>,
48    labels: &ArrayBase<S2, D>,
49    k: Option<usize>,
50) -> Result<HashMap<usize, F>>
51where
52    F: Float
53        + NumCast
54        + std::fmt::Debug
55        + scirs2_core::ndarray::ScalarOperand
56        + AddAssign
57        + DivAssign,
58    S1: Data<Elem = F>,
59    S2: Data<Elem = usize>,
60    D: Dimension,
61{
62    // Check that x and labels have the same number of samples
63    let n_samples = x.shape()[0];
64    if n_samples != labels.len() {
65        return Err(MetricsError::InvalidInput(format!(
66            "x has {} samples, but labels has {} samples",
67            n_samples,
68            labels.len()
69        )));
70    }
71
72    // Default k to a safe value based on n_samples
73    let k = k.unwrap_or_else(|| {
74        if n_samples <= 1 {
75            1
76        } else if n_samples < 10 {
77            std::cmp::min(2, n_samples - 1)
78        } else {
79            std::cmp::min(5, n_samples / 10)
80        }
81    });
82
83    // Get unique cluster labels
84    let mut unique_labels = Vec::new();
85    for &label in labels.iter() {
86        if !unique_labels.contains(&label) {
87            unique_labels.push(label);
88        }
89    }
90
91    // Sort labels for consistent results
92    unique_labels.sort();
93
94    // For each sample, compute the k-nearest neighbors distance (density estimate)
95    let mut all_knn_distances = Vec::new();
96    let mut cluster_idx = HashMap::new();
97
98    for label in &unique_labels {
99        cluster_idx.insert(*label, Vec::new());
100    }
101
102    // Collect indices by cluster
103    for (i, &label) in labels.iter().enumerate() {
104        if let Some(indices) = cluster_idx.get_mut(&label) {
105            indices.push(i);
106        }
107    }
108
109    // Calculate k-distance for each point
110    for i in 0..n_samples {
111        let current_point = x.row(i);
112
113        // Calculate distances to all other points
114        let mut distances = Vec::new();
115        for j in 0..n_samples {
116            if i != j {
117                let other_point = x.row(j);
118                let dist = calculate_euclidean_distance(&current_point, &other_point);
119                distances.push(dist);
120            }
121        }
122
123        // Sort distances and take the k-th nearest
124        distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
125        let k_distance = if distances.len() >= k {
126            distances[k - 1] // k-th nearest neighbor distance
127        } else if !distances.is_empty() {
128            distances[distances.len() - 1] // Use farthest if k is too large
129        } else {
130            F::zero() // Fallback if no distances
131        };
132
133        all_knn_distances.push((i, k_distance));
134    }
135
136    // Calculate average k-distance per cluster
137    let mut ldf = HashMap::new();
138
139    for &label in &unique_labels {
140        let cluster_indices = cluster_idx.get(&label).unwrap();
141        if cluster_indices.is_empty() {
142            continue;
143        }
144
145        // Average k-distance within the cluster
146        let mut cluster_knn_sum = F::zero();
147        let mut count = 0;
148
149        for &idx in cluster_indices {
150            cluster_knn_sum += all_knn_distances[idx].1;
151            count += 1;
152        }
153
154        let avg_knn = if count > 0 {
155            cluster_knn_sum / F::from(count).unwrap()
156        } else {
157            F::zero()
158        };
159
160        // Calculate LDF as the inverse of the average k-distance
161        // (smaller distances = higher density)
162        let factor = if avg_knn > F::zero() {
163            F::one() / avg_knn
164        } else {
165            F::zero()
166        };
167
168        ldf.insert(label, factor);
169    }
170
171    Ok(ldf)
172}
173
174/// Calculate the Relative Density Index (RDI) for clustering evaluation
175///
176/// The Relative Density Index measures the ratio of intra-cluster density
177/// to inter-cluster density. Higher values indicate better separation
178/// between clusters in terms of density.
179///
180/// # Arguments
181///
182/// * `x` - Array of shape (n_samples, n_features) - The data
183/// * `labels` - Array of shape (n_samples,) - Predicted labels for each sample
184/// * `k` - Number of neighbors to consider for density calculation (default: 5)
185///
186/// # Returns
187///
188/// * Relative Density Index value
189///
190/// # Examples
191///
192/// ```
193/// use scirs2_core::ndarray::{array, Array2};
194/// use scirs2_metrics::clustering::density::relative_density_index;
195///
196/// // Create a simple dataset with 2 clusters
197/// let x = Array2::from_shape_vec((6, 2), vec![
198///     1.0, 2.0, 1.5, 1.8, 1.2, 2.2,
199///     5.0, 6.0, 5.2, 5.8, 5.5, 6.2,
200/// ]).unwrap();
201///
202/// let labels = array![0, 0, 0, 1, 1, 1];
203///
204/// let rdi = relative_density_index(&x, &labels, None).unwrap();
205/// ```
206#[allow(dead_code)]
207pub fn relative_density_index<F, S1, S2, D>(
208    x: &ArrayBase<S1, Ix2>,
209    labels: &ArrayBase<S2, D>,
210    k: Option<usize>,
211) -> Result<F>
212where
213    F: Float
214        + NumCast
215        + std::fmt::Debug
216        + scirs2_core::ndarray::ScalarOperand
217        + AddAssign
218        + DivAssign,
219    S1: Data<Elem = F>,
220    S2: Data<Elem = usize>,
221    D: Dimension,
222{
223    // Check that x and labels have the same number of samples
224    let n_samples = x.shape()[0];
225    if n_samples != labels.len() {
226        return Err(MetricsError::InvalidInput(format!(
227            "x has {} samples, but labels has {} samples",
228            n_samples,
229            labels.len()
230        )));
231    }
232
233    // Get unique cluster labels
234    let mut unique_labels = Vec::new();
235    for &label in labels.iter() {
236        if !unique_labels.contains(&label) {
237            unique_labels.push(label);
238        }
239    }
240
241    // Sort labels for consistent results
242    unique_labels.sort();
243
244    // Default k to a safe value based on n_samples
245    let k = k.unwrap_or_else(|| {
246        if n_samples <= 1 {
247            1
248        } else if n_samples < 10 {
249            std::cmp::min(2, n_samples - 1)
250        } else {
251            std::cmp::min(5, n_samples / 10)
252        }
253    });
254
255    // Calculate intra-cluster and inter-cluster density estimates
256    let mut intra_density_sum = F::zero();
257    let mut inter_density_sum = F::zero();
258
259    for (i, &label_i) in labels.iter().enumerate() {
260        // Calculate distances to all other points
261        let mut intra_distances = Vec::new();
262        let mut inter_distances = Vec::new();
263
264        for (j, &label_j) in labels.iter().enumerate() {
265            if i != j {
266                let dist = calculate_euclidean_distance(&x.row(i), &x.row(j));
267
268                if label_i == label_j {
269                    intra_distances.push(dist);
270                } else {
271                    inter_distances.push(dist);
272                }
273            }
274        }
275
276        // Calculate intra-cluster density (k-nearest neighbors within cluster)
277        intra_distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
278        let intra_k = std::cmp::min(k, intra_distances.len());
279
280        if intra_k > 0 {
281            let knn_intra_dist = intra_distances[intra_k - 1];
282            let intra_density = if knn_intra_dist > F::zero() {
283                F::one() / knn_intra_dist
284            } else {
285                F::zero()
286            };
287            intra_density_sum += intra_density;
288        }
289
290        // Calculate inter-cluster density (k-nearest neighbors from other clusters)
291        inter_distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
292        let inter_k = std::cmp::min(k, inter_distances.len());
293
294        if inter_k > 0 {
295            let knn_inter_dist = inter_distances[inter_k - 1];
296            let inter_density = if knn_inter_dist > F::zero() {
297                F::one() / knn_inter_dist
298            } else {
299                F::zero()
300            };
301            inter_density_sum += inter_density;
302        }
303    }
304
305    // Calculate average densities
306    let avg_intra_density = if n_samples > 0 {
307        intra_density_sum / F::from(n_samples).unwrap()
308    } else {
309        F::zero()
310    };
311
312    let avg_inter_density = if n_samples > 0 {
313        inter_density_sum / F::from(n_samples).unwrap()
314    } else {
315        F::zero()
316    };
317
318    // Calculate RDI (avoiding division by zero)
319    let rdi = if avg_inter_density > F::zero() {
320        avg_intra_density / avg_inter_density
321    } else if avg_intra_density > F::zero() {
322        F::max_value() // If inter-density is 0 but intra-density is not
323    } else {
324        F::one() // Default case if both are zero
325    };
326
327    Ok(rdi)
328}
329
330/// Calculate Density-Based Cluster Validity (DBCV) index
331///
332/// The DBCV index measures the validity of a clustering based on the relative density
333/// of clusters. It accounts for variations in cluster densities and shapes.
334/// Values closer to 1 indicate better clustering.
335///
336/// # Arguments
337///
338/// * `x` - Array of shape (n_samples, n_features) - The data
339/// * `labels` - Array of shape (n_samples,) - Predicted labels for each sample
340/// * `k` - Number of neighbors to consider for density calculation (default: 5)
341///
342/// # Returns
343///
344/// * DBCV index value (between -1 and 1)
345///
346/// # Examples
347///
348/// ```
349/// use scirs2_core::ndarray::{array, Array2};
350/// use scirs2_metrics::clustering::density::density_based_cluster_validity;
351///
352/// // Create a simple dataset with 2 clusters
353/// let x = Array2::from_shape_vec((6, 2), vec![
354///     1.0, 2.0, 1.5, 1.8, 1.2, 2.2,
355///     5.0, 6.0, 5.2, 5.8, 5.5, 6.2,
356/// ]).unwrap();
357///
358/// let labels = array![0, 0, 0, 1, 1, 1];
359///
360/// let dbcv = density_based_cluster_validity(&x, &labels, None).unwrap();
361/// ```
362#[allow(dead_code)]
363pub fn density_based_cluster_validity<F, S1, S2, D>(
364    x: &ArrayBase<S1, Ix2>,
365    labels: &ArrayBase<S2, D>,
366    k: Option<usize>,
367) -> Result<F>
368where
369    F: Float
370        + NumCast
371        + std::fmt::Debug
372        + scirs2_core::ndarray::ScalarOperand
373        + AddAssign
374        + DivAssign,
375    S1: Data<Elem = F>,
376    S2: Data<Elem = usize>,
377    D: Dimension,
378{
379    // Check that x and labels have the same number of samples
380    let n_samples = x.shape()[0];
381    if n_samples != labels.len() {
382        return Err(MetricsError::InvalidInput(format!(
383            "x has {} samples, but labels has {} samples",
384            n_samples,
385            labels.len()
386        )));
387    }
388
389    // Default k to a safe value based on n_samples
390    let k = k.unwrap_or_else(|| {
391        if n_samples <= 1 {
392            1
393        } else if n_samples < 10 {
394            std::cmp::min(2, n_samples - 1)
395        } else {
396            std::cmp::min(5, n_samples / 10)
397        }
398    });
399
400    // Get unique cluster labels
401    let mut unique_labels = Vec::new();
402    for &label in labels.iter() {
403        if !unique_labels.contains(&label) {
404            unique_labels.push(label);
405        }
406    }
407
408    if unique_labels.len() < 2 {
409        return Err(MetricsError::InvalidInput(
410            "At least two clusters are required to calculate DBCV".to_string(),
411        ));
412    }
413
414    // Sort labels for consistent results
415    unique_labels.sort();
416
417    // Collect indices by cluster
418    let mut cluster_indices = HashMap::new();
419    for label in &unique_labels {
420        cluster_indices.insert(*label, Vec::new());
421    }
422
423    for (i, &label) in labels.iter().enumerate() {
424        if let Some(indices) = cluster_indices.get_mut(&label) {
425            indices.push(i);
426        }
427    }
428
429    // Calculate density sparseness for each cluster
430    let mut sparseness_values = Vec::new();
431
432    for &label in &unique_labels {
433        let indices = cluster_indices.get(&label).unwrap();
434        if indices.len() <= 1 {
435            // Single point clusters have zero sparseness
436            sparseness_values.push(F::zero());
437            continue;
438        }
439
440        // Calculate core distance (k-nearest neighbor distance) for each point in the cluster
441        let mut core_distances = Vec::new();
442
443        for &i in indices {
444            let mut distances = Vec::new();
445            for &j in indices {
446                if i != j {
447                    let dist = calculate_euclidean_distance(&x.row(i), &x.row(j));
448                    distances.push(dist);
449                }
450            }
451
452            distances.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
453            let k_actual = std::cmp::min(k, distances.len());
454
455            if k_actual > 0 {
456                core_distances.push(distances[k_actual - 1]);
457            } else {
458                core_distances.push(F::zero());
459            }
460        }
461
462        // Calculate density sparseness (mutual reachability distance)
463        let avg_core_distance = if !core_distances.is_empty() {
464            core_distances.iter().fold(F::zero(), |sum, &val| sum + val)
465                / F::from(core_distances.len()).unwrap()
466        } else {
467            F::zero()
468        };
469
470        // Calculate the variance of core distances
471        let variance = if core_distances.len() > 1 {
472            let mean = avg_core_distance;
473            let sum_squared_diff = core_distances
474                .iter()
475                .fold(F::zero(), |sum, &val| sum + (val - mean) * (val - mean));
476            sum_squared_diff / F::from(core_distances.len() - 1).unwrap()
477        } else {
478            F::zero()
479        };
480
481        // Sparseness is a function of the average core distance and variance
482        let sparseness = avg_core_distance * (F::one() + variance.sqrt());
483        sparseness_values.push(sparseness);
484    }
485
486    // Calculate cluster separation (density separation)
487    let mut separation_matrix = vec![vec![F::zero(); unique_labels.len()]; unique_labels.len()];
488
489    for (i, &label_i) in unique_labels.iter().enumerate() {
490        let indices_i = cluster_indices.get(&label_i).unwrap();
491
492        for (j, &label_j) in unique_labels.iter().enumerate() {
493            if i == j {
494                continue;
495            }
496
497            let indices_j = cluster_indices.get(&label_j).unwrap();
498
499            // Calculate minimum distance between clusters
500            let mut min_dist = F::max_value();
501
502            for &idx_i in indices_i {
503                for &idx_j in indices_j {
504                    let dist = calculate_euclidean_distance(&x.row(idx_i), &x.row(idx_j));
505                    min_dist = F::min(min_dist, dist);
506                }
507            }
508
509            separation_matrix[i][j] = min_dist;
510        }
511    }
512
513    // Calculate the DBCV for each cluster
514    let mut cluster_validity = Vec::new();
515
516    for (i, &_) in unique_labels.iter().enumerate() {
517        let cluster_sparseness = sparseness_values[i];
518
519        // Find minimum separation to other clusters
520        let mut min_separation = F::max_value();
521        for j in 0..unique_labels.len() {
522            if i != j && separation_matrix[i][j] < min_separation {
523                min_separation = separation_matrix[i][j];
524            }
525        }
526
527        // If no other clusters or all separations are max_value
528        if min_separation == F::max_value() {
529            min_separation = F::zero();
530        }
531
532        // Calculate validity for this cluster
533        let validity = if min_separation > F::zero() || cluster_sparseness > F::zero() {
534            (min_separation - cluster_sparseness) / F::max(min_separation, cluster_sparseness)
535        } else {
536            F::zero()
537        };
538
539        cluster_validity.push(validity);
540    }
541
542    // Calculate DBCV as weighted average of cluster validities
543    let mut weighted_sum = F::zero();
544    let mut weight_sum = F::zero();
545
546    for (i, &label) in unique_labels.iter().enumerate() {
547        let weight = F::from(cluster_indices.get(&label).unwrap().len()).unwrap();
548        weighted_sum += weight * cluster_validity[i];
549        weight_sum += weight;
550    }
551
552    let dbcv = if weight_sum > F::zero() {
553        weighted_sum / weight_sum
554    } else {
555        F::zero()
556    };
557
558    // DBCV ranges from -1 to 1
559    Ok(dbcv)
560}
561
562/// Helper function to calculate Euclidean distance between two vectors
563#[allow(dead_code)]
564fn calculate_euclidean_distance<F, S1, S2>(a: &ArrayBase<S1, Ix1>, b: &ArrayBase<S2, Ix1>) -> F
565where
566    F: Float,
567    S1: Data<Elem = F>,
568    S2: Data<Elem = F>,
569{
570    let mut sum = F::zero();
571    for (x, y) in a.iter().zip(b.iter()) {
572        let diff = *x - *y;
573        sum = sum + diff * diff;
574    }
575    sum.sqrt()
576}
577
578#[cfg(test)]
579mod tests {
580    use super::*;
581    use scirs2_core::ndarray::Array2;
582
583    #[test]
584    fn test_local_density_factor() {
585        // Create a simple dataset with two well-separated clusters
586        let well_separated = Array2::from_shape_vec(
587            (6, 2),
588            vec![
589                1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 10.0, 12.0, 10.2, 11.8, 10.5, 12.2,
590            ],
591        )
592        .unwrap();
593
594        let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
595
596        // Calculate LDF with k=2 (small enough for this test case)
597        let factors = local_density_factor(&well_separated, &labels, Some(2)).unwrap();
598
599        // Both clusters should have similar density factors
600        assert!(factors.len() == 2);
601        assert!(factors.contains_key(&0));
602        assert!(factors.contains_key(&1));
603
604        // Create a dataset with varying density
605        let varying_density = Array2::from_shape_vec(
606            (6, 2),
607            vec![
608                1.0, 1.1, 1.05, 1.05, 1.1, 1.0, // Dense cluster
609                5.0, 5.0, 6.0, 6.0, 7.0, 7.0, // Sparse cluster
610            ],
611        )
612        .unwrap();
613
614        let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
615
616        // Calculate LDF with k=2
617        let factors = local_density_factor(&varying_density, &labels, Some(2)).unwrap();
618
619        // Dense cluster should have higher factor than sparse cluster
620        assert!(*factors.get(&0).unwrap() > *factors.get(&1).unwrap());
621    }
622
623    #[test]
624    fn test_relative_density_index() {
625        // Create a simple dataset with two well-separated clusters
626        let well_separated = Array2::from_shape_vec(
627            (6, 2),
628            vec![
629                1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 10.0, 12.0, 10.2, 11.8, 10.5, 12.2,
630            ],
631        )
632        .unwrap();
633
634        let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
635
636        // Calculate RDI with k=2
637        let rdi = relative_density_index(&well_separated, &labels, Some(2)).unwrap();
638
639        // Well-separated clusters should have high RDI
640        assert!(rdi > 1.0);
641
642        // Create a dataset with overlapping clusters
643        let overlapping = Array2::from_shape_vec(
644            (6, 2),
645            vec![1.0, 2.0, 1.5, 1.8, 3.0, 3.0, 3.0, 3.0, 4.0, 4.5, 5.0, 5.5],
646        )
647        .unwrap();
648
649        let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
650
651        // Calculate RDI for overlapping clusters
652        let rdi_overlapping = relative_density_index(&overlapping, &labels, Some(2)).unwrap();
653
654        // Overlapping clusters should have lower RDI
655        assert!(rdi > rdi_overlapping);
656    }
657
658    #[test]
659    fn test_density_based_cluster_validity() {
660        // Create a simple dataset with two well-separated clusters
661        let well_separated = Array2::from_shape_vec(
662            (6, 2),
663            vec![
664                1.0, 2.0, 1.5, 1.8, 1.2, 2.2, 10.0, 12.0, 10.2, 11.8, 10.5, 12.2,
665            ],
666        )
667        .unwrap();
668
669        let labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
670
671        // Calculate DBCV with k=2
672        let dbcv = density_based_cluster_validity(&well_separated, &labels, Some(2)).unwrap();
673
674        // DBCV should be positive for well-separated clusters
675        assert!(dbcv > 0.0);
676
677        // Create a dataset with poor clustering
678        let poor_clustering = Array2::from_shape_vec(
679            (6, 2),
680            vec![1.0, 2.0, 8.0, 9.0, 1.2, 2.2, 8.0, 9.0, 1.0, 2.0, 8.0, 9.0],
681        )
682        .unwrap();
683
684        // Labels not matching the natural clusters
685        let bad_labels = scirs2_core::ndarray::array![0, 0, 0, 1, 1, 1];
686
687        // Calculate DBCV for poor clustering
688        let bad_dbcv =
689            density_based_cluster_validity(&poor_clustering, &bad_labels, Some(2)).unwrap();
690
691        // DBCV should be lower for poorly defined clusters
692        assert!(dbcv > bad_dbcv);
693    }
694}