scirs2-cluster 0.4.2

Clustering algorithms module for SciRS2 (scirs2-cluster)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
//! Core density peaks clustering algorithm.
//!
//! Implements the Rodriguez & Laio (2014) density peaks algorithm with both
//! Gaussian and cutoff kernel density estimators.

use crate::error::{ClusteringError, Result};

/// Result of density peaks clustering.
#[derive(Debug, Clone)]
pub struct DensityPeaksAdvResult {
    /// Cluster label for each data point (-1 = unassigned/noise if no center found).
    pub labels: Vec<i64>,
    /// Indices of detected cluster centers.
    pub centers: Vec<usize>,
    /// Local density rho for each point.
    pub rho: Vec<f64>,
    /// Distance delta to nearest point with higher density.
    pub delta: Vec<f64>,
    /// Total number of clusters found.
    pub n_clusters: usize,
}

/// Kernel type for local density estimation.
#[derive(Debug, Clone, PartialEq)]
pub enum KernelType {
    /// Gaussian kernel: rho_i = sum_j exp(-d_{ij}^2 / (2 * d_c^2))
    Gaussian,
    /// Step/cutoff kernel: rho_i = #{j : d_{ij} < d_c} - 1
    Cutoff,
}

/// Density Peaks clustering builder.
///
/// # Example
///
/// ```
/// use scirs2_cluster::density_peaks_adv::{DensityPeaksAdv, KernelType};
///
/// let data = vec![
///     vec![0.0, 0.0], vec![0.1, 0.0], vec![0.0, 0.1],
///     vec![5.0, 5.0], vec![5.1, 5.0], vec![5.0, 5.1],
/// ];
/// let result = DensityPeaksAdv::new()
///     .with_n_clusters(2)
///     .fit(&data)
///     .expect("operation should succeed");
/// assert_eq!(result.n_clusters, 2);
/// ```
pub struct DensityPeaksAdv {
    /// Cutoff distance d_c (None = auto-select at 2nd percentile of pairwise distances).
    pub d_c: Option<f64>,
    /// Kernel type for density estimation.
    pub kernel: KernelType,
    /// If Some(k), automatically select the top-k centers by gamma = rho * delta.
    pub n_clusters: Option<usize>,
    /// Density threshold for center selection (threshold-based mode).
    pub rho_threshold: f64,
    /// Delta threshold for center selection (threshold-based mode).
    pub delta_threshold: f64,
}

impl DensityPeaksAdv {
    /// Create a new `DensityPeaksAdv` with default parameters.
    pub fn new() -> Self {
        Self {
            d_c: None,
            kernel: KernelType::Gaussian,
            n_clusters: None,
            rho_threshold: 0.0,
            delta_threshold: 0.0,
        }
    }

    /// Set the cutoff distance d_c explicitly.
    pub fn with_d_c(mut self, d_c: f64) -> Self {
        self.d_c = Some(d_c);
        self
    }

    /// Set the kernel type.
    pub fn with_kernel(mut self, kernel: KernelType) -> Self {
        self.kernel = kernel;
        self
    }

    /// Automatically select the top-k centers by gamma = rho * delta.
    pub fn with_n_clusters(mut self, k: usize) -> Self {
        self.n_clusters = Some(k);
        self
    }

    /// Use threshold-based center selection.
    pub fn with_thresholds(mut self, rho_t: f64, delta_t: f64) -> Self {
        self.rho_threshold = rho_t;
        self.delta_threshold = delta_t;
        self
    }

    /// Fit density peaks clustering on a slice of feature vectors.
    ///
    /// Each inner `Vec<f64>` is one data point. All points must have the same
    /// dimensionality.
    ///
    /// # Errors
    ///
    /// Returns [`ClusteringError::InvalidInput`] if `data` is empty or if
    /// `n_clusters` exceeds the number of data points.
    pub fn fit(&self, data: &[Vec<f64>]) -> Result<DensityPeaksAdvResult> {
        let n = data.len();
        if n == 0 {
            return Err(ClusteringError::InvalidInput(
                "input data must not be empty".to_string(),
            ));
        }

        // Validate uniform dimensionality
        let dim = data[0].len();
        for (i, point) in data.iter().enumerate() {
            if point.len() != dim {
                return Err(ClusteringError::InvalidInput(format!(
                    "point {} has dimension {} but expected {}",
                    i,
                    point.len(),
                    dim
                )));
            }
        }

        // Compute full n×n pairwise Euclidean distance matrix (upper-tri then mirror).
        let dists = compute_pairwise_distances(data);

        // Auto-select d_c: 2nd percentile of all pairwise distances.
        let d_c = match self.d_c {
            Some(v) => {
                if v <= 0.0 {
                    return Err(ClusteringError::InvalidInput(
                        "d_c must be positive".to_string(),
                    ));
                }
                v
            }
            None => auto_select_dc(&dists, n),
        };

        // Compute local density rho for each point.
        let rho = compute_rho(&dists, n, d_c, &self.kernel);

        // Sort indices by decreasing density.
        let mut order: Vec<usize> = (0..n).collect();
        order.sort_by(|&a, &b| {
            rho[b]
                .partial_cmp(&rho[a])
                .unwrap_or(std::cmp::Ordering::Equal)
        });

        // For each point: find distance to nearest point with higher density.
        let (delta, nearest_higher) = compute_delta(&dists, n, &rho, &order);

        // Select cluster centers.
        let centers = select_centers(n, &rho, &delta, self.n_clusters, self.rho_threshold, self.delta_threshold)?;

        // Propagate cluster assignments following density order.
        let labels = propagate_labels(n, &centers, &nearest_higher, &order);

        let n_clusters = centers.len();
        Ok(DensityPeaksAdvResult {
            labels,
            centers,
            rho,
            delta,
            n_clusters,
        })
    }
}

impl Default for DensityPeaksAdv {
    fn default() -> Self {
        Self::new()
    }
}

// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------

/// Compute the n×n Euclidean distance matrix stored as a flat Vec (row-major).
fn compute_pairwise_distances(data: &[Vec<f64>]) -> Vec<f64> {
    let n = data.len();
    let mut dists = vec![0.0f64; n * n];
    for i in 0..n {
        for j in (i + 1)..n {
            let d: f64 = data[i]
                .iter()
                .zip(data[j].iter())
                .map(|(a, b)| (a - b).powi(2))
                .sum::<f64>()
                .sqrt();
            dists[i * n + j] = d;
            dists[j * n + i] = d;
        }
    }
    dists
}

/// Auto-select d_c as the 2nd percentile of all (n*(n-1)/2) pairwise distances.
fn auto_select_dc(dists: &[f64], n: usize) -> f64 {
    let mut upper: Vec<f64> = Vec::with_capacity(n * (n - 1) / 2);
    for i in 0..n {
        for j in (i + 1)..n {
            upper.push(dists[i * n + j]);
        }
    }
    upper.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
    let idx = ((upper.len() as f64) * 0.02) as usize;
    upper.get(idx).copied().unwrap_or(1.0).max(1e-10)
}

/// Compute local density rho for each point given a flat distance matrix.
fn compute_rho(dists: &[f64], n: usize, d_c: f64, kernel: &KernelType) -> Vec<f64> {
    (0..n)
        .map(|i| match kernel {
            KernelType::Gaussian => {
                let two_dc2 = 2.0 * d_c * d_c;
                (0..n)
                    .filter(|&j| j != i)
                    .map(|j| {
                        let d = dists[i * n + j];
                        (-(d * d) / two_dc2).exp()
                    })
                    .sum()
            }
            KernelType::Cutoff => {
                (0..n)
                    .filter(|&j| j != i && dists[i * n + j] < d_c)
                    .count() as f64
            }
        })
        .collect()
}

/// For each point, compute delta (min dist to a higher-density point) and
/// the index of that nearest higher-density neighbor.
///
/// The point with maximum density gets delta = max pairwise distance in the dataset.
fn compute_delta(
    dists: &[f64],
    n: usize,
    rho: &[f64],
    order: &[usize],
) -> (Vec<f64>, Vec<usize>) {
    let mut delta = vec![f64::INFINITY; n];
    let mut nearest_higher = vec![0usize; n];

    for (rank, &i) in order.iter().enumerate() {
        if rank == 0 {
            // Highest-density point: delta = max distance to any other point.
            let max_d = (0..n)
                .filter(|&j| j != i)
                .map(|j| dists[i * n + j])
                .fold(f64::NEG_INFINITY, f64::max);
            delta[i] = if max_d.is_finite() { max_d } else { 0.0 };
            nearest_higher[i] = i;
        } else {
            // Find nearest point among those with strictly higher density.
            for &j in order[..rank].iter() {
                let d = dists[i * n + j];
                if d < delta[i] {
                    delta[i] = d;
                    nearest_higher[i] = j;
                }
            }
            // Handle case where delta was never updated (all higher-density
            // points are at the same index — should not happen, but defensive).
            if delta[i].is_infinite() {
                delta[i] = 0.0;
            }
        }
    }

    (delta, nearest_higher)
}

/// Select cluster centers either by top-k gamma or by (rho, delta) thresholds.
fn select_centers(
    n: usize,
    rho: &[f64],
    delta: &[f64],
    n_clusters: Option<usize>,
    rho_threshold: f64,
    delta_threshold: f64,
) -> Result<Vec<usize>> {
    match n_clusters {
        Some(k) => {
            if k > n {
                return Err(ClusteringError::InvalidInput(format!(
                    "n_clusters ({}) exceeds number of data points ({})",
                    k, n
                )));
            }
            // Sort by gamma = rho * delta descending; take top-k.
            let mut gamma: Vec<(f64, usize)> = (0..n).map(|i| (rho[i] * delta[i], i)).collect();
            gamma.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
            Ok(gamma[..k].iter().map(|(_, idx)| *idx).collect())
        }
        None => {
            // Threshold-based.
            let centers: Vec<usize> = (0..n)
                .filter(|&i| rho[i] > rho_threshold && delta[i] > delta_threshold)
                .collect();
            Ok(centers)
        }
    }
}

/// Assign labels to all points by propagating cluster assignments along the
/// decreasing density order.
fn propagate_labels(
    n: usize,
    centers: &[usize],
    nearest_higher: &[usize],
    order: &[usize],
) -> Vec<i64> {
    let mut labels = vec![-1i64; n];

    // Seed cluster centers with their cluster id.
    for (cluster_id, &center) in centers.iter().enumerate() {
        labels[center] = cluster_id as i64;
    }

    // Propagate: walk density order (highest to lowest). Each unlabeled point
    // inherits the label of its nearest higher-density neighbor which was
    // processed earlier (has higher density, so appeared earlier in `order`).
    for &i in order.iter() {
        if labels[i] == -1 {
            labels[i] = labels[nearest_higher[i]];
        }
    }

    labels
}

#[cfg(test)]
mod tests {
    use super::*;

    fn make_two_gaussians() -> Vec<Vec<f64>> {
        // Two tight Gaussian-like clusters: cluster A near (0,0), cluster B near (10,10)
        vec![
            vec![0.0, 0.0],
            vec![0.2, 0.1],
            vec![-0.1, 0.2],
            vec![0.1, -0.1],
            vec![0.3, 0.0],
            vec![10.0, 10.0],
            vec![10.1, 9.9],
            vec![9.9, 10.1],
            vec![10.2, 10.0],
            vec![10.0, 9.8],
        ]
    }

    #[test]
    fn test_density_peaks_two_clusters_n_clusters() {
        let data = make_two_gaussians();
        let result = DensityPeaksAdv::new()
            .with_n_clusters(2)
            .fit(&data)
            .expect("fit should succeed");

        assert_eq!(result.n_clusters, 2);
        assert_eq!(result.labels.len(), 10);
        // All points should be assigned (no -1 remaining for n_clusters mode
        // when two centers exist and all neighbors are reachable).
        for &lbl in &result.labels {
            assert!(lbl >= 0, "label should be non-negative");
        }
    }

    #[test]
    fn test_density_peaks_gaussian_kernel() {
        let data = make_two_gaussians();
        let result = DensityPeaksAdv::new()
            .with_kernel(KernelType::Gaussian)
            .with_n_clusters(2)
            .fit(&data)
            .expect("Gaussian kernel fit should succeed");

        assert_eq!(result.n_clusters, 2);
        assert_eq!(result.rho.len(), 10);
        assert_eq!(result.delta.len(), 10);
    }

    #[test]
    fn test_density_peaks_cutoff_kernel() {
        let data = make_two_gaussians();
        let result = DensityPeaksAdv::new()
            .with_kernel(KernelType::Cutoff)
            .with_n_clusters(2)
            .fit(&data)
            .expect("Cutoff kernel fit should succeed");

        assert_eq!(result.n_clusters, 2);
    }

    #[test]
    fn test_density_peaks_explicit_dc() {
        let data = make_two_gaussians();
        let result = DensityPeaksAdv::new()
            .with_d_c(0.5)
            .with_n_clusters(2)
            .fit(&data)
            .expect("fit with explicit d_c should succeed");

        assert_eq!(result.n_clusters, 2);
    }

    #[test]
    fn test_density_peaks_threshold_mode() {
        let data = make_two_gaussians();
        // With very low thresholds, many centers can be selected.
        let result = DensityPeaksAdv::new()
            .with_thresholds(0.0, 0.0)
            .fit(&data)
            .expect("threshold mode fit should succeed");

        // At least some centers should be selected.
        assert!(result.n_clusters > 0);
    }

    #[test]
    fn test_density_peaks_empty_input() {
        let data: Vec<Vec<f64>> = vec![];
        let err = DensityPeaksAdv::new().fit(&data);
        assert!(err.is_err());
    }

    #[test]
    fn test_density_peaks_invalid_n_clusters() {
        let data = make_two_gaussians();
        let err = DensityPeaksAdv::new().with_n_clusters(100).fit(&data);
        assert!(err.is_err());
    }

    #[test]
    fn test_auto_select_dc() {
        // Test the auto d_c selection with a known dataset
        let data = make_two_gaussians();
        let dists = compute_pairwise_distances(&data);
        let dc = auto_select_dc(&dists, data.len());
        assert!(dc > 0.0, "d_c must be positive");
    }

    #[test]
    fn test_same_cluster_labeling() {
        let data = make_two_gaussians();
        let result = DensityPeaksAdv::new()
            .with_n_clusters(2)
            .fit(&data)
            .expect("fit should succeed");

        // Points 0-4 should share the same label.
        let label_a = result.labels[0];
        for i in 1..5 {
            assert_eq!(result.labels[i], label_a, "cluster A consistency");
        }
        // Points 5-9 should share the same label.
        let label_b = result.labels[5];
        for i in 6..10 {
            assert_eq!(result.labels[i], label_b, "cluster B consistency");
        }
        assert_ne!(label_a, label_b, "two clusters must differ");
    }
}