Skip to main content

scirs2_cluster/
consensus.rs

1//! Consensus Clustering (Monti et al., 2003)
2//!
3//! Consensus clustering repeatedly subsamples the dataset and applies a base clustering
4//! algorithm to build a co-clustering frequency matrix (the "consensus matrix").
5//! The final clustering is obtained by applying hierarchical agglomeration to that matrix.
6//! A sweep over a range of k values lets callers identify the most stable k by examining
7//! the relative change in the area under the empirical CDF of the consensus matrix.
8//!
9//! # References
10//!
11//! Monti, S., Tamayo, P., Mesirov, J., & Golub, T. (2003). Consensus clustering: A resampling-
12//! based method for class discovery and visualization of gene expression microarray data.
13//! *Machine Learning*, 52(1-2), 91-118.
14
15use scirs2_core::ndarray::Array2;
16
17use crate::error::{ClusteringError, Result};
18
19/// Configuration for consensus clustering.
20#[derive(Debug, Clone)]
21pub struct ConsensusConfig {
22    /// Number of subsampling / clustering runs (default 100).
23    pub n_runs: usize,
24    /// Fraction of items to include in each subsampled run (default 0.8).
25    pub subsample_fraction: f64,
26    /// Maximum number of clusters to try when sweeping k values.
27    pub max_k: usize,
28    /// Random seed for reproducibility.
29    pub seed: u64,
30}
31
32impl Default for ConsensusConfig {
33    fn default() -> Self {
34        Self {
35            n_runs: 100,
36            subsample_fraction: 0.8,
37            max_k: 10,
38            seed: 42,
39        }
40    }
41}
42
43/// Result of a single consensus clustering run for a fixed k.
44#[derive(Debug, Clone)]
45pub struct ConsensusResult {
46    /// n×n co-clustering frequency matrix.  `matrix[i][j]` is the fraction of runs
47    /// (in which both i and j were sampled) where they were assigned to the same cluster.
48    pub consensus_matrix: Array2<f64>,
49    /// Final hard cluster assignments (0-indexed).
50    pub assignments: Vec<usize>,
51    /// Area under the empirical CDF of the upper-triangle entries of the consensus matrix.
52    pub cdf_area: f64,
53    /// Relative change in CDF area compared with k-1 (0.0 for the first k in a sweep).
54    pub delta_k: f64,
55    /// Number of clusters k.
56    pub k: usize,
57}
58
59// ---------------------------------------------------------------------------
60// Internal helpers
61// ---------------------------------------------------------------------------
62
63/// Minimal linear-congruential PRNG (Knuth MMIX) — avoids any external dependency.
64#[derive(Clone)]
65struct Lcg {
66    state: u64,
67}
68
69impl Lcg {
70    fn new(seed: u64) -> Self {
71        Self {
72            state: seed.wrapping_add(1),
73        }
74    }
75
76    /// Return next value in [0, 1).
77    fn next_f64(&mut self) -> f64 {
78        self.state = self
79            .state
80            .wrapping_mul(6_364_136_223_846_793_005)
81            .wrapping_add(1_442_695_040_888_963_407);
82        // Use upper 53 bits for float mantissa precision
83        (self.state >> 11) as f64 / (1u64 << 53) as f64
84    }
85
86    /// Sample k distinct indices from 0..n using Fisher-Yates partial shuffle.
87    fn sample_indices(&mut self, n: usize, k: usize) -> Vec<usize> {
88        let mut pool: Vec<usize> = (0..n).collect();
89        for i in 0..k {
90            let j = i + (self.next_f64() * (n - i) as f64) as usize;
91            let j = j.min(n - 1);
92            pool.swap(i, j);
93        }
94        pool[..k].to_vec()
95    }
96}
97
98/// Build the consensus matrix from `n_runs` subsampled clusterings.
99///
100/// For each run:
101/// 1. Subsample `round(n * fraction)` items.
102/// 2. Run the base clusterer on the submatrix.
103/// 3. Accumulate co-clustering counts and indicator counts.
104///
105/// `consensus[i][j] = co_cluster_count[i][j] / indicator[i][j]`
106/// where `indicator[i][j]` is the number of runs in which both i and j were sampled.
107fn build_consensus_matrix<F>(
108    data: &Array2<f64>,
109    k: usize,
110    base_clusterer: &F,
111    config: &ConsensusConfig,
112) -> Result<Array2<f64>>
113where
114    F: Fn(&Array2<f64>, usize, u64) -> Vec<usize>,
115{
116    let n = data.nrows();
117    let d = data.ncols();
118
119    if n == 0 || d == 0 {
120        return Err(ClusteringError::InvalidInput(
121            "Data matrix must be non-empty".into(),
122        ));
123    }
124    if k < 2 {
125        return Err(ClusteringError::InvalidInput("k must be at least 2".into()));
126    }
127    if config.subsample_fraction <= 0.0 || config.subsample_fraction > 1.0 {
128        return Err(ClusteringError::InvalidInput(
129            "subsample_fraction must be in (0, 1]".into(),
130        ));
131    }
132    if config.n_runs == 0 {
133        return Err(ClusteringError::InvalidInput(
134            "n_runs must be at least 1".into(),
135        ));
136    }
137
138    let sub_n = ((n as f64 * config.subsample_fraction).round() as usize).max(k);
139    let sub_n = sub_n.min(n);
140
141    let mut co_counts = vec![0u32; n * n];
142    let mut indicator = vec![0u32; n * n];
143
144    let mut rng = Lcg::new(config.seed);
145
146    for run in 0..config.n_runs {
147        let run_seed = config
148            .seed
149            .wrapping_add(run as u64)
150            .wrapping_mul(2_654_435_761);
151
152        // Sample indices
153        let idx = rng.sample_indices(n, sub_n);
154
155        // Build submatrix
156        let mut sub_data = Array2::<f64>::zeros((sub_n, d));
157        for (row_out, &row_in) in idx.iter().enumerate() {
158            for col in 0..d {
159                sub_data[[row_out, col]] = data[[row_in, col]];
160            }
161        }
162
163        // Run base clusterer
164        let sub_labels = base_clusterer(&sub_data, k, run_seed);
165
166        if sub_labels.len() != sub_n {
167            return Err(ClusteringError::ComputationError(format!(
168                "Base clusterer returned {} labels but expected {}",
169                sub_labels.len(),
170                sub_n
171            )));
172        }
173
174        // Accumulate
175        for (a, &ia) in idx.iter().enumerate() {
176            for (b, &ib) in idx.iter().enumerate() {
177                indicator[ia * n + ib] += 1;
178                if sub_labels[a] == sub_labels[b] {
179                    co_counts[ia * n + ib] += 1;
180                }
181            }
182        }
183    }
184
185    // Normalise
186    let mut matrix = Array2::<f64>::zeros((n, n));
187    for i in 0..n {
188        for j in 0..n {
189            let ind = indicator[i * n + j];
190            matrix[[i, j]] = if ind > 0 {
191                co_counts[i * n + j] as f64 / ind as f64
192            } else {
193                0.0
194            };
195        }
196    }
197
198    Ok(matrix)
199}
200
201/// Compute the area under the empirical CDF of the upper-triangle entries
202/// of the consensus matrix.  Values are sorted; the CDF is a step function
203/// with uniform x-spacing 1/(m-1) where m is the number of values.
204fn cdf_area(matrix: &Array2<f64>) -> f64 {
205    let n = matrix.nrows();
206    let mut vals: Vec<f64> = Vec::with_capacity(n * (n - 1) / 2);
207
208    for i in 0..n {
209        for j in (i + 1)..n {
210            vals.push(matrix[[i, j]]);
211        }
212    }
213
214    if vals.is_empty() {
215        return 0.0;
216    }
217
218    vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
219
220    let m = vals.len();
221    if m == 1 {
222        return vals[0];
223    }
224
225    // Trapezoid rule on the empirical CDF
226    // x values are evenly spaced in [0,1]
227    let step = 1.0 / (m - 1) as f64;
228    let mut area = 0.0_f64;
229    for (rank, &v) in vals.iter().enumerate() {
230        let cdf = (rank as f64) / ((m - 1) as f64);
231        // height at this x = 1 - cdf  (fraction of values <= v)
232        // contribution via trapezoid: 0.5 * step * (height[rank] + height[rank+1])
233        // simplified by integrating 1 - F(x) directly
234        if rank + 1 < m {
235            let cdf_next = (rank + 1) as f64 / (m - 1) as f64;
236            let _ = cdf;
237            let _ = cdf_next;
238            // Area under the sorted CDF from val[rank] to val[rank+1] segment:
239            // Integrate (1 - F(c)) dc over c in [v, v_next]
240            // For a uniform step in rank-space: area += step * (1 - avg_rank_normalized)
241            let avg_cdf = (rank as f64 + 0.5) / (m - 1) as f64;
242            area += step * (1.0 - avg_cdf);
243        }
244    }
245
246    // Normalise by the range of the CDF (which is [0,1]): already normalised.
247    area
248}
249
250/// Hierarchical agglomerative clustering (average-linkage) on the consensus matrix,
251/// returning cluster assignments for k clusters.
252fn cluster_consensus_matrix(matrix: &Array2<f64>, k: usize) -> Result<Vec<usize>> {
253    let n = matrix.nrows();
254    if k == 0 || k > n {
255        return Err(ClusteringError::InvalidInput(format!(
256            "k={} out of valid range [1, {}]",
257            k, n
258        )));
259    }
260    if k == n {
261        return Ok((0..n).collect());
262    }
263    if k == 1 {
264        return Ok(vec![0; n]);
265    }
266
267    // Distance = 1 - similarity
268    let mut clusters: Vec<Vec<usize>> = (0..n).map(|i| vec![i]).collect();
269
270    while clusters.len() > k {
271        let nc = clusters.len();
272        let mut best_sim = f64::NEG_INFINITY;
273        let mut best_ci = 0;
274        let mut best_cj = 1;
275
276        for ci in 0..nc {
277            for cj in (ci + 1)..nc {
278                // Average linkage
279                let mut sim_sum = 0.0_f64;
280                let mut count = 0usize;
281                for &a in &clusters[ci] {
282                    for &b in &clusters[cj] {
283                        sim_sum += matrix[[a, b]];
284                        count += 1;
285                    }
286                }
287                let avg_sim = if count > 0 {
288                    sim_sum / count as f64
289                } else {
290                    0.0
291                };
292                if avg_sim > best_sim {
293                    best_sim = avg_sim;
294                    best_ci = ci;
295                    best_cj = cj;
296                }
297            }
298        }
299
300        // Merge cj into ci (take from higher index to avoid shifting)
301        let removed = clusters.remove(best_cj);
302        clusters[best_ci].extend(removed);
303    }
304
305    // Build label array
306    let mut labels = vec![0usize; n];
307    for (cluster_idx, members) in clusters.iter().enumerate() {
308        for &point_idx in members {
309            labels[point_idx] = cluster_idx;
310        }
311    }
312    Ok(labels)
313}
314
315// ---------------------------------------------------------------------------
316// Public API
317// ---------------------------------------------------------------------------
318
319/// Run consensus clustering on `data` using a fixed k.
320///
321/// The base clusterer is called as `base_clusterer(subdata, k, seed)` and must
322/// return a `Vec<usize>` of length equal to `subdata.nrows()`.
323///
324/// # Arguments
325///
326/// * `data`            — n × d data matrix.
327/// * `k`               — Number of clusters.
328/// * `base_clusterer`  — Closure `(data, k, seed) -> labels`.
329/// * `config`          — Algorithm configuration.
330///
331/// # Returns
332///
333/// `ConsensusResult` containing the consensus matrix, final assignments, and stability metrics.
334///
335/// # Examples
336///
337/// ```rust
338/// use scirs2_cluster::consensus::{consensus_clustering, ConsensusConfig};
339/// use scirs2_core::ndarray::Array2;
340///
341/// fn simple_clusterer(data: &Array2<f64>, k: usize, seed: u64) -> Vec<usize> {
342///     let n = data.nrows();
343///     (0..n).map(|i| i % k).collect()
344/// }
345///
346/// let data = Array2::from_shape_vec((6, 2), vec![
347///     0.0, 0.0,  0.1, 0.0,  0.0, 0.1,
348///     5.0, 5.0,  5.1, 5.0,  5.0, 5.1,
349/// ]).expect("operation should succeed");
350///
351/// let config = ConsensusConfig { n_runs: 10, subsample_fraction: 0.8, max_k: 4, seed: 42 };
352/// let result = consensus_clustering(&data, 2, simple_clusterer, config).expect("operation should succeed");
353/// assert_eq!(result.k, 2);
354/// ```
355pub fn consensus_clustering<F>(
356    data: &Array2<f64>,
357    k: usize,
358    base_clusterer: F,
359    config: ConsensusConfig,
360) -> Result<ConsensusResult>
361where
362    F: Fn(&Array2<f64>, usize, u64) -> Vec<usize>,
363{
364    let matrix = build_consensus_matrix(data, k, &base_clusterer, &config)?;
365    let assignments = cluster_consensus_matrix(&matrix, k)?;
366    let area = cdf_area(&matrix);
367
368    Ok(ConsensusResult {
369        consensus_matrix: matrix,
370        assignments,
371        cdf_area: area,
372        delta_k: 0.0, // Filled in by sweep; single-k call has no reference.
373        k,
374    })
375}
376
377/// Sweep over a range of k values and return consensus results for each k.
378///
379/// The `delta_k` field of each result is the relative increase in CDF area:
380/// `delta_k[k] = (area[k] - area[k-1]) / area[k-1]`  (0 for the first entry).
381///
382/// Callers can select the k with the smallest positive `delta_k` as the most stable.
383///
384/// # Arguments
385///
386/// * `data`      — n × d data matrix.
387/// * `k_range`   — Half-open range of k values (e.g. `2..8`).
388/// * `config`    — Algorithm configuration; `max_k` is ignored (k_range takes precedence).
389///
390/// # Examples
391///
392/// ```rust
393/// use scirs2_cluster::consensus::{consensus_clustering_sweep, ConsensusConfig};
394/// use scirs2_core::ndarray::Array2;
395///
396/// fn simple_clusterer(data: &Array2<f64>, k: usize, seed: u64) -> Vec<usize> {
397///     let n = data.nrows();
398///     (0..n).map(|i| i % k).collect()
399/// }
400///
401/// let data = Array2::from_shape_vec((8, 2), vec![
402///     0.0, 0.0,  0.1, 0.0,  0.0, 0.1,  -0.1, 0.0,
403///     5.0, 5.0,  5.1, 5.0,  5.0, 5.1,   4.9, 5.0,
404/// ]).expect("operation should succeed");
405///
406/// let config = ConsensusConfig { n_runs: 10, subsample_fraction: 0.8, max_k: 4, seed: 7 };
407/// let results = consensus_clustering_sweep(&data, 2..4, simple_clusterer, config).expect("operation should succeed");
408/// assert_eq!(results.len(), 2);
409/// ```
410pub fn consensus_clustering_sweep<F>(
411    data: &Array2<f64>,
412    k_range: std::ops::Range<usize>,
413    base_clusterer: F,
414    config: ConsensusConfig,
415) -> Result<Vec<ConsensusResult>>
416where
417    F: Fn(&Array2<f64>, usize, u64) -> Vec<usize>,
418{
419    if k_range.is_empty() {
420        return Err(ClusteringError::InvalidInput(
421            "k_range must be non-empty".into(),
422        ));
423    }
424    if k_range.start < 2 {
425        return Err(ClusteringError::InvalidInput(
426            "k_range must start at 2 or above".into(),
427        ));
428    }
429
430    let k_values: Vec<usize> = k_range.collect();
431    let mut results: Vec<ConsensusResult> = Vec::with_capacity(k_values.len());
432    let mut prev_area: Option<f64> = None;
433
434    for &k in &k_values {
435        // Use a k-specific seed offset to ensure different runs per k
436        let k_config = ConsensusConfig {
437            seed: config.seed.wrapping_add(k as u64 * 999_983),
438            ..config.clone()
439        };
440        let matrix = build_consensus_matrix(data, k, &base_clusterer, &k_config)?;
441        let assignments = cluster_consensus_matrix(&matrix, k)?;
442        let area = cdf_area(&matrix);
443
444        let delta_k = match prev_area {
445            None => 0.0,
446            Some(prev) => {
447                if prev.abs() < 1e-15 {
448                    0.0
449                } else {
450                    (area - prev) / prev
451                }
452            }
453        };
454
455        results.push(ConsensusResult {
456            consensus_matrix: matrix,
457            assignments,
458            cdf_area: area,
459            delta_k,
460            k,
461        });
462
463        prev_area = Some(area);
464    }
465
466    Ok(results)
467}
468
469// ---------------------------------------------------------------------------
470// Tests
471// ---------------------------------------------------------------------------
472
473#[cfg(test)]
474mod tests {
475    use super::*;
476    use scirs2_core::ndarray::Array2;
477
478    /// A simple deterministic base clusterer that assigns points to clusters by proximity
479    /// to a grid of k equidistant centers along the first feature dimension.
480    fn grid_clusterer(data: &Array2<f64>, k: usize, _seed: u64) -> Vec<usize> {
481        let n = data.nrows();
482        if n == 0 {
483            return vec![];
484        }
485        let min_val = data.column(0).fold(f64::INFINITY, |a, &b| a.min(b));
486        let max_val = data.column(0).fold(f64::NEG_INFINITY, |a, &b| a.max(b));
487        let range = (max_val - min_val).max(1e-10);
488
489        (0..n)
490            .map(|i| {
491                let t = (data[[i, 0]] - min_val) / range;
492                let c = (t * k as f64).floor() as usize;
493                c.min(k - 1)
494            })
495            .collect()
496    }
497
498    fn two_cluster_data() -> Array2<f64> {
499        Array2::from_shape_vec(
500            (12, 2),
501            vec![
502                0.0, 0.0, 0.1, 0.0, 0.0, 0.1, -0.1, 0.0, 0.1, 0.1, -0.1, -0.1, 10.0, 10.0, 10.1,
503                10.0, 10.0, 10.1, 9.9, 10.0, 10.1, 10.1, 9.9, 9.9,
504            ],
505        )
506        .expect("create test data")
507    }
508
509    #[test]
510    fn test_consensus_matrix_symmetry() {
511        let data = two_cluster_data();
512        let config = ConsensusConfig {
513            n_runs: 20,
514            subsample_fraction: 0.8,
515            max_k: 4,
516            seed: 1,
517        };
518        let result = consensus_clustering(&data, 2, grid_clusterer, config)
519            .expect("operation should succeed");
520        let m = &result.consensus_matrix;
521        let n = m.nrows();
522        for i in 0..n {
523            for j in 0..n {
524                assert!(
525                    (m[[i, j]] - m[[j, i]]).abs() < 1e-12,
526                    "Consensus matrix not symmetric at [{},{}]",
527                    i,
528                    j
529                );
530            }
531        }
532    }
533
534    #[test]
535    fn test_consensus_matrix_range() {
536        let data = two_cluster_data();
537        let config = ConsensusConfig {
538            n_runs: 20,
539            subsample_fraction: 0.8,
540            max_k: 4,
541            seed: 2,
542        };
543        let result = consensus_clustering(&data, 2, grid_clusterer, config)
544            .expect("operation should succeed");
545        for &v in result.consensus_matrix.iter() {
546            assert!(
547                v >= 0.0 && v <= 1.0 + 1e-12,
548                "Consensus matrix value {} out of [0,1]",
549                v
550            );
551        }
552    }
553
554    #[test]
555    fn test_consensus_assignments_count() {
556        let data = two_cluster_data();
557        let config = ConsensusConfig {
558            n_runs: 30,
559            subsample_fraction: 0.8,
560            max_k: 4,
561            seed: 3,
562        };
563        let result = consensus_clustering(&data, 2, grid_clusterer, config)
564            .expect("operation should succeed");
565        assert_eq!(result.assignments.len(), 12);
566        assert_eq!(result.k, 2);
567        // All assignments in [0, k)
568        for &a in &result.assignments {
569            assert!(a < 2, "Assignment {} >= k=2", a);
570        }
571    }
572
573    #[test]
574    fn test_consensus_sweep_returns_correct_count() {
575        let data = two_cluster_data();
576        let config = ConsensusConfig {
577            n_runs: 10,
578            subsample_fraction: 0.8,
579            max_k: 6,
580            seed: 4,
581        };
582        let results = consensus_clustering_sweep(&data, 2..5, grid_clusterer, config)
583            .expect("operation should succeed");
584        assert_eq!(results.len(), 3);
585        assert_eq!(results[0].k, 2);
586        assert_eq!(results[1].k, 3);
587        assert_eq!(results[2].k, 4);
588    }
589
590    #[test]
591    fn test_consensus_sweep_delta_k_first_is_zero() {
592        let data = two_cluster_data();
593        let config = ConsensusConfig {
594            n_runs: 10,
595            subsample_fraction: 0.8,
596            max_k: 6,
597            seed: 5,
598        };
599        let results = consensus_clustering_sweep(&data, 2..5, grid_clusterer, config)
600            .expect("operation should succeed");
601        assert_eq!(
602            results[0].delta_k, 0.0,
603            "First delta_k should be 0.0 (no prior k)"
604        );
605    }
606
607    #[test]
608    fn test_consensus_cdf_area_in_range() {
609        let data = two_cluster_data();
610        let config = ConsensusConfig {
611            n_runs: 20,
612            subsample_fraction: 0.8,
613            max_k: 4,
614            seed: 6,
615        };
616        let result = consensus_clustering(&data, 2, grid_clusterer, config)
617            .expect("operation should succeed");
618        assert!(
619            result.cdf_area >= 0.0 && result.cdf_area <= 1.0 + 1e-9,
620            "CDF area {} should be in [0,1]",
621            result.cdf_area
622        );
623    }
624
625    #[test]
626    fn test_consensus_invalid_k() {
627        let data = two_cluster_data();
628        let config = ConsensusConfig::default();
629        let err = consensus_clustering(&data, 1, grid_clusterer, config);
630        assert!(err.is_err(), "k=1 should return error");
631    }
632
633    #[test]
634    fn test_consensus_invalid_subsample() {
635        let data = two_cluster_data();
636        let config = ConsensusConfig {
637            subsample_fraction: 0.0,
638            ..ConsensusConfig::default()
639        };
640        let err = consensus_clustering(&data, 2, grid_clusterer, config);
641        assert!(err.is_err(), "subsample_fraction=0 should return error");
642    }
643}