1use scirs2_core::ndarray::Array2;
16
17use crate::error::{ClusteringError, Result};
18
19#[derive(Debug, Clone)]
21pub struct ConsensusConfig {
22 pub n_runs: usize,
24 pub subsample_fraction: f64,
26 pub max_k: usize,
28 pub seed: u64,
30}
31
32impl Default for ConsensusConfig {
33 fn default() -> Self {
34 Self {
35 n_runs: 100,
36 subsample_fraction: 0.8,
37 max_k: 10,
38 seed: 42,
39 }
40 }
41}
42
43#[derive(Debug, Clone)]
45pub struct ConsensusResult {
46 pub consensus_matrix: Array2<f64>,
49 pub assignments: Vec<usize>,
51 pub cdf_area: f64,
53 pub delta_k: f64,
55 pub k: usize,
57}
58
59#[derive(Clone)]
65struct Lcg {
66 state: u64,
67}
68
69impl Lcg {
70 fn new(seed: u64) -> Self {
71 Self {
72 state: seed.wrapping_add(1),
73 }
74 }
75
76 fn next_f64(&mut self) -> f64 {
78 self.state = self
79 .state
80 .wrapping_mul(6_364_136_223_846_793_005)
81 .wrapping_add(1_442_695_040_888_963_407);
82 (self.state >> 11) as f64 / (1u64 << 53) as f64
84 }
85
86 fn sample_indices(&mut self, n: usize, k: usize) -> Vec<usize> {
88 let mut pool: Vec<usize> = (0..n).collect();
89 for i in 0..k {
90 let j = i + (self.next_f64() * (n - i) as f64) as usize;
91 let j = j.min(n - 1);
92 pool.swap(i, j);
93 }
94 pool[..k].to_vec()
95 }
96}
97
98fn build_consensus_matrix<F>(
108 data: &Array2<f64>,
109 k: usize,
110 base_clusterer: &F,
111 config: &ConsensusConfig,
112) -> Result<Array2<f64>>
113where
114 F: Fn(&Array2<f64>, usize, u64) -> Vec<usize>,
115{
116 let n = data.nrows();
117 let d = data.ncols();
118
119 if n == 0 || d == 0 {
120 return Err(ClusteringError::InvalidInput(
121 "Data matrix must be non-empty".into(),
122 ));
123 }
124 if k < 2 {
125 return Err(ClusteringError::InvalidInput("k must be at least 2".into()));
126 }
127 if config.subsample_fraction <= 0.0 || config.subsample_fraction > 1.0 {
128 return Err(ClusteringError::InvalidInput(
129 "subsample_fraction must be in (0, 1]".into(),
130 ));
131 }
132 if config.n_runs == 0 {
133 return Err(ClusteringError::InvalidInput(
134 "n_runs must be at least 1".into(),
135 ));
136 }
137
138 let sub_n = ((n as f64 * config.subsample_fraction).round() as usize).max(k);
139 let sub_n = sub_n.min(n);
140
141 let mut co_counts = vec![0u32; n * n];
142 let mut indicator = vec![0u32; n * n];
143
144 let mut rng = Lcg::new(config.seed);
145
146 for run in 0..config.n_runs {
147 let run_seed = config
148 .seed
149 .wrapping_add(run as u64)
150 .wrapping_mul(2_654_435_761);
151
152 let idx = rng.sample_indices(n, sub_n);
154
155 let mut sub_data = Array2::<f64>::zeros((sub_n, d));
157 for (row_out, &row_in) in idx.iter().enumerate() {
158 for col in 0..d {
159 sub_data[[row_out, col]] = data[[row_in, col]];
160 }
161 }
162
163 let sub_labels = base_clusterer(&sub_data, k, run_seed);
165
166 if sub_labels.len() != sub_n {
167 return Err(ClusteringError::ComputationError(format!(
168 "Base clusterer returned {} labels but expected {}",
169 sub_labels.len(),
170 sub_n
171 )));
172 }
173
174 for (a, &ia) in idx.iter().enumerate() {
176 for (b, &ib) in idx.iter().enumerate() {
177 indicator[ia * n + ib] += 1;
178 if sub_labels[a] == sub_labels[b] {
179 co_counts[ia * n + ib] += 1;
180 }
181 }
182 }
183 }
184
185 let mut matrix = Array2::<f64>::zeros((n, n));
187 for i in 0..n {
188 for j in 0..n {
189 let ind = indicator[i * n + j];
190 matrix[[i, j]] = if ind > 0 {
191 co_counts[i * n + j] as f64 / ind as f64
192 } else {
193 0.0
194 };
195 }
196 }
197
198 Ok(matrix)
199}
200
201fn cdf_area(matrix: &Array2<f64>) -> f64 {
205 let n = matrix.nrows();
206 let mut vals: Vec<f64> = Vec::with_capacity(n * (n - 1) / 2);
207
208 for i in 0..n {
209 for j in (i + 1)..n {
210 vals.push(matrix[[i, j]]);
211 }
212 }
213
214 if vals.is_empty() {
215 return 0.0;
216 }
217
218 vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
219
220 let m = vals.len();
221 if m == 1 {
222 return vals[0];
223 }
224
225 let step = 1.0 / (m - 1) as f64;
228 let mut area = 0.0_f64;
229 for (rank, &v) in vals.iter().enumerate() {
230 let cdf = (rank as f64) / ((m - 1) as f64);
231 if rank + 1 < m {
235 let cdf_next = (rank + 1) as f64 / (m - 1) as f64;
236 let _ = cdf;
237 let _ = cdf_next;
238 let avg_cdf = (rank as f64 + 0.5) / (m - 1) as f64;
242 area += step * (1.0 - avg_cdf);
243 }
244 }
245
246 area
248}
249
250fn cluster_consensus_matrix(matrix: &Array2<f64>, k: usize) -> Result<Vec<usize>> {
253 let n = matrix.nrows();
254 if k == 0 || k > n {
255 return Err(ClusteringError::InvalidInput(format!(
256 "k={} out of valid range [1, {}]",
257 k, n
258 )));
259 }
260 if k == n {
261 return Ok((0..n).collect());
262 }
263 if k == 1 {
264 return Ok(vec![0; n]);
265 }
266
267 let mut clusters: Vec<Vec<usize>> = (0..n).map(|i| vec![i]).collect();
269
270 while clusters.len() > k {
271 let nc = clusters.len();
272 let mut best_sim = f64::NEG_INFINITY;
273 let mut best_ci = 0;
274 let mut best_cj = 1;
275
276 for ci in 0..nc {
277 for cj in (ci + 1)..nc {
278 let mut sim_sum = 0.0_f64;
280 let mut count = 0usize;
281 for &a in &clusters[ci] {
282 for &b in &clusters[cj] {
283 sim_sum += matrix[[a, b]];
284 count += 1;
285 }
286 }
287 let avg_sim = if count > 0 {
288 sim_sum / count as f64
289 } else {
290 0.0
291 };
292 if avg_sim > best_sim {
293 best_sim = avg_sim;
294 best_ci = ci;
295 best_cj = cj;
296 }
297 }
298 }
299
300 let removed = clusters.remove(best_cj);
302 clusters[best_ci].extend(removed);
303 }
304
305 let mut labels = vec![0usize; n];
307 for (cluster_idx, members) in clusters.iter().enumerate() {
308 for &point_idx in members {
309 labels[point_idx] = cluster_idx;
310 }
311 }
312 Ok(labels)
313}
314
315pub fn consensus_clustering<F>(
356 data: &Array2<f64>,
357 k: usize,
358 base_clusterer: F,
359 config: ConsensusConfig,
360) -> Result<ConsensusResult>
361where
362 F: Fn(&Array2<f64>, usize, u64) -> Vec<usize>,
363{
364 let matrix = build_consensus_matrix(data, k, &base_clusterer, &config)?;
365 let assignments = cluster_consensus_matrix(&matrix, k)?;
366 let area = cdf_area(&matrix);
367
368 Ok(ConsensusResult {
369 consensus_matrix: matrix,
370 assignments,
371 cdf_area: area,
372 delta_k: 0.0, k,
374 })
375}
376
377pub fn consensus_clustering_sweep<F>(
411 data: &Array2<f64>,
412 k_range: std::ops::Range<usize>,
413 base_clusterer: F,
414 config: ConsensusConfig,
415) -> Result<Vec<ConsensusResult>>
416where
417 F: Fn(&Array2<f64>, usize, u64) -> Vec<usize>,
418{
419 if k_range.is_empty() {
420 return Err(ClusteringError::InvalidInput(
421 "k_range must be non-empty".into(),
422 ));
423 }
424 if k_range.start < 2 {
425 return Err(ClusteringError::InvalidInput(
426 "k_range must start at 2 or above".into(),
427 ));
428 }
429
430 let k_values: Vec<usize> = k_range.collect();
431 let mut results: Vec<ConsensusResult> = Vec::with_capacity(k_values.len());
432 let mut prev_area: Option<f64> = None;
433
434 for &k in &k_values {
435 let k_config = ConsensusConfig {
437 seed: config.seed.wrapping_add(k as u64 * 999_983),
438 ..config.clone()
439 };
440 let matrix = build_consensus_matrix(data, k, &base_clusterer, &k_config)?;
441 let assignments = cluster_consensus_matrix(&matrix, k)?;
442 let area = cdf_area(&matrix);
443
444 let delta_k = match prev_area {
445 None => 0.0,
446 Some(prev) => {
447 if prev.abs() < 1e-15 {
448 0.0
449 } else {
450 (area - prev) / prev
451 }
452 }
453 };
454
455 results.push(ConsensusResult {
456 consensus_matrix: matrix,
457 assignments,
458 cdf_area: area,
459 delta_k,
460 k,
461 });
462
463 prev_area = Some(area);
464 }
465
466 Ok(results)
467}
468
469#[cfg(test)]
474mod tests {
475 use super::*;
476 use scirs2_core::ndarray::Array2;
477
478 fn grid_clusterer(data: &Array2<f64>, k: usize, _seed: u64) -> Vec<usize> {
481 let n = data.nrows();
482 if n == 0 {
483 return vec![];
484 }
485 let min_val = data.column(0).fold(f64::INFINITY, |a, &b| a.min(b));
486 let max_val = data.column(0).fold(f64::NEG_INFINITY, |a, &b| a.max(b));
487 let range = (max_val - min_val).max(1e-10);
488
489 (0..n)
490 .map(|i| {
491 let t = (data[[i, 0]] - min_val) / range;
492 let c = (t * k as f64).floor() as usize;
493 c.min(k - 1)
494 })
495 .collect()
496 }
497
498 fn two_cluster_data() -> Array2<f64> {
499 Array2::from_shape_vec(
500 (12, 2),
501 vec![
502 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, -0.1, 0.0, 0.1, 0.1, -0.1, -0.1, 10.0, 10.0, 10.1,
503 10.0, 10.0, 10.1, 9.9, 10.0, 10.1, 10.1, 9.9, 9.9,
504 ],
505 )
506 .expect("create test data")
507 }
508
509 #[test]
510 fn test_consensus_matrix_symmetry() {
511 let data = two_cluster_data();
512 let config = ConsensusConfig {
513 n_runs: 20,
514 subsample_fraction: 0.8,
515 max_k: 4,
516 seed: 1,
517 };
518 let result = consensus_clustering(&data, 2, grid_clusterer, config)
519 .expect("operation should succeed");
520 let m = &result.consensus_matrix;
521 let n = m.nrows();
522 for i in 0..n {
523 for j in 0..n {
524 assert!(
525 (m[[i, j]] - m[[j, i]]).abs() < 1e-12,
526 "Consensus matrix not symmetric at [{},{}]",
527 i,
528 j
529 );
530 }
531 }
532 }
533
534 #[test]
535 fn test_consensus_matrix_range() {
536 let data = two_cluster_data();
537 let config = ConsensusConfig {
538 n_runs: 20,
539 subsample_fraction: 0.8,
540 max_k: 4,
541 seed: 2,
542 };
543 let result = consensus_clustering(&data, 2, grid_clusterer, config)
544 .expect("operation should succeed");
545 for &v in result.consensus_matrix.iter() {
546 assert!(
547 v >= 0.0 && v <= 1.0 + 1e-12,
548 "Consensus matrix value {} out of [0,1]",
549 v
550 );
551 }
552 }
553
554 #[test]
555 fn test_consensus_assignments_count() {
556 let data = two_cluster_data();
557 let config = ConsensusConfig {
558 n_runs: 30,
559 subsample_fraction: 0.8,
560 max_k: 4,
561 seed: 3,
562 };
563 let result = consensus_clustering(&data, 2, grid_clusterer, config)
564 .expect("operation should succeed");
565 assert_eq!(result.assignments.len(), 12);
566 assert_eq!(result.k, 2);
567 for &a in &result.assignments {
569 assert!(a < 2, "Assignment {} >= k=2", a);
570 }
571 }
572
573 #[test]
574 fn test_consensus_sweep_returns_correct_count() {
575 let data = two_cluster_data();
576 let config = ConsensusConfig {
577 n_runs: 10,
578 subsample_fraction: 0.8,
579 max_k: 6,
580 seed: 4,
581 };
582 let results = consensus_clustering_sweep(&data, 2..5, grid_clusterer, config)
583 .expect("operation should succeed");
584 assert_eq!(results.len(), 3);
585 assert_eq!(results[0].k, 2);
586 assert_eq!(results[1].k, 3);
587 assert_eq!(results[2].k, 4);
588 }
589
590 #[test]
591 fn test_consensus_sweep_delta_k_first_is_zero() {
592 let data = two_cluster_data();
593 let config = ConsensusConfig {
594 n_runs: 10,
595 subsample_fraction: 0.8,
596 max_k: 6,
597 seed: 5,
598 };
599 let results = consensus_clustering_sweep(&data, 2..5, grid_clusterer, config)
600 .expect("operation should succeed");
601 assert_eq!(
602 results[0].delta_k, 0.0,
603 "First delta_k should be 0.0 (no prior k)"
604 );
605 }
606
607 #[test]
608 fn test_consensus_cdf_area_in_range() {
609 let data = two_cluster_data();
610 let config = ConsensusConfig {
611 n_runs: 20,
612 subsample_fraction: 0.8,
613 max_k: 4,
614 seed: 6,
615 };
616 let result = consensus_clustering(&data, 2, grid_clusterer, config)
617 .expect("operation should succeed");
618 assert!(
619 result.cdf_area >= 0.0 && result.cdf_area <= 1.0 + 1e-9,
620 "CDF area {} should be in [0,1]",
621 result.cdf_area
622 );
623 }
624
625 #[test]
626 fn test_consensus_invalid_k() {
627 let data = two_cluster_data();
628 let config = ConsensusConfig::default();
629 let err = consensus_clustering(&data, 1, grid_clusterer, config);
630 assert!(err.is_err(), "k=1 should return error");
631 }
632
633 #[test]
634 fn test_consensus_invalid_subsample() {
635 let data = two_cluster_data();
636 let config = ConsensusConfig {
637 subsample_fraction: 0.0,
638 ..ConsensusConfig::default()
639 };
640 let err = consensus_clustering(&data, 2, grid_clusterer, config);
641 assert!(err.is_err(), "subsample_fraction=0 should return error");
642 }
643}