1use super::core::*;
8use crate::error::{ClusteringError, Result};
9use crate::metrics::{adjusted_rand_index, silhouette_score};
10use scirs2_core::ndarray::{Array1, Array2, ArrayView2};
11use scirs2_core::numeric::{Float, FromPrimitive};
12use scirs2_core::random::prelude::*;
13use std::collections::{HashMap, HashSet};
14use std::fmt::Debug;
15
/// Ensemble clustering driver: runs many diverse base clusterings over the
/// same data set and merges them into a single consensus labeling.
pub struct EnsembleClusterer<F: Float> {
    // Controls estimator count, sampling, diversity and consensus strategy.
    config: EnsembleConfig,
    // Ties the element type `F` to the struct without storing any `F` value.
    phantom: std::marker::PhantomData<F>,
}
21
22impl<
23 F: Float + FromPrimitive + Debug + 'static + std::iter::Sum + std::fmt::Display + Send + Sync,
24 > EnsembleClusterer<F>
25where
26 f64: From<F>,
27{
    /// Creates an ensemble clusterer from the given configuration.
    pub fn new(config: EnsembleConfig) -> Self {
        Self {
            config,
            phantom: std::marker::PhantomData,
        }
    }
35
    /// Runs the full ensemble pipeline on `data`: generate diverse base
    /// clusterings, drop low-quality ones, build a consensus labeling, and
    /// score the result.
    ///
    /// # Errors
    /// Propagates failures from the base clusterings or from consensus
    /// construction (e.g. when every individual result was filtered out).
    pub fn fit(&self, data: ArrayView2<F>) -> Result<EnsembleResult> {
        let start_time = std::time::Instant::now();

        // 1. Run the configured number of diverse base clusterings.
        let individual_results = self.generate_diverse_clusterings(data)?;

        // 2. Optionally discard results below the quality threshold.
        let filtered_results = self.filter_by_quality(&individual_results);

        // 3. Merge the surviving labelings into a single consensus.
        let consensus_labels = self.build_consensus(&filtered_results, data)?;

        // 4. Derive summary statistics for the ensemble.
        let consensus_stats =
            self.calculate_consensus_statistics(&filtered_results, &consensus_labels)?;
        let diversity_metrics = self.calculate_diversity_metrics(&filtered_results)?;

        // Silhouette of the consensus on an f64 copy of the data; a scoring
        // failure degrades to 0.0 rather than aborting the fit.
        let data_f64 = data.mapv(|x| x.to_f64().unwrap_or(0.0));
        let ensemble_quality =
            silhouette_score(data_f64.view(), consensus_labels.view()).unwrap_or(0.0);

        let stability_score = self.calculate_consensus_stability_score(&consensus_stats);

        // Total wall-clock time is measured but currently unused/not returned.
        let total_time = start_time.elapsed().as_secs_f64();

        Ok(EnsembleResult {
            consensus_labels,
            individual_results: filtered_results,
            consensus_stats,
            diversity_metrics,
            ensemble_quality,
            stability_score,
        })
    }
73
74 fn generate_diverse_clusterings(&self, data: ArrayView2<F>) -> Result<Vec<ClusteringResult>> {
76 let mut results = Vec::new();
77 let mut rng = match self.config.random_seed {
78 Some(seed) => scirs2_core::random::rngs::StdRng::seed_from_u64(seed),
79 None => scirs2_core::random::rngs::StdRng::seed_from_u64(42),
80 };
81
82 for i in 0..self.config.n_estimators {
83 let clustering_start = std::time::Instant::now();
84
85 let (sampled_data, sample_indices) = self.apply_sampling_strategy(data, &mut rng)?;
87
88 let (algorithm, parameters) = self.select_algorithm_and_parameters(i, &mut rng)?;
90
91 let mut labels = self.run_clustering(&sampled_data, &algorithm, ¶meters)?;
93
94 if sample_indices.len() != data.nrows() {
96 labels = self.map_labels_to_full_data(&labels, &sample_indices, data.nrows())?;
97 }
98
99 let data_f64 = data.mapv(|x| x.to_f64().unwrap_or(0.0));
101 let quality_score = silhouette_score(data_f64.view(), labels.view()).unwrap_or(-1.0);
102
103 let runtime = clustering_start.elapsed().as_secs_f64();
104 let n_clusters = self.count_clusters(&labels);
105
106 let result = ClusteringResult {
107 labels,
108 algorithm: format!("{:?}", algorithm),
109 parameters,
110 quality_score,
111 stability_score: None,
112 n_clusters,
113 runtime,
114 };
115
116 results.push(result);
117 }
118
119 Ok(results)
120 }
121
122 fn apply_sampling_strategy(
124 &self,
125 data: ArrayView2<F>,
126 rng: &mut scirs2_core::random::rngs::StdRng,
127 ) -> Result<(Array2<F>, Vec<usize>)> {
128 let n_samples = data.nrows();
129 let n_features = data.ncols();
130
131 match &self.config.sampling_strategy {
132 SamplingStrategy::Bootstrap { sample_ratio } => {
133 let sample_size = (n_samples as f64 * sample_ratio) as usize;
134 let mut indices = Vec::new();
135
136 for _ in 0..sample_size {
137 indices.push(rng.gen_range(0..n_samples));
138 }
139
140 let sampled_data = self.extract_samples(data, &indices)?;
141 Ok((sampled_data, indices))
142 }
143 SamplingStrategy::RandomSubspace { feature_ratio } => {
144 let n_selected_features = (n_features as f64 * feature_ratio) as usize;
145 let mut featureindices: Vec<usize> = (0..n_features).collect();
146 featureindices.shuffle(rng);
147 featureindices.truncate(n_selected_features);
148
149 let sample_indices: Vec<usize> = (0..n_samples).collect();
150 let sampled_data = self.extract_features(data, &featureindices)?;
151 Ok((sampled_data, sample_indices))
152 }
153 SamplingStrategy::BootstrapSubspace {
154 sample_ratio,
155 feature_ratio,
156 } => {
157 let sample_size = (n_samples as f64 * sample_ratio) as usize;
159 let mut sample_indices = Vec::new();
160
161 for _ in 0..sample_size {
162 sample_indices.push(rng.gen_range(0..n_samples));
163 }
164
165 let n_selected_features = (n_features as f64 * feature_ratio) as usize;
167 let mut featureindices: Vec<usize> = (0..n_features).collect();
168 featureindices.shuffle(rng);
169 featureindices.truncate(n_selected_features);
170
171 let bootstrap_data = self.extract_samples(data, &sample_indices)?;
172 let sampled_data = self.extract_features(bootstrap_data.view(), &featureindices)?;
173
174 Ok((sampled_data, sample_indices))
175 }
176 SamplingStrategy::NoiseInjection {
177 noise_level,
178 noise_type,
179 } => {
180 let sample_indices: Vec<usize> = (0..n_samples).collect();
181 let mut noisy_data = data.to_owned();
182
183 match noise_type {
184 NoiseType::Gaussian => {
185 for i in 0..n_samples {
186 for j in 0..n_features {
187 let noise = F::from(rng.gen::<f64>() * 2.0 - 1.0).unwrap()
188 * F::from(*noise_level).unwrap();
189 noisy_data[[i, j]] = noisy_data[[i, j]] + noise;
190 }
191 }
192 }
193 NoiseType::Uniform => {
194 for i in 0..n_samples {
195 for j in 0..n_features {
196 let noise =
197 F::from((rng.gen::<f64>() * 2.0 - 1.0) * noise_level).unwrap();
198 noisy_data[[i, j]] = noisy_data[[i, j]] + noise;
199 }
200 }
201 }
202 NoiseType::Outliers { outlier_ratio } => {
203 let n_outliers = (n_samples as f64 * outlier_ratio) as usize;
204 for _ in 0..n_outliers {
205 let outlier_idx = rng.gen_range(0..n_samples);
206 for j in 0..n_features {
207 let outlier_value = F::from(rng.gen::<f64>() * 10.0 - 5.0).unwrap();
208 noisy_data[[outlier_idx, j]] = outlier_value;
209 }
210 }
211 }
212 }
213
214 Ok((noisy_data, sample_indices))
215 }
216 SamplingStrategy::None => {
217 let sample_indices: Vec<usize> = (0..n_samples).collect();
218 Ok((data.to_owned(), sample_indices))
219 }
220 SamplingStrategy::RandomProjection { target_dimensions } => {
221 let n_features = data.ncols();
222 if *target_dimensions >= n_features {
223 let sample_indices: Vec<usize> = (0..n_samples).collect();
225 return Ok((data.to_owned(), sample_indices));
226 }
227
228 let mut rng = match self.config.random_seed {
230 Some(seed) => scirs2_core::random::rngs::StdRng::seed_from_u64(seed),
231 None => scirs2_core::random::rngs::StdRng::seed_from_u64(
232 scirs2_core::random::random(),
233 ),
234 };
235
236 let mut projection_matrix = Array2::zeros((n_features, *target_dimensions));
238 for i in 0..n_features {
239 for j in 0..*target_dimensions {
240 let random_val = F::from(rng.gen::<f64>()).unwrap();
242 let two = F::from(2.0).unwrap();
243 let one = F::from(1.0).unwrap();
244 projection_matrix[[i, j]] = random_val * two - one;
245 }
246 }
247
248 for j in 0..*target_dimensions {
250 let col_norm = projection_matrix.column(j).mapv(|x| x * x).sum().sqrt();
251 if col_norm > F::zero() {
252 for i in 0..n_features {
253 projection_matrix[[i, j]] = projection_matrix[[i, j]] / col_norm;
254 }
255 }
256 }
257
258 let projected_data = data.dot(&projection_matrix);
260 let sample_indices: Vec<usize> = (0..n_samples).collect();
261
262 Ok((projected_data, sample_indices))
263 }
264 }
265 }
266
267 fn extract_samples(&self, data: ArrayView2<F>, indices: &[usize]) -> Result<Array2<F>> {
269 let n_features = data.ncols();
270 let mut sampled_data = Array2::zeros((indices.len(), n_features));
271
272 for (new_idx, &orig_idx) in indices.iter().enumerate() {
273 if orig_idx >= data.nrows() {
274 return Err(ClusteringError::InvalidInput(
275 "Sample index out of bounds".to_string(),
276 ));
277 }
278 sampled_data.row_mut(new_idx).assign(&data.row(orig_idx));
279 }
280
281 Ok(sampled_data)
282 }
283
284 fn extract_features(&self, data: ArrayView2<F>, featureindices: &[usize]) -> Result<Array2<F>> {
286 let n_samples = data.nrows();
287 let mut feature_data = Array2::zeros((n_samples, featureindices.len()));
288
289 for (new_idx, &orig_idx) in featureindices.iter().enumerate() {
290 if orig_idx >= data.ncols() {
291 return Err(ClusteringError::InvalidInput(
292 "Feature index out of bounds".to_string(),
293 ));
294 }
295 feature_data
296 .column_mut(new_idx)
297 .assign(&data.column(orig_idx));
298 }
299
300 Ok(feature_data)
301 }
302
303 fn apply_consensus(
305 &self,
306 results: &[ClusteringResult],
307 data: ArrayView2<F>,
308 ) -> Result<EnsembleResult> {
309 match &self.config.consensus_method {
310 ConsensusMethod::MajorityVoting => self.majority_voting_consensus(results, data),
311 ConsensusMethod::WeightedConsensus => self.weighted_consensus(results, data),
312 ConsensusMethod::GraphBased {
313 similarity_threshold,
314 } => {
315 let result = self.graph_based_consensus(results, data, *similarity_threshold)?;
316 Ok(result)
317 }
318 ConsensusMethod::CoAssociation { threshold } => {
319 let result = self.co_association_consensus(results, data, *threshold)?;
320 Ok(result)
321 }
322 ConsensusMethod::EvidenceAccumulation => {
323 let result = self.evidence_accumulation_consensus(results, data)?;
324 Ok(result)
325 }
326 ConsensusMethod::Hierarchical { linkage_method } => {
327 self.hierarchical_consensus(results, data, linkage_method)
328 }
329 }
330 }
331
332 fn majority_voting_consensus(
334 &self,
335 results: &[ClusteringResult],
336 data: ArrayView2<F>,
337 ) -> Result<EnsembleResult> {
338 let n_samples = data.nrows();
339 let mut consensus_labels = Array1::zeros(n_samples);
340 let mut vote_matrix = HashMap::new();
341
342 for result in results {
344 for (sample_idx, &cluster_label) in result.labels.iter().enumerate() {
345 let entry = vote_matrix.entry(sample_idx).or_insert_with(HashMap::new);
346 *entry.entry(cluster_label).or_insert(0) += 1;
347 }
348 }
349
350 for sample_idx in 0..n_samples {
352 if let Some(votes) = vote_matrix.get(&sample_idx) {
353 let most_voted_cluster = votes
354 .iter()
355 .max_by_key(|(_, &count)| count)
356 .map(|(&cluster_, _)| cluster_)
357 .unwrap_or(0);
358 consensus_labels[sample_idx] = most_voted_cluster;
359 }
360 }
361
362 let avg_quality_score =
364 results.iter().map(|r| r.quality_score).sum::<f64>() / results.len() as f64;
365 let consensus_stats = self.calculate_consensus_statistics(results, &consensus_labels)?;
366 let diversity_metrics = self.calculate_diversity_metrics(results)?;
367 let stability_score = self.calculate_consensus_stability_score(&consensus_stats);
368
369 Ok(EnsembleResult {
370 consensus_labels,
371 individual_results: results.to_vec(),
372 consensus_stats,
373 diversity_metrics,
374 ensemble_quality: avg_quality_score,
375 stability_score,
376 })
377 }
378
379 fn weighted_consensus(
381 &self,
382 results: &[ClusteringResult],
383 data: ArrayView2<F>,
384 ) -> Result<EnsembleResult> {
385 let n_samples = data.nrows();
386 let mut consensus_labels = Array1::zeros(n_samples);
387 let mut weighted_vote_matrix = HashMap::new();
388
389 for result in results {
391 let weight = result.quality_score.max(0.0); for (sample_idx, &cluster_label) in result.labels.iter().enumerate() {
393 let entry = weighted_vote_matrix
394 .entry(sample_idx)
395 .or_insert_with(HashMap::new);
396 *entry.entry(cluster_label).or_insert(0.0) += weight;
397 }
398 }
399
400 for sample_idx in 0..n_samples {
402 if let Some(votes) = weighted_vote_matrix.get(&sample_idx) {
403 let most_voted_cluster = votes
404 .iter()
405 .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
406 .map(|(&cluster_, _)| cluster_)
407 .unwrap_or(0);
408 consensus_labels[sample_idx] = most_voted_cluster;
409 }
410 }
411
412 let total_weight: f64 = results.iter().map(|r| r.quality_score.max(0.0)).sum();
414 let ensemble_score = if total_weight > 0.0 {
415 results
416 .iter()
417 .map(|r| r.quality_score * r.quality_score.max(0.0))
418 .sum::<f64>()
419 / total_weight
420 } else {
421 0.0
422 };
423
424 let consensus_stats = self.calculate_consensus_statistics(results, &consensus_labels)?;
425 let diversity_metrics = self.calculate_diversity_metrics(results)?;
426 let stability_score = self.calculate_consensus_stability_score(&consensus_stats);
427
428 Ok(EnsembleResult {
429 consensus_labels,
430 individual_results: results.to_vec(),
431 consensus_stats,
432 diversity_metrics,
433 ensemble_quality: ensemble_score,
434 stability_score,
435 })
436 }
437
    /// Graph-based consensus: builds a co-association matrix (fraction of
    /// results that placed each pair of samples in the same cluster),
    /// thresholds it into a similarity graph, and labels each connected
    /// component of that graph as one consensus cluster.
    fn graph_based_consensus(
        &self,
        results: &[ClusteringResult],
        data: ArrayView2<F>,
        similarity_threshold: f64,
    ) -> Result<EnsembleResult> {
        let n_samples = data.nrows();

        // Symmetric co-occurrence counts accumulated over all results.
        let mut co_association = Array2::zeros((n_samples, n_samples));

        for result in results {
            for i in 0..n_samples {
                for j in i + 1..n_samples {
                    if result.labels[i] == result.labels[j] {
                        co_association[[i, j]] += 1.0;
                        co_association[[j, i]] += 1.0;
                    }
                }
            }
        }

        // Normalize counts to fractions in [0, 1].
        co_association /= results.len() as f64;

        // Keep only edges at or above the similarity threshold.
        let mut similarity_graph = Array2::zeros((n_samples, n_samples));
        for i in 0..n_samples {
            for j in 0..n_samples {
                if co_association[[i, j]] >= similarity_threshold {
                    similarity_graph[[i, j]] = co_association[[i, j]];
                }
            }
        }

        // Connected-component labeling; -1 marks "not yet labeled".
        // (The Vec named `queue` is popped from the back, so the traversal
        // is depth-first, which is fine for component discovery.)
        let mut consensus_labels = Array1::from_elem(n_samples, -1i32);
        let mut current_cluster = 0i32;
        let mut visited = vec![false; n_samples];

        for i in 0..n_samples {
            if !visited[i] {
                let mut queue = vec![i];
                visited[i] = true;
                consensus_labels[i] = current_cluster;

                while let Some(node) = queue.pop() {
                    for j in 0..n_samples {
                        if !visited[j] && similarity_graph[[node, j]] > 0.0 {
                            visited[j] = true;
                            consensus_labels[j] = current_cluster;
                            queue.push(j);
                        }
                    }
                }
                current_cluster += 1;
            }
        }

        // Ensemble quality: unweighted mean of the individual quality scores.
        let avg_quality_score =
            results.iter().map(|r| r.quality_score).sum::<f64>() / results.len() as f64;
        let consensus_stats = self.calculate_consensus_statistics(results, &consensus_labels)?;
        let diversity_metrics = self.calculate_diversity_metrics(results)?;
        let stability_score = self.calculate_consensus_stability_score(&consensus_stats);

        Ok(EnsembleResult {
            consensus_labels,
            individual_results: results.to_vec(),
            consensus_stats,
            diversity_metrics,
            ensemble_quality: avg_quality_score,
            stability_score,
        })
    }
514
    /// Co-association consensus: identical to graph-based consensus, with the
    /// co-association fraction itself used as the similarity and `threshold`
    /// as the cut-off.
    fn co_association_consensus(
        &self,
        results: &[ClusteringResult],
        data: ArrayView2<F>,
        threshold: f64,
    ) -> Result<EnsembleResult> {
        self.graph_based_consensus(results, data, threshold)
    }
525
    /// Evidence-accumulation consensus: delegates to the hierarchical
    /// consensus over the co-association distance matrix, requesting "ward"
    /// linkage.
    fn evidence_accumulation_consensus(
        &self,
        results: &[ClusteringResult],
        data: ArrayView2<F>,
    ) -> Result<EnsembleResult> {
        self.hierarchical_consensus(results, data, "ward")
    }
535
    /// Consensus via the co-association distance matrix: converts pairwise
    /// co-association fractions into distances (1 - fraction), then greedily
    /// groups every sample within a fixed 0.5 distance of a seed sample.
    ///
    /// NOTE(review): `linkage_method` is currently ignored — this is a greedy
    /// single-pass threshold grouping, not a true hierarchical linkage.
    /// Confirm whether a real linkage implementation is intended here.
    fn hierarchical_consensus(
        &self,
        results: &[ClusteringResult],
        data: ArrayView2<F>,
        linkage_method: &str,
    ) -> Result<EnsembleResult> {
        let n_samples = data.nrows();

        // Symmetric co-occurrence counts accumulated over all results.
        let mut co_association: Array2<f64> = Array2::zeros((n_samples, n_samples));

        for result in results {
            for i in 0..n_samples {
                for j in i + 1..n_samples {
                    if result.labels[i] == result.labels[j] {
                        co_association[[i, j]] += 1.0;
                        co_association[[j, i]] += 1.0;
                    }
                }
            }
        }

        // Distance = 1 - co-association fraction; self-distance forced to 0.
        let mut distance_matrix = Array2::ones((n_samples, n_samples));
        for i in 0..n_samples {
            for j in 0..n_samples {
                distance_matrix[[i, j]] = 1.0 - (co_association[[i, j]] / results.len() as f64);
            }
            distance_matrix[[i, i]] = 0.0;
        }

        // Greedy grouping: each unassigned sample seeds a cluster and absorbs
        // all later samples within the fixed distance threshold.
        let threshold = 0.5;
        let mut consensus_labels = Array1::from_elem(n_samples, -1i32);
        let mut current_cluster = 0i32;
        let mut assigned = vec![false; n_samples];

        for i in 0..n_samples {
            if !assigned[i] {
                consensus_labels[i] = current_cluster;
                assigned[i] = true;

                for j in (i + 1)..n_samples {
                    if !assigned[j] && distance_matrix[[i, j]] <= threshold {
                        consensus_labels[j] = current_cluster;
                        assigned[j] = true;
                    }
                }
                current_cluster += 1;
            }
        }

        // Ensemble quality: unweighted mean of the individual quality scores.
        let avg_quality_score =
            results.iter().map(|r| r.quality_score).sum::<f64>() / results.len() as f64;
        let consensus_stats = self.calculate_consensus_statistics(results, &consensus_labels)?;
        let diversity_metrics = self.calculate_diversity_metrics(results)?;
        let stability_score = self.calculate_consensus_stability_score(&consensus_stats);

        Ok(EnsembleResult {
            consensus_labels,
            individual_results: results.to_vec(),
            consensus_stats,
            diversity_metrics,
            ensemble_quality: avg_quality_score,
            stability_score,
        })
    }
606
607 fn calculate_diversity_score(&self, results: &[ClusteringResult]) -> f64 {
609 if results.len() < 2 {
610 return 0.0;
611 }
612
613 let mut total_diversity = 0.0;
614 let mut count = 0;
615
616 for i in 0..results.len() {
617 for j in (i + 1)..results.len() {
618 if let Ok(ari) =
620 adjusted_rand_index::<f64>(results[i].labels.view(), results[j].labels.view())
621 {
622 total_diversity += 1.0 - ari; count += 1;
624 }
625 }
626 }
627
628 if count > 0 {
629 total_diversity / count as f64
630 } else {
631 0.0
632 }
633 }
634
635 fn calculate_agreement_ratio(&self, results: &[ClusteringResult]) -> f64 {
637 if results.len() < 2 {
638 return 1.0;
639 }
640
641 let n_samples = results[0].labels.len();
642 let mut total_agreements = 0;
643 let mut total_pairs = 0;
644
645 for i in 0..results.len() {
646 for j in (i + 1)..results.len() {
647 for sample_idx in 0..n_samples {
648 if results[i].labels[sample_idx] == results[j].labels[sample_idx] {
649 total_agreements += 1;
650 }
651 total_pairs += 1;
652 }
653 }
654 }
655
656 if total_pairs > 0 {
657 total_agreements as f64 / total_pairs as f64
658 } else {
659 0.0
660 }
661 }
662
663 fn calculate_confidence_scores(
665 &self,
666 vote_matrix: &HashMap<usize, HashMap<i32, usize>>,
667 n_samples: usize,
668 ) -> Vec<f64> {
669 let mut confidence_scores = vec![0.0; n_samples];
670
671 for sample_idx in 0..n_samples {
672 if let Some(votes) = vote_matrix.get(&sample_idx) {
673 let total_votes: usize = votes.values().sum();
674 let max_votes = votes.values().max().copied().unwrap_or(0);
675
676 if total_votes > 0 {
677 confidence_scores[sample_idx] = max_votes as f64 / total_votes as f64;
678 }
679 }
680 }
681
682 confidence_scores
683 }
684
685 fn calculate_weighted_confidence_scores(
687 &self,
688 vote_matrix: &HashMap<usize, HashMap<i32, f64>>,
689 n_samples: usize,
690 ) -> Vec<f64> {
691 let mut confidence_scores = vec![0.0; n_samples];
692
693 for sample_idx in 0..n_samples {
694 if let Some(votes) = vote_matrix.get(&sample_idx) {
695 let total_votes: f64 = votes.values().sum();
696 let max_votes = votes.values().fold(0.0, |acc, &x| acc.max(x));
697
698 if total_votes > 0.0 {
699 confidence_scores[sample_idx] = max_votes / total_votes;
700 }
701 }
702 }
703
704 confidence_scores
705 }
706
707 fn calculate_cluster_diversity(&self, results: &[ClusteringResult]) -> f64 {
709 let cluster_counts: Vec<usize> = results.iter().map(|r| r.n_clusters).collect();
710
711 if cluster_counts.is_empty() {
712 return 0.0;
713 }
714
715 let mean_clusters =
716 cluster_counts.iter().sum::<usize>() as f64 / cluster_counts.len() as f64;
717 let variance = cluster_counts
718 .iter()
719 .map(|&x| (x as f64 - mean_clusters).powi(2))
720 .sum::<f64>()
721 / cluster_counts.len() as f64;
722
723 variance.sqrt() / mean_clusters }
725
726 fn calculate_algorithm_diversity(&self, results: &[ClusteringResult]) -> f64 {
728 let unique_algorithms: HashSet<String> =
729 results.iter().map(|r| r.algorithm.clone()).collect();
730
731 unique_algorithms.len() as f64 / results.len() as f64
732 }
733
734 fn count_unique_clusters(&self, labels: &Array1<i32>) -> usize {
736 let mut unique_labels = HashSet::new();
737 for &label in labels {
738 unique_labels.insert(label);
739 }
740 unique_labels.len()
741 }
742
    /// Chooses the algorithm and parameter set for estimator
    /// `estimator_index` according to the configured diversity strategy:
    ///
    /// - Algorithm diversity: round-robin over the configured algorithms,
    ///   with freshly randomized parameters each time.
    /// - Parameter diversity: a fixed algorithm with parameters drawn from
    ///   the configured ranges.
    /// - Anything else: k-means with a random k in [2, 10].
    fn select_algorithm_and_parameters(
        &self,
        estimator_index: usize,
        rng: &mut scirs2_core::random::rngs::StdRng,
    ) -> Result<(ClusteringAlgorithm, HashMap<String, String>)> {
        match &self.config.diversity_strategy {
            Some(DiversityStrategy::AlgorithmDiversity { algorithms }) => {
                // Cycle deterministically through the configured algorithms.
                let algorithm = algorithms[estimator_index % algorithms.len()].clone();
                let parameters = self.generate_random_parameters(&algorithm, rng)?;
                Ok((algorithm, parameters))
            }
            Some(DiversityStrategy::ParameterDiversity {
                algorithm,
                parameter_ranges,
            }) => {
                let parameters = self.sample_parameter_ranges(parameter_ranges, rng)?;
                Ok((algorithm.clone(), parameters))
            }
            _ => {
                // Default fallback: k-means with a random cluster count.
                let k = rng.gen_range(2..=10);
                let algorithm = ClusteringAlgorithm::KMeans { k_range: (k, k) };
                let mut parameters = HashMap::new();
                parameters.insert("k".to_string(), k.to_string());
                Ok((algorithm, parameters))
            }
        }
    }
772
773 fn generate_random_parameters(
775 &self,
776 algorithm: &ClusteringAlgorithm,
777 rng: &mut scirs2_core::random::rngs::StdRng,
778 ) -> Result<HashMap<String, String>> {
779 let mut parameters = HashMap::new();
780
781 match algorithm {
782 ClusteringAlgorithm::KMeans { k_range } => {
783 let k = rng.gen_range(k_range.0..=k_range.1);
784 parameters.insert("k".to_string(), k.to_string());
785 }
786 ClusteringAlgorithm::DBSCAN {
787 eps_range,
788 min_samples_range,
789 } => {
790 let eps = rng.gen_range(eps_range.0..=eps_range.1);
791 let min_samples = rng.gen_range(min_samples_range.0..=min_samples_range.1);
792 parameters.insert("eps".to_string(), eps.to_string());
793 parameters.insert("min_samples".to_string(), min_samples.to_string());
794 }
795 ClusteringAlgorithm::MeanShift { bandwidth_range } => {
796 let bandwidth = rng.gen_range(bandwidth_range.0..=bandwidth_range.1);
797 parameters.insert("bandwidth".to_string(), bandwidth.to_string());
798 }
799 ClusteringAlgorithm::Hierarchical { methods } => {
800 let method = &methods[rng.gen_range(0..methods.len())];
801 parameters.insert("method".to_string(), method.clone());
802 }
803 ClusteringAlgorithm::Spectral { k_range } => {
804 let k = rng.gen_range(k_range.0..=k_range.1);
805 parameters.insert("k".to_string(), k.to_string());
806 }
807 ClusteringAlgorithm::AffinityPropagation { damping_range } => {
808 let damping = rng.gen_range(damping_range.0..=damping_range.1);
809 parameters.insert("damping".to_string(), damping.to_string());
810 }
811 }
812
813 Ok(parameters)
814 }
815
816 fn sample_parameter_ranges(
818 &self,
819 parameter_ranges: &HashMap<String, ParameterRange>,
820 rng: &mut scirs2_core::random::rngs::StdRng,
821 ) -> Result<HashMap<String, String>> {
822 let mut parameters = HashMap::new();
823
824 for (param_name, range) in parameter_ranges {
825 let value = match range {
826 ParameterRange::Integer(min, max) => rng.gen_range(*min..=*max).to_string(),
827 ParameterRange::Float(min, max) => rng.gen_range(*min..=*max).to_string(),
828 ParameterRange::Categorical(choices) => {
829 choices[rng.gen_range(0..choices.len())].clone()
830 }
831 ParameterRange::Boolean => rng.gen_bool(0.5).to_string(),
832 };
833 parameters.insert(param_name.clone(), value);
834 }
835
836 Ok(parameters)
837 }
838
    /// Executes one base clustering on `data` using the selected algorithm
    /// and its string-encoded parameters, returning integer labels.
    ///
    /// NOTE(review): only KMeans and AffinityPropagation are genuinely
    /// implemented; every other algorithm variant currently falls back to
    /// k-means. Clusterer failures never propagate — they degrade to a
    /// trivial labeling instead.
    fn run_clustering(
        &self,
        data: &Array2<F>,
        algorithm: &ClusteringAlgorithm,
        parameters: &HashMap<String, String>,
    ) -> Result<Array1<i32>> {
        // f64 copy of the data; currently unused below (the clusterers operate
        // on the generic `data` directly) — candidate for removal.
        let data_f64 = data.mapv(|x| x.to_f64().unwrap_or(0.0));

        match algorithm {
            ClusteringAlgorithm::KMeans { .. } => {
                // Cluster count comes from the sampled parameters (default 3).
                let k = parameters
                    .get("k")
                    .and_then(|s| s.parse().ok())
                    .unwrap_or(3);

                use crate::vq::kmeans2;
                match kmeans2(
                    data.view(),
                    k,
                    Some(100), // iteration cap
                    None,
                    None,
                    None,
                    Some(false),
                    None,
                ) {
                    Ok((_, labels)) => Ok(labels.mapv(|x| x as i32)),
                    Err(_) => {
                        // Fallback: assign samples round-robin to k clusters.
                        let n_samples = data.nrows();
                        let labels = Array1::from_shape_fn(n_samples, |i| (i % k) as i32);
                        Ok(labels)
                    }
                }
            }
            ClusteringAlgorithm::AffinityPropagation { .. } => {
                // Parameters fall back to library-typical defaults when absent.
                let damping = parameters
                    .get("damping")
                    .and_then(|s| s.parse().ok())
                    .unwrap_or(0.5);
                let max_iter = parameters
                    .get("max_iter")
                    .and_then(|s| s.parse().ok())
                    .unwrap_or(200);
                let convergence_iter = parameters
                    .get("convergence_iter")
                    .and_then(|s| s.parse().ok())
                    .unwrap_or(15);

                use crate::affinity::{affinity_propagation, AffinityPropagationOptions};
                let options = AffinityPropagationOptions {
                    damping: F::from(damping).unwrap(),
                    max_iter,
                    convergence_iter,
                    preference: None, // let the algorithm derive preferences
                    affinity: "euclidean".to_string(),
                    max_affinity_iterations: max_iter, // reuse the iteration budget
                };

                match affinity_propagation(data.view(), false, Some(options)) {
                    Ok((_, labels)) => Ok(labels),
                    Err(_) => {
                        // Fallback: put every sample into a single cluster 0.
                        Ok(Array1::zeros(data.nrows()).mapv(|_: f64| 0i32))
                    }
                }
            }
            _ => {
                // All remaining algorithm variants fall back to k-means.
                let k = parameters
                    .get("k")
                    .and_then(|s| s.parse().ok())
                    .unwrap_or(3);

                use crate::vq::kmeans2;
                match kmeans2(
                    data.view(),
                    k,
                    Some(100),
                    None,
                    None,
                    None,
                    Some(false),
                    None,
                ) {
                    Ok((_, labels)) => Ok(labels.mapv(|x| x as i32)),
                    Err(_) => Ok(Array1::zeros(data.nrows()).mapv(|_: f64| 0i32)),
                }
            }
        }
    }
933
934 fn count_clusters(&self, labels: &Array1<i32>) -> usize {
936 let mut unique_labels = std::collections::HashSet::new();
937 for &label in labels {
938 unique_labels.insert(label);
939 }
940 unique_labels.len()
941 }
942
943 fn filter_by_quality(&self, results: &[ClusteringResult]) -> Vec<ClusteringResult> {
945 if let Some(threshold) = self.config.quality_threshold {
946 results
947 .iter()
948 .filter(|r| r.quality_score >= threshold)
949 .cloned()
950 .collect()
951 } else {
952 results.to_vec()
953 }
954 }
955
956 fn map_labels_to_full_data(
958 &self,
959 labels: &Array1<i32>,
960 sample_indices: &[usize],
961 full_size: usize,
962 ) -> Result<Array1<i32>> {
963 let mut full_labels = Array1::from_elem(full_size, -1); for (sample_idx, &label) in sample_indices.iter().zip(labels.iter()) {
966 if *sample_idx < full_size {
967 full_labels[*sample_idx] = label;
968 }
969 }
970
971 for i in 0..full_size {
973 if full_labels[i] == -1 {
974 full_labels[i] = 0; }
976 }
977
978 Ok(full_labels)
979 }
980
981 fn build_consensus(
983 &self,
984 results: &[ClusteringResult],
985 data: ArrayView2<F>,
986 ) -> Result<Array1<i32>> {
987 if results.is_empty() {
988 return Err(ClusteringError::InvalidInput(
989 "No clustering results available for consensus".to_string(),
990 ));
991 }
992
993 let n_samples = data.nrows();
994
995 match &self.config.consensus_method {
996 ConsensusMethod::MajorityVoting => {
997 let result = self.majority_voting_consensus(results, data)?;
998 Ok(result.consensus_labels)
999 }
1000 ConsensusMethod::WeightedConsensus => {
1001 let result = self.weighted_consensus(results, data)?;
1002 Ok(result.consensus_labels)
1003 }
1004 ConsensusMethod::CoAssociation { threshold } => {
1005 let result = self.co_association_consensus(results, data, *threshold)?;
1006 Ok(result.consensus_labels)
1007 }
1008 ConsensusMethod::EvidenceAccumulation => {
1009 let result = self.evidence_accumulation_consensus(results, data)?;
1010 Ok(result.consensus_labels)
1011 }
1012 ConsensusMethod::GraphBased {
1013 similarity_threshold,
1014 } => {
1015 let result = self.graph_based_consensus(results, data, *similarity_threshold)?;
1016 Ok(result.consensus_labels)
1017 }
1018 ConsensusMethod::Hierarchical { linkage_method } => {
1019 let result = self.hierarchical_consensus(results, data, linkage_method)?;
1020 Ok(result.consensus_labels)
1021 }
1022 }
1023 }
1024
    /// Estimates a cluster count from a linkage matrix by locating the
    /// largest jump between consecutive merge distances and cutting the
    /// dendrogram there. The result is capped at `config.max_clusters`
    /// (default cap 10).
    ///
    /// NOTE(review): assumes a SciPy-style linkage layout where column 2
    /// holds the merge distance — confirm against the producer of this
    /// matrix.
    fn estimate_optimal_clusters(&self, linkagematrix: &Array2<f64>) -> usize {
        let mut max_gap = 0.0;
        let mut optimal_clusters = 2;

        for i in 1..linkagematrix.nrows() {
            // A large gap between consecutive merge distances suggests the
            // natural place to stop merging.
            let gap = linkagematrix[[i, 2]] - linkagematrix[[i - 1, 2]];
            if gap > max_gap {
                max_gap = gap;
                // Cutting before merge `i` leaves (nrows - i + 1) clusters.
                optimal_clusters = linkagematrix.nrows() - i + 1;
            }
        }

        optimal_clusters.min(self.config.max_clusters.unwrap_or(10))
    }
1041
1042 fn calculate_diversity_metrics(
1044 &self,
1045 results: &[ClusteringResult],
1046 ) -> Result<DiversityMetrics> {
1047 Ok(DiversityMetrics {
1048 average_diversity: 0.5, diversity_matrix: Array2::eye(results.len()), algorithm_distribution: HashMap::new(), parameter_diversity: HashMap::new(), })
1053 }
1054
    /// Builds consensus statistics for the given results and consensus labels.
    ///
    /// NOTE(review): placeholder implementation — the agreement matrix,
    /// strengths, stabilities and counts are constants and do not reflect the
    /// inputs (note the fixed length-10 `cluster_stability`). Replace with a
    /// real computation before relying on these figures.
    fn calculate_consensus_statistics(
        &self,
        _results: &[ClusteringResult],
        _consensus_labels: &Array1<i32>,
    ) -> Result<ConsensusStatistics> {
        let n_samples = _consensus_labels.len();

        Ok(ConsensusStatistics {
            agreement_matrix: Array2::zeros((n_samples, n_samples)),
            consensus_strength: Array1::ones(n_samples),
            cluster_stability: vec![0.5; 10], // placeholder: fixed length and value
            agreement_counts: Array1::ones(n_samples),
        })
    }
1071
    /// Overall stability of the consensus.
    ///
    /// NOTE(review): placeholder — always returns 0.5 regardless of the
    /// statistics; implement once `calculate_consensus_statistics` produces
    /// real values.
    fn calculate_consensus_stability_score(&self, _consensusstats: &ConsensusStatistics) -> f64 {
        0.5
    }
1076}
1077
1078fn extract_samples<F: Float>(data: ArrayView2<F>, indices: &[usize]) -> Result<Array2<F>> {
1080 let n_features = data.ncols();
1081 let mut sampled_data = Array2::zeros((indices.len(), n_features));
1082
1083 for (new_idx, &old_idx) in indices.iter().enumerate() {
1084 if old_idx < data.nrows() {
1085 sampled_data.row_mut(new_idx).assign(&data.row(old_idx));
1086 }
1087 }
1088
1089 Ok(sampled_data)
1090}
1091
1092fn extract_features<F: Float>(data: ArrayView2<F>, featureindices: &[usize]) -> Result<Array2<F>> {
1094 let n_samples = data.nrows();
1095 let mut sampled_data = Array2::zeros((n_samples, featureindices.len()));
1096
1097 for (new_feat_idx, &old_feat_idx) in featureindices.iter().enumerate() {
1098 if old_feat_idx < data.ncols() {
1099 sampled_data
1100 .column_mut(new_feat_idx)
1101 .assign(&data.column(old_feat_idx));
1102 }
1103 }
1104
1105 Ok(sampled_data)
1106}
1107
impl Default for EnsembleConfig {
    /// Reasonable starting configuration: 10 estimators over 80% bootstrap
    /// samples, majority-vote consensus, and algorithm diversity alternating
    /// between k-means and DBSCAN.
    fn default() -> Self {
        Self {
            n_estimators: 10,
            sampling_strategy: SamplingStrategy::Bootstrap { sample_ratio: 0.8 },
            consensus_method: ConsensusMethod::MajorityVoting,
            random_seed: None, // nondeterministic unless a seed is supplied
            diversity_strategy: Some(DiversityStrategy::AlgorithmDiversity {
                algorithms: vec![
                    ClusteringAlgorithm::KMeans { k_range: (2, 10) },
                    ClusteringAlgorithm::DBSCAN {
                        eps_range: (0.1, 1.0),
                        min_samples_range: (3, 10),
                    },
                ],
            }),
            quality_threshold: None, // keep every individual result
            max_clusters: Some(20),
        }
    }
}