scirs2_transform/reduction/
diffusion_maps.rs1use scirs2_core::ndarray::{Array1, Array2, ArrayBase, Data, Ix2};
22use scirs2_core::numeric::{Float, NumCast};
23use scirs2_core::validation::{check_positive, checkshape};
24use scirs2_linalg::eigh;
25
26use crate::error::{Result, TransformError};
27
28#[derive(Debug, Clone)]
42pub struct DiffusionMaps {
43 n_components: usize,
45 epsilon: Option<f64>,
47 alpha: f64,
52 diffusion_time: f64,
54 embedding: Option<Array2<f64>>,
56 training_data: Option<Array2<f64>>,
58 eigenvalues: Option<Array1<f64>>,
60 eigenvectors: Option<Array2<f64>>,
62 epsilon_used: Option<f64>,
64 spectral_gaps: Option<Array1<f64>>,
66 row_sums: Option<Array1<f64>>,
68}
69
70impl DiffusionMaps {
71 pub fn new(n_components: usize) -> Self {
76 DiffusionMaps {
77 n_components,
78 epsilon: None,
79 alpha: 0.5,
80 diffusion_time: 1.0,
81 embedding: None,
82 training_data: None,
83 eigenvalues: None,
84 eigenvectors: None,
85 epsilon_used: None,
86 spectral_gaps: None,
87 row_sums: None,
88 }
89 }
90
91 pub fn with_epsilon(mut self, epsilon: f64) -> Self {
93 self.epsilon = Some(epsilon);
94 self
95 }
96
97 pub fn with_alpha(mut self, alpha: f64) -> Self {
103 self.alpha = alpha;
104 self
105 }
106
107 pub fn with_diffusion_time(mut self, t: f64) -> Self {
111 self.diffusion_time = t;
112 self
113 }
114
115 pub fn embedding(&self) -> Option<&Array2<f64>> {
117 self.embedding.as_ref()
118 }
119
120 pub fn eigenvalues(&self) -> Option<&Array1<f64>> {
122 self.eigenvalues.as_ref()
123 }
124
125 pub fn spectral_gaps(&self) -> Option<&Array1<f64>> {
127 self.spectral_gaps.as_ref()
128 }
129
130 pub fn epsilon_used(&self) -> Option<f64> {
132 self.epsilon_used
133 }
134
135 pub fn suggest_dimensionality(&self, max_dim: usize) -> Result<usize> {
146 let eigenvalues = self
147 .eigenvalues
148 .as_ref()
149 .ok_or_else(|| TransformError::NotFitted("Model not fitted".to_string()))?;
150
151 let n_eigs = eigenvalues.len().min(max_dim);
152 if n_eigs < 2 {
153 return Ok(1);
154 }
155
156 let mut max_gap = 0.0;
157 let mut best_dim = 1;
158
159 for i in 0..(n_eigs - 1) {
160 let current = eigenvalues[i].abs();
161 let next = eigenvalues[i + 1].abs();
162
163 if next > 1e-15 {
164 let gap = current / next;
165 if gap > max_gap {
166 max_gap = gap;
167 best_dim = i + 1;
168 }
169 } else {
170 best_dim = i + 1;
172 break;
173 }
174 }
175
176 Ok(best_dim.max(1))
177 }
178
179 fn compute_sq_distances(x: &Array2<f64>) -> Array2<f64> {
181 let n = x.nrows();
182 let d = x.ncols();
183 let mut dist_sq = Array2::zeros((n, n));
184
185 for i in 0..n {
186 for j in (i + 1)..n {
187 let mut sq = 0.0;
188 for k in 0..d {
189 let diff = x[[i, k]] - x[[j, k]];
190 sq += diff * diff;
191 }
192 dist_sq[[i, j]] = sq;
193 dist_sq[[j, i]] = sq;
194 }
195 }
196
197 dist_sq
198 }
199
200 fn estimate_epsilon(dist_sq: &Array2<f64>) -> f64 {
202 let n = dist_sq.nrows();
203 let mut all_dists: Vec<f64> = Vec::with_capacity(n * (n - 1) / 2);
204
205 for i in 0..n {
206 for j in (i + 1)..n {
207 all_dists.push(dist_sq[[i, j]]);
208 }
209 }
210
211 all_dists.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
212
213 let median = all_dists[all_dists.len() / 2];
214 if median < 1e-15 {
215 1.0
216 } else {
217 median
218 }
219 }
220
221 fn construct_diffusion_operator(
228 &self,
229 dist_sq: &Array2<f64>,
230 epsilon: f64,
231 ) -> Result<(Array2<f64>, Array1<f64>)> {
232 let n = dist_sq.nrows();
233
234 let mut k = Array2::zeros((n, n));
236 for i in 0..n {
237 k[[i, i]] = 1.0; for j in (i + 1)..n {
239 let val = (-dist_sq[[i, j]] / epsilon).exp();
240 k[[i, j]] = val;
241 k[[j, i]] = val;
242 }
243 }
244
245 let mut q = Array1::zeros(n);
248 for i in 0..n {
249 q[i] = k.row(i).sum();
250 }
251
252 for i in 0..n {
254 if q[i] < 1e-15 {
255 return Err(TransformError::ComputationError(format!(
256 "Point {} has zero density. Increase epsilon.",
257 i
258 )));
259 }
260 }
261
262 if self.alpha.abs() > 1e-15 {
264 for i in 0..n {
265 let qi_alpha = q[i].powf(self.alpha);
266 for j in 0..n {
267 let qj_alpha = q[j].powf(self.alpha);
268 k[[i, j]] /= qi_alpha * qj_alpha;
269 }
270 }
271 }
272
273 let mut row_sums = Array1::zeros(n);
275 for i in 0..n {
276 row_sums[i] = k.row(i).sum();
277 }
278
279 let mut k_sym = Array2::zeros((n, n));
283 for i in 0..n {
284 let d_i_inv_sqrt = 1.0 / row_sums[i].sqrt();
285 for j in 0..n {
286 let d_j_inv_sqrt = 1.0 / row_sums[j].sqrt();
287 k_sym[[i, j]] = k[[i, j]] * d_i_inv_sqrt * d_j_inv_sqrt;
288 }
289 }
290
291 Ok((k_sym, row_sums))
292 }
293
294 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
299 where
300 S: Data,
301 S::Elem: Float + NumCast,
302 {
303 let (n_samples, n_features) = x.dim();
304
305 check_positive(self.n_components, "n_components")?;
306 checkshape(x, &[n_samples, n_features], "x")?;
307
308 if self.n_components >= n_samples {
309 return Err(TransformError::InvalidInput(format!(
310 "n_components={} must be < n_samples={}",
311 self.n_components, n_samples
312 )));
313 }
314
315 if self.diffusion_time < 0.0 {
316 return Err(TransformError::InvalidInput(
317 "diffusion_time must be non-negative".to_string(),
318 ));
319 }
320
321 let x_f64 = x.mapv(|v| NumCast::from(v).unwrap_or(0.0));
322
323 let dist_sq = Self::compute_sq_distances(&x_f64);
325
326 let epsilon = self
328 .epsilon
329 .unwrap_or_else(|| Self::estimate_epsilon(&dist_sq));
330
331 let (k_sym, row_sums) = self.construct_diffusion_operator(&dist_sq, epsilon)?;
333
334 let (eigenvalues, eigenvectors) =
336 eigh(&k_sym.view(), None).map_err(TransformError::LinalgError)?;
337
338 let mut indices: Vec<usize> = (0..n_samples).collect();
340 indices.sort_by(|&i, &j| {
341 eigenvalues[j]
342 .partial_cmp(&eigenvalues[i])
343 .unwrap_or(std::cmp::Ordering::Equal)
344 });
345
346 let n_eigs = self.n_components.min(n_samples - 1);
349 let mut selected_eigenvalues = Array1::zeros(n_eigs);
350 let mut selected_eigenvectors = Array2::zeros((n_samples, n_eigs));
351
352 for j in 0..n_eigs {
353 let idx = indices[j + 1]; selected_eigenvalues[j] = eigenvalues[idx].max(0.0);
355
356 for i in 0..n_samples {
358 let d_inv_sqrt = 1.0 / row_sums[i].sqrt();
359 selected_eigenvectors[[i, j]] = eigenvectors[[i, idx]] * d_inv_sqrt;
360 }
361 }
362
363 let mut embedding = Array2::zeros((n_samples, n_eigs));
365 for i in 0..n_samples {
366 for j in 0..n_eigs {
367 let scale = selected_eigenvalues[j].powf(self.diffusion_time);
368 embedding[[i, j]] = scale * selected_eigenvectors[[i, j]];
369 }
370 }
371
372 let mut spectral_gaps = Array1::zeros(n_eigs.saturating_sub(1));
374 for j in 0..(n_eigs.saturating_sub(1)) {
375 let next = selected_eigenvalues[j + 1].abs();
376 if next > 1e-15 {
377 spectral_gaps[j] = selected_eigenvalues[j] / next;
378 } else {
379 spectral_gaps[j] = f64::INFINITY;
380 }
381 }
382
383 self.embedding = Some(embedding);
384 self.training_data = Some(x_f64);
385 self.eigenvalues = Some(selected_eigenvalues);
386 self.eigenvectors = Some(selected_eigenvectors);
387 self.epsilon_used = Some(epsilon);
388 self.spectral_gaps = Some(spectral_gaps);
389 self.row_sums = Some(row_sums);
390
391 Ok(())
392 }
393
394 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
396 where
397 S: Data,
398 S::Elem: Float + NumCast,
399 {
400 let training_data = self
401 .training_data
402 .as_ref()
403 .ok_or_else(|| TransformError::NotFitted("Model not fitted".to_string()))?;
404
405 let x_f64 = x.mapv(|v| NumCast::from(v).unwrap_or(0.0));
406
407 if self.is_same_data(&x_f64, training_data) {
408 return self
409 .embedding
410 .as_ref()
411 .cloned()
412 .ok_or_else(|| TransformError::NotFitted("Embedding not available".to_string()));
413 }
414
415 self.nystrom_extension(&x_f64)
416 }
417
418 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
420 where
421 S: Data,
422 S::Elem: Float + NumCast,
423 {
424 self.fit(x)?;
425 self.transform(x)
426 }
427
428 fn nystrom_extension(&self, x_new: &Array2<f64>) -> Result<Array2<f64>> {
430 let training_data = self
431 .training_data
432 .as_ref()
433 .ok_or_else(|| TransformError::NotFitted("Training data not available".to_string()))?;
434 let eigenvectors = self
435 .eigenvectors
436 .as_ref()
437 .ok_or_else(|| TransformError::NotFitted("Eigenvectors not available".to_string()))?;
438 let eigenvalues = self
439 .eigenvalues
440 .as_ref()
441 .ok_or_else(|| TransformError::NotFitted("Eigenvalues not available".to_string()))?;
442 let row_sums = self
443 .row_sums
444 .as_ref()
445 .ok_or_else(|| TransformError::NotFitted("Row sums not available".to_string()))?;
446
447 let epsilon = self
448 .epsilon_used
449 .ok_or_else(|| TransformError::NotFitted("Epsilon not available".to_string()))?;
450
451 let n_new = x_new.nrows();
452 let n_train = training_data.nrows();
453 let n_features = training_data.ncols();
454 let n_eigs = eigenvalues.len();
455
456 if x_new.ncols() != n_features {
457 return Err(TransformError::InvalidInput(format!(
458 "Input features {} must match training features {}",
459 x_new.ncols(),
460 n_features
461 )));
462 }
463
464 let mut k_new = Array2::zeros((n_new, n_train));
466 for i in 0..n_new {
467 for j in 0..n_train {
468 let mut dist_sq = 0.0;
469 for d in 0..n_features {
470 let diff = x_new[[i, d]] - training_data[[j, d]];
471 dist_sq += diff * diff;
472 }
473 k_new[[i, j]] = (-dist_sq / epsilon).exp();
474 }
475 }
476
477 if self.alpha.abs() > 1e-15 {
479 let mut q_new = Array1::zeros(n_new);
481 for i in 0..n_new {
482 q_new[i] = k_new.row(i).sum();
483 }
484
485 for i in 0..n_new {
486 let qi_alpha = if q_new[i] > 1e-15 {
487 q_new[i].powf(self.alpha)
488 } else {
489 1.0
490 };
491 for j in 0..n_train {
492 let qj_alpha = row_sums[j].powf(self.alpha);
493 k_new[[i, j]] /= qi_alpha * qj_alpha;
494 }
495 }
496 }
497
498 let mut new_row_sums = Array1::zeros(n_new);
500 for i in 0..n_new {
501 new_row_sums[i] = k_new.row(i).sum();
502 }
503
504 let mut new_embedding = Array2::zeros((n_new, n_eigs));
506 for i in 0..n_new {
507 let d_new = new_row_sums[i];
508 if d_new < 1e-15 {
509 continue;
510 }
511
512 for c in 0..n_eigs {
513 let lambda = eigenvalues[c];
514 if lambda.abs() < 1e-15 {
515 continue;
516 }
517
518 let mut sum = 0.0;
519 for j in 0..n_train {
520 let p_ij = k_new[[i, j]] / d_new;
521 sum += p_ij * eigenvectors[[j, c]];
522 }
523
524 let scale = lambda.powf(self.diffusion_time);
526 new_embedding[[i, c]] = scale * sum / lambda;
527 }
528 }
529
530 Ok(new_embedding)
531 }
532
533 fn is_same_data(&self, x: &Array2<f64>, training_data: &Array2<f64>) -> bool {
535 if x.dim() != training_data.dim() {
536 return false;
537 }
538 let (n, m) = x.dim();
539 for i in 0..n {
540 for j in 0..m {
541 if (x[[i, j]] - training_data[[i, j]]).abs() > 1e-10 {
542 return false;
543 }
544 }
545 }
546 true
547 }
548}
549
550#[cfg(test)]
551mod tests {
552 use super::*;
553 use scirs2_core::ndarray::Array;
554
555 fn make_manifold_data(n: usize) -> Array2<f64> {
556 let mut data = Vec::with_capacity(n * 3);
557 for i in 0..n {
558 let t = 2.0 * std::f64::consts::PI * i as f64 / n as f64;
559 let r = 2.0 + 0.5 * (3.0 * t).sin();
560 data.push(r * t.cos());
561 data.push(r * t.sin());
562 data.push(0.5 * t);
563 }
564 Array::from_shape_vec((n, 3), data).expect("Failed to create data")
565 }
566
567 #[test]
568 fn test_diffusion_maps_basic() {
569 let data = make_manifold_data(30);
570 let mut dm = DiffusionMaps::new(2);
571 let embedding = dm.fit_transform(&data).expect("DM fit_transform failed");
572
573 assert_eq!(embedding.shape(), &[30, 2]);
574 for val in embedding.iter() {
575 assert!(val.is_finite());
576 }
577 }
578
579 #[test]
580 fn test_diffusion_maps_custom_epsilon() {
581 let data = make_manifold_data(25);
582 let mut dm = DiffusionMaps::new(2).with_epsilon(2.0);
583 let embedding = dm.fit_transform(&data).expect("DM fit_transform failed");
584
585 assert_eq!(embedding.shape(), &[25, 2]);
586 for val in embedding.iter() {
587 assert!(val.is_finite());
588 }
589 }
590
591 #[test]
592 fn test_diffusion_maps_alpha_zero() {
593 let data = make_manifold_data(20);
595 let mut dm = DiffusionMaps::new(2).with_alpha(0.0);
596 let embedding = dm.fit_transform(&data).expect("DM fit_transform failed");
597
598 assert_eq!(embedding.shape(), &[20, 2]);
599 for val in embedding.iter() {
600 assert!(val.is_finite());
601 }
602 }
603
604 #[test]
605 fn test_diffusion_maps_alpha_one() {
606 let data = make_manifold_data(20);
608 let mut dm = DiffusionMaps::new(2).with_alpha(1.0);
609 let embedding = dm.fit_transform(&data).expect("DM fit_transform failed");
610
611 assert_eq!(embedding.shape(), &[20, 2]);
612 for val in embedding.iter() {
613 assert!(val.is_finite());
614 }
615 }
616
617 #[test]
618 fn test_diffusion_maps_large_time() {
619 let data = make_manifold_data(20);
620 let mut dm = DiffusionMaps::new(2).with_diffusion_time(5.0);
621 let embedding = dm.fit_transform(&data).expect("DM fit_transform failed");
622
623 assert_eq!(embedding.shape(), &[20, 2]);
624 for val in embedding.iter() {
625 assert!(val.is_finite());
626 }
627 }
628
629 #[test]
630 fn test_diffusion_maps_eigenvalues() {
631 let data = make_manifold_data(25);
632 let mut dm = DiffusionMaps::new(5);
633 dm.fit(&data).expect("DM fit failed");
634
635 let eigenvalues = dm.eigenvalues().expect("Eigenvalues should exist");
636 assert_eq!(eigenvalues.len(), 5);
637
638 for i in 0..eigenvalues.len() {
640 assert!(
641 eigenvalues[i] >= -1e-10,
642 "Eigenvalue {} should be >= 0, got {}",
643 i,
644 eigenvalues[i]
645 );
646 if i > 0 {
647 assert!(
648 eigenvalues[i] <= eigenvalues[i - 1] + 1e-10,
649 "Eigenvalues should be decreasing"
650 );
651 }
652 }
653 }
654
655 #[test]
656 fn test_diffusion_maps_spectral_gaps() {
657 let data = make_manifold_data(25);
658 let mut dm = DiffusionMaps::new(5);
659 dm.fit(&data).expect("DM fit failed");
660
661 let gaps = dm.spectral_gaps().expect("Spectral gaps should exist");
662 assert_eq!(gaps.len(), 4);
663
664 for &gap in gaps.iter() {
666 if gap.is_finite() {
667 assert!(gap > 0.0, "Spectral gap should be positive, got {}", gap);
668 }
669 }
670 }
671
672 #[test]
673 fn test_diffusion_maps_suggest_dimensionality() {
674 let data = make_manifold_data(25);
675 let mut dm = DiffusionMaps::new(10);
676 dm.fit(&data).expect("DM fit failed");
677
678 let suggested = dm.suggest_dimensionality(10).expect("Suggestion failed");
679 assert!(suggested >= 1);
680 assert!(suggested <= 10);
681 }
682
683 #[test]
684 fn test_diffusion_maps_out_of_sample() {
685 let data = make_manifold_data(30);
686 let mut dm = DiffusionMaps::new(2);
687 dm.fit(&data).expect("DM fit failed");
688
689 let new_data =
690 Array::from_shape_vec((3, 3), vec![1.0, 2.0, 0.5, -1.0, 2.0, 1.0, 0.0, -2.0, 1.5])
691 .expect("Failed");
692
693 let new_embedding = dm.transform(&new_data).expect("DM transform failed");
694 assert_eq!(new_embedding.shape(), &[3, 2]);
695 for val in new_embedding.iter() {
696 assert!(val.is_finite());
697 }
698 }
699
700 #[test]
701 fn test_diffusion_maps_epsilon_used() {
702 let data = make_manifold_data(20);
703 let mut dm = DiffusionMaps::new(2);
704 dm.fit(&data).expect("DM fit failed");
705
706 let eps = dm.epsilon_used().expect("Epsilon should be set");
707 assert!(eps > 0.0);
708 assert!(eps.is_finite());
709 }
710
711 #[test]
712 fn test_diffusion_maps_invalid_params() {
713 let data = make_manifold_data(5);
714
715 let mut dm = DiffusionMaps::new(10);
716 assert!(dm.fit(&data).is_err());
717 }
718
719 #[test]
720 fn test_diffusion_maps_not_fitted() {
721 let dm = DiffusionMaps::new(2);
722 let data = make_manifold_data(10);
723 assert!(dm.transform(&data).is_err());
724 }
725
726 #[test]
727 fn test_diffusion_maps_linear_data() {
728 let mut data_vec = Vec::new();
730 for i in 0..20 {
731 let t = i as f64 / 20.0;
732 data_vec.extend_from_slice(&[t, 2.0 * t, 3.0 * t]);
733 }
734 let data = Array::from_shape_vec((20, 3), data_vec).expect("Failed");
735
736 let mut dm = DiffusionMaps::new(3);
737 let embedding = dm.fit_transform(&data).expect("DM fit_transform failed");
738
739 assert_eq!(embedding.shape(), &[20, 3]);
740 for val in embedding.iter() {
741 assert!(val.is_finite());
742 }
743 }
744
745 #[test]
746 fn test_diffusion_maps_multi_scale() {
747 let data = make_manifold_data(25);
749
750 let mut dm1 = DiffusionMaps::new(2).with_diffusion_time(0.5);
751 let emb1 = dm1.fit_transform(&data).expect("DM t=0.5 failed");
752
753 let mut dm2 = DiffusionMaps::new(2).with_diffusion_time(5.0);
754 let emb2 = dm2.fit_transform(&data).expect("DM t=5.0 failed");
755
756 assert_eq!(emb1.shape(), &[25, 2]);
757 assert_eq!(emb2.shape(), &[25, 2]);
758
759 let mut any_diff = false;
761 for i in 0..25 {
762 for j in 0..2 {
763 if (emb1[[i, j]] - emb2[[i, j]]).abs() > 1e-10 {
764 any_diff = true;
765 break;
766 }
767 }
768 if any_diff {
769 break;
770 }
771 }
772 assert!(
773 any_diff,
774 "Different diffusion times should give different embeddings"
775 );
776 }
777}