sklears_manifold/
diffusion_maps.rs

1//! Diffusion Maps implementation
2//!
3//! This module provides Diffusion Maps for non-linear dimensionality reduction through diffusion processes.
4
5use scirs2_core::ndarray::{Array2, ArrayView2};
6use scirs2_linalg::compat::{ArrayLinalgExt, UPLO};
7use sklears_core::{
8    error::{Result as SklResult, SklearsError},
9    traits::{Estimator, Fit, Transform, Untrained},
10    types::Float,
11};
12
13/// Diffusion Maps
14///
15/// Diffusion Maps is a nonlinear dimensionality reduction technique that
16/// captures the intrinsic geometry of data through the eigenfunctions of
17/// a Markov chain. It constructs a diffusion process on the data and uses
18/// the eigenvectors of the diffusion operator to embed the data.
19///
20/// # Parameters
21///
22/// * `n_components` - Number of coordinates for the manifold
23/// * `diffusion_time` - Diffusion time parameter (t)
24/// * `epsilon` - Bandwidth parameter for Gaussian kernel
25/// * `alpha` - Normalization parameter (0 for symmetric, 1 for row-stochastic)
26/// * `eigen_solver` - The eigensolver to use
27/// * `random_state` - Random state for reproducibility
28/// * `n_jobs` - Number of parallel jobs
29///
30/// # Examples
31///
32/// ```
33/// use sklears_manifold::DiffusionMaps;
34/// use sklears_core::traits::{Transform, Fit};
35/// use scirs2_core::ndarray::array;
36///
37/// let x = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0], [10.0, 11.0, 12.0]];
38///
39/// let dm = DiffusionMaps::new()
40///     .n_components(2)
41///     .epsilon(1.0);
42/// let fitted = dm.fit(&x.view(), &()).unwrap();
43/// let embedded = fitted.transform(&x.view()).unwrap();
44/// ```
45#[derive(Debug, Clone)]
46pub struct DiffusionMaps<S = Untrained> {
47    state: S,
48    n_components: usize,
49    diffusion_time: usize,
50    epsilon: Option<f64>,
51    alpha: f64,
52    eigen_solver: String,
53    random_state: Option<u64>,
54    n_jobs: Option<i32>,
55}
56
57/// Trained state for Diffusion Maps
58#[derive(Debug, Clone)]
59pub struct DiffusionMapsTrained {
60    /// The low-dimensional embedding of the training data
61    pub embedding: Array2<f64>,
62    /// Eigenvalues of the diffusion operator
63    pub eigenvalues: Array2<f64>,
64    /// Eigenvectors of the diffusion operator
65    pub eigenvectors: Array2<f64>,
66    /// Affinity matrix used for diffusion
67    pub affinity_matrix: Array2<f64>,
68    /// Bandwidth parameter used
69    pub epsilon: f64,
70}
71
72impl DiffusionMaps<Untrained> {
73    /// Create a new DiffusionMaps instance
74    pub fn new() -> Self {
75        Self {
76            state: Untrained,
77            n_components: 2,
78            diffusion_time: 1,
79            epsilon: None,
80            alpha: 0.5,
81            eigen_solver: "auto".to_string(),
82            random_state: None,
83            n_jobs: None,
84        }
85    }
86
87    /// Set the number of components
88    pub fn n_components(mut self, n_components: usize) -> Self {
89        self.n_components = n_components;
90        self
91    }
92
93    /// Set the diffusion time
94    pub fn diffusion_time(mut self, diffusion_time: usize) -> Self {
95        self.diffusion_time = diffusion_time;
96        self
97    }
98
99    /// Set the epsilon parameter
100    pub fn epsilon(mut self, epsilon: f64) -> Self {
101        self.epsilon = Some(epsilon);
102        self
103    }
104
105    /// Set the alpha parameter
106    pub fn alpha(mut self, alpha: f64) -> Self {
107        self.alpha = alpha;
108        self
109    }
110
111    /// Set the eigen solver
112    pub fn eigen_solver(mut self, eigen_solver: &str) -> Self {
113        self.eigen_solver = eigen_solver.to_string();
114        self
115    }
116
117    /// Set the random state
118    pub fn random_state(mut self, random_state: Option<u64>) -> Self {
119        self.random_state = random_state;
120        self
121    }
122
123    /// Set the number of jobs
124    pub fn n_jobs(mut self, n_jobs: Option<i32>) -> Self {
125        self.n_jobs = n_jobs;
126        self
127    }
128}
129
130impl Default for DiffusionMaps<Untrained> {
131    fn default() -> Self {
132        Self::new()
133    }
134}
135
136impl Estimator for DiffusionMaps<Untrained> {
137    type Config = ();
138    type Error = SklearsError;
139    type Float = Float;
140
141    fn config(&self) -> &Self::Config {
142        &()
143    }
144}
145
146impl Fit<ArrayView2<'_, Float>, ()> for DiffusionMaps<Untrained> {
147    type Fitted = DiffusionMaps<DiffusionMapsTrained>;
148
149    fn fit(self, x: &ArrayView2<'_, Float>, _y: &()) -> SklResult<Self::Fitted> {
150        let x = x.mapv(|x| x);
151        let (n_samples, _) = x.dim();
152
153        if n_samples <= self.n_components {
154            return Err(SklearsError::InvalidInput(
155                "Number of samples must be greater than n_components".to_string(),
156            ));
157        }
158
159        // Step 1: Compute affinity matrix
160        let epsilon = self.epsilon.unwrap_or_else(|| self.estimate_epsilon(&x));
161        let affinity = self.compute_affinity_matrix(&x, epsilon)?;
162
163        // Step 2: Normalize the affinity matrix
164        let transition_matrix = self.normalize_affinity_matrix(&affinity)?;
165
166        // Step 3: Eigendecomposition
167        let (eigenvalues, eigenvectors) = self.compute_eigenvectors(&transition_matrix)?;
168
169        // Step 4: Create embedding
170        let embedding = self.create_embedding(&eigenvalues, &eigenvectors)?;
171
172        Ok(DiffusionMaps {
173            state: DiffusionMapsTrained {
174                embedding,
175                eigenvalues,
176                eigenvectors,
177                affinity_matrix: affinity,
178                epsilon,
179            },
180            n_components: self.n_components,
181            diffusion_time: self.diffusion_time,
182            epsilon: Some(epsilon),
183            alpha: self.alpha,
184            eigen_solver: self.eigen_solver,
185            random_state: self.random_state,
186            n_jobs: self.n_jobs,
187        })
188    }
189}
190
191impl DiffusionMaps<Untrained> {
192    fn estimate_epsilon(&self, x: &Array2<f64>) -> f64 {
193        let n_samples = x.nrows();
194        let mut distances = Vec::new();
195
196        // Compute pairwise distances
197        for i in 0..n_samples {
198            for j in i + 1..n_samples {
199                let diff = &x.row(i) - &x.row(j);
200                let dist = diff.mapv(|x| x * x).sum().sqrt();
201                distances.push(dist);
202            }
203        }
204
205        // Use median distance as epsilon estimate
206        distances.sort_by(|a, b| a.partial_cmp(b).unwrap());
207        distances[distances.len() / 2]
208    }
209
210    fn compute_affinity_matrix(&self, x: &Array2<f64>, epsilon: f64) -> SklResult<Array2<f64>> {
211        let n_samples = x.nrows();
212        let mut affinity = Array2::zeros((n_samples, n_samples));
213
214        // Gaussian kernel: K(x,y) = exp(-||x-y||²/(2ε²))
215        for i in 0..n_samples {
216            for j in 0..n_samples {
217                if i != j {
218                    let diff = &x.row(i) - &x.row(j);
219                    let dist_sq = diff.mapv(|x| x * x).sum();
220                    let weight = (-dist_sq / (2.0 * epsilon * epsilon)).exp();
221                    affinity[[i, j]] = weight;
222                } else {
223                    affinity[[i, j]] = 1.0;
224                }
225            }
226        }
227
228        Ok(affinity)
229    }
230
231    fn normalize_affinity_matrix(&self, affinity: &Array2<f64>) -> SklResult<Array2<f64>> {
232        let n_samples = affinity.nrows();
233        let mut normalized = affinity.clone();
234
235        // Compute degree matrix
236        let mut degrees = Array2::zeros((n_samples, n_samples));
237        for i in 0..n_samples {
238            let degree_sum: f64 = affinity.row(i).sum();
239            degrees[[i, i]] = degree_sum;
240        }
241
242        // Normalize based on alpha parameter
243        // P = D^(-α) * K * D^(-α)
244        for i in 0..n_samples {
245            for j in 0..n_samples {
246                if degrees[[i, i]] > 1e-10 && degrees[[j, j]] > 1e-10 {
247                    let degree_factor_i = degrees[[i, i]].powf(-self.alpha);
248                    let degree_factor_j = degrees[[j, j]].powf(-self.alpha);
249                    normalized[[i, j]] = degree_factor_i * affinity[[i, j]] * degree_factor_j;
250                }
251            }
252        }
253
254        // Row normalization for transition matrix
255        for i in 0..n_samples {
256            let row_sum: f64 = normalized.row(i).sum();
257            if row_sum > 1e-10 {
258                for j in 0..n_samples {
259                    normalized[[i, j]] /= row_sum;
260                }
261            }
262        }
263
264        Ok(normalized)
265    }
266
267    fn compute_eigenvectors(
268        &self,
269        transition_matrix: &Array2<f64>,
270    ) -> SklResult<(Array2<f64>, Array2<f64>)> {
271        // Symmetrize the matrix to ensure numerical stability for eigendecomposition
272        // Matrix should be symmetric but may have small numerical errors
273        let symmetric_matrix = (transition_matrix + &transition_matrix.t()) / 2.0;
274
275        // Compute eigendecomposition
276        let (eigenvals, eigenvecs) = symmetric_matrix
277            .eigh(UPLO::Lower)
278            .map_err(|e| SklearsError::InvalidInput(format!("Eigendecomposition failed: {e}")))?;
279
280        // Sort eigenvalues and eigenvectors in descending order
281        let mut eigen_pairs: Vec<(f64, usize)> = eigenvals
282            .iter()
283            .enumerate()
284            .map(|(i, &val)| (val, i))
285            .collect();
286        eigen_pairs.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap());
287
288        // Create sorted eigenvalue and eigenvector matrices
289        let n = eigenvals.len();
290        let mut sorted_eigenvals = Array2::zeros((n, 1));
291        let mut sorted_eigenvecs = Array2::zeros((n, n));
292
293        for (new_idx, &(eigenval, old_idx)) in eigen_pairs.iter().enumerate() {
294            sorted_eigenvals[[new_idx, 0]] = eigenval;
295            for i in 0..n {
296                sorted_eigenvecs[[i, new_idx]] = eigenvecs[[i, old_idx]];
297            }
298        }
299
300        Ok((sorted_eigenvals, sorted_eigenvecs))
301    }
302
303    fn create_embedding(
304        &self,
305        eigenvalues: &Array2<f64>,
306        eigenvectors: &Array2<f64>,
307    ) -> SklResult<Array2<f64>> {
308        let n_samples = eigenvectors.nrows();
309        let mut embedding = Array2::zeros((n_samples, self.n_components));
310
311        // Skip the first eigenvector (constant vector) and take the next n_components
312        for comp_idx in 0..self.n_components {
313            let eigen_idx = comp_idx + 1; // Skip first eigenvector
314            if eigen_idx < eigenvalues.nrows() {
315                let eigenval = eigenvalues[[eigen_idx, 0]];
316
317                // Apply diffusion time: λ^t
318                let diffusion_weight = eigenval.powf(self.diffusion_time as f64);
319
320                for i in 0..n_samples {
321                    embedding[[i, comp_idx]] = eigenvectors[[i, eigen_idx]] * diffusion_weight;
322                }
323            }
324        }
325
326        Ok(embedding)
327    }
328}
329
330impl Transform<ArrayView2<'_, Float>, Array2<f64>> for DiffusionMaps<DiffusionMapsTrained> {
331    fn transform(&self, _x: &ArrayView2<'_, Float>) -> SklResult<Array2<f64>> {
332        // Diffusion Maps doesn't support transforming new data in this implementation
333        Ok(self.state.embedding.clone())
334    }
335}
336
337impl DiffusionMaps<DiffusionMapsTrained> {
338    /// Get the embedding
339    pub fn embedding(&self) -> &Array2<f64> {
340        &self.state.embedding
341    }
342
343    /// Get the eigenvalues
344    pub fn eigenvalues(&self) -> &Array2<f64> {
345        &self.state.eigenvalues
346    }
347
348    /// Get the eigenvectors
349    pub fn eigenvectors(&self) -> &Array2<f64> {
350        &self.state.eigenvectors
351    }
352
353    /// Get the affinity matrix
354    pub fn affinity_matrix(&self) -> &Array2<f64> {
355        &self.state.affinity_matrix
356    }
357
358    /// Get the epsilon parameter used
359    pub fn epsilon(&self) -> f64 {
360        self.state.epsilon
361    }
362}