// scirs2_transform/reduction/spectral_embedding.rs
1//! Spectral Embedding for non-linear dimensionality reduction
2//!
3//! Spectral embedding is a non-linear dimensionality reduction technique that uses
4//! the eigenvectors of a graph Laplacian to embed data points in a lower-dimensional space.
5//! It's particularly effective for data that lies on a non-linear manifold.
6
7use scirs2_core::ndarray::{Array1, Array2, ArrayBase, Data, Ix2};
8use scirs2_core::numeric::{Float, NumCast};
9use scirs2_core::validation::{check_positive, checkshape};
10use scirs2_linalg::eigh;
11
12use crate::error::{Result, TransformError};
13
/// Affinity matrix construction methods
///
/// Determines how pairwise sample similarities are turned into the graph
/// whose Laplacian is eigendecomposed.
// Fieldless enum: Copy/Eq/Hash are free and backward-compatible; deriving
// PartialEq without Eq trips clippy::derive_partial_eq_without_eq.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum AffinityMethod {
    /// Gaussian (RBF) kernel with automatic bandwidth selection
    Gaussian,
    /// K-nearest neighbors graph
    KNN,
    /// Epsilon-ball graph
    Epsilon,
}
24
/// Spectral Embedding dimensionality reduction
///
/// Spectral embedding uses the eigenvectors of the graph Laplacian matrix
/// to find a low-dimensional representation that preserves local neighborhood structure.
///
/// All fitted state (`embedding`, `training_data`, `affinity_matrix`,
/// `eigenvectors`, `eigenvalues`) is `None` until `fit` succeeds.
#[derive(Debug, Clone)]
pub struct SpectralEmbedding {
    /// Number of components (dimensions) in the embedding
    n_components: usize,
    /// Method for constructing affinity matrix
    affinity_method: AffinityMethod,
    /// Number of neighbors for KNN graph (only used for `AffinityMethod::KNN`; default 10)
    n_neighbors: usize,
    /// Bandwidth parameter for Gaussian kernel; `None` means the median
    /// heuristic is applied at fit time (see `construct_affinity_matrix`)
    gamma: Option<f64>,
    /// Epsilon parameter for epsilon-ball graph (only used for `AffinityMethod::Epsilon`)
    epsilon: f64,
    /// Whether to use normalized Laplacian (I - D^(-1/2) A D^(-1/2)) rather than D - A
    normalized: bool,
    /// Random seed for reproducibility
    /// NOTE(review): stored but never read by any method in this file — confirm intent
    random_state: Option<u64>,
    /// The embedding vectors, shape (n_samples, n_components)
    embedding: Option<Array2<f64>>,
    /// Training data (converted to f64) kept for out-of-sample extension
    training_data: Option<Array2<f64>>,
    /// Affinity matrix computed during fitting
    affinity_matrix: Option<Array2<f64>>,
    /// Eigenvectors of the Laplacian, in the eigensolver's native order
    eigenvectors: Option<Array2<f64>>,
    /// Eigenvalues of the Laplacian, in the eigensolver's native order
    eigenvalues: Option<Array1<f64>>,
}
56
57impl SpectralEmbedding {
58    /// Creates a new SpectralEmbedding instance
59    ///
60    /// # Arguments
61    /// * `n_components` - Number of dimensions in the embedding space
62    /// * `affinity_method` - Method for constructing the affinity matrix
63    pub fn new(n_components: usize, affinitymethod: AffinityMethod) -> Self {
64        SpectralEmbedding {
65            n_components,
66            affinity_method: affinitymethod,
67            n_neighbors: 10,
68            gamma: None,
69            epsilon: 1.0,
70            normalized: true,
71            random_state: None,
72            embedding: None,
73            training_data: None,
74            affinity_matrix: None,
75            eigenvectors: None,
76            eigenvalues: None,
77        }
78    }
79
80    /// Set the number of neighbors for KNN graph construction
81    pub fn with_n_neighbors(mut self, nneighbors: usize) -> Self {
82        self.n_neighbors = nneighbors;
83        self
84    }
85
86    /// Set the gamma parameter for Gaussian kernel
87    pub fn with_gamma(mut self, gamma: f64) -> Self {
88        self.gamma = Some(gamma);
89        self
90    }
91
92    /// Set the epsilon parameter for epsilon-ball graph
93    pub fn with_epsilon(mut self, epsilon: f64) -> Self {
94        self.epsilon = epsilon;
95        self
96    }
97
98    /// Set whether to use normalized Laplacian
99    pub fn with_normalized(mut self, normalized: bool) -> Self {
100        self.normalized = normalized;
101        self
102    }
103
104    /// Set random seed for reproducibility
105    pub fn with_random_state(mut self, seed: u64) -> Self {
106        self.random_state = Some(seed);
107        self
108    }
109
110    /// Compute pairwise distances between data points
111    fn compute_distances<S>(&self, x: &ArrayBase<S, Ix2>) -> Array2<f64>
112    where
113        S: Data,
114        S::Elem: Float + NumCast,
115    {
116        let n_samples = x.nrows();
117        let mut distances = Array2::zeros((n_samples, n_samples));
118
119        for i in 0..n_samples {
120            for j in i + 1..n_samples {
121                let mut dist_sq = 0.0;
122                for k in 0..x.ncols() {
123                    let diff = NumCast::from(x[[i, k]]).unwrap_or(0.0)
124                        - NumCast::from(x[[j, k]]).unwrap_or(0.0);
125                    dist_sq += diff * diff;
126                }
127                let dist = dist_sq.sqrt();
128                distances[[i, j]] = dist;
129                distances[[j, i]] = dist;
130            }
131        }
132
133        distances
134    }
135
136    /// Construct affinity matrix based on the specified method
137    fn construct_affinity_matrix(&self, distances: &Array2<f64>) -> Result<Array2<f64>> {
138        let n_samples = distances.nrows();
139        let mut affinity = Array2::zeros((n_samples, n_samples));
140
141        match &self.affinity_method {
142            AffinityMethod::Gaussian => {
143                // Automatic bandwidth selection if not specified
144                let gamma = if let Some(g) = self.gamma {
145                    g
146                } else {
147                    // Use median heuristic: gamma = 1 / (2 * median_distance^2)
148                    let mut all_distances: Vec<f64> = Vec::new();
149                    for i in 0..n_samples {
150                        for j in i + 1..n_samples {
151                            all_distances.push(distances[[i, j]]);
152                        }
153                    }
154                    all_distances.sort_by(|a, b| a.partial_cmp(b).expect("Operation failed"));
155                    let median_dist = all_distances[all_distances.len() / 2];
156                    1.0 / (2.0 * median_dist * median_dist)
157                };
158
159                // Gaussian kernel: exp(-gamma * ||x_i - x_j||^2)
160                for i in 0..n_samples {
161                    for j in 0..n_samples {
162                        if i != j {
163                            let dist_sq = distances[[i, j]] * distances[[i, j]];
164                            affinity[[i, j]] = (-gamma * dist_sq).exp();
165                        }
166                    }
167                }
168            }
169            AffinityMethod::KNN => {
170                // K-nearest neighbors graph
171                for i in 0..n_samples {
172                    // Find k nearest neighbors
173                    let mut neighbors_with_dist: Vec<(f64, usize)> = Vec::new();
174                    for j in 0..n_samples {
175                        if i != j {
176                            neighbors_with_dist.push((distances[[i, j]], j));
177                        }
178                    }
179                    neighbors_with_dist
180                        .sort_by(|a, b| a.0.partial_cmp(&b.0).expect("Operation failed"));
181
182                    // Connect to k nearest neighbors
183                    #[allow(clippy::needless_range_loop)]
184                    for k in 0..self.n_neighbors.min(neighbors_with_dist.len()) {
185                        let neighbor = neighbors_with_dist[k].1;
186                        let weight = 1.0; // Binary weights for KNN
187                        affinity[[i, neighbor]] = weight;
188                        affinity[[neighbor, i]] = weight; // Make symmetric
189                    }
190                }
191            }
192            AffinityMethod::Epsilon => {
193                // Epsilon-ball graph
194                for i in 0..n_samples {
195                    for j in i + 1..n_samples {
196                        if distances[[i, j]] <= self.epsilon {
197                            let weight = 1.0; // Binary weights for epsilon-ball
198                            affinity[[i, j]] = weight;
199                            affinity[[j, i]] = weight;
200                        }
201                    }
202                }
203            }
204        }
205
206        Ok(affinity)
207    }
208
209    /// Compute the graph Laplacian matrix
210    fn compute_laplacian(&self, affinity: &Array2<f64>) -> Result<Array2<f64>> {
211        let n_samples = affinity.nrows();
212
213        // Compute degree matrix
214        let mut degree = Array1::zeros(n_samples);
215        for i in 0..n_samples {
216            degree[i] = affinity.row(i).sum();
217        }
218
219        // Check for isolated nodes
220        for i in 0..n_samples {
221            if degree[i] < 1e-10 {
222                return Err(TransformError::ComputationError(
223                    "Graph has isolated nodes. Try increasing epsilon or n_neighbors.".to_string(),
224                ));
225            }
226        }
227
228        let mut laplacian = Array2::zeros((n_samples, n_samples));
229
230        if self.normalized {
231            // Normalized Laplacian: L = I - D^(-1/2) A D^(-1/2)
232            for i in 0..n_samples {
233                for j in 0..n_samples {
234                    if i == j {
235                        laplacian[[i, j]] = 1.0;
236                    } else {
237                        let normalized_affinity = affinity[[i, j]] / (degree[i] * degree[j]).sqrt();
238                        laplacian[[i, j]] = -normalized_affinity;
239                    }
240                }
241            }
242        } else {
243            // Unnormalized Laplacian: L = D - A
244            for i in 0..n_samples {
245                for j in 0..n_samples {
246                    if i == j {
247                        laplacian[[i, j]] = degree[i];
248                    } else {
249                        laplacian[[i, j]] = -affinity[[i, j]];
250                    }
251                }
252            }
253        }
254
255        Ok(laplacian)
256    }
257
258    /// Fits the SpectralEmbedding model to the input data
259    ///
260    /// # Arguments
261    /// * `x` - The input data, shape (n_samples, n_features)
262    ///
263    /// # Returns
264    /// * `Result<()>` - Ok if successful, Err otherwise
265    pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
266    where
267        S: Data,
268        S::Elem: Float + NumCast + Send + Sync,
269    {
270        let (n_samples, n_features) = x.dim();
271
272        // Validate inputs
273        check_positive(self.n_components, "n_components")?;
274        checkshape(x, &[n_samples, n_features], "x")?;
275
276        if self.n_components >= n_samples {
277            return Err(TransformError::InvalidInput(format!(
278                "n_components={} must be < n_samples={}",
279                self.n_components, n_samples
280            )));
281        }
282
283        if matches!(self.affinity_method, AffinityMethod::KNN) && n_samples <= self.n_neighbors {
284            return Err(TransformError::InvalidInput(format!(
285                "n_neighbors={} must be < n_samples={}",
286                self.n_neighbors, n_samples
287            )));
288        }
289
290        // Convert input to f64
291        let x_f64 = x.mapv(|v| NumCast::from(v).unwrap_or(0.0));
292
293        // Step 1: Compute pairwise distances
294        let distances = self.compute_distances(&x_f64.view());
295
296        // Step 2: Construct affinity matrix
297        let affinity = self.construct_affinity_matrix(&distances)?;
298
299        // Step 3: Compute Laplacian matrix
300        let laplacian = self.compute_laplacian(&affinity)?;
301
302        // Step 4: Eigendecomposition of Laplacian
303        let (eigenvalues, eigenvectors) = match eigh(&laplacian.view(), None) {
304            Ok(result) => result,
305            Err(e) => return Err(TransformError::LinalgError(e)),
306        };
307
308        // Step 5: Sort eigenvalues and eigenvectors in ascending order
309        let mut indices: Vec<usize> = (0..n_samples).collect();
310        indices.sort_by(|&i, &j| {
311            eigenvalues[i]
312                .partial_cmp(&eigenvalues[j])
313                .expect("Operation failed")
314        });
315
316        // Step 6: Select eigenvectors for embedding (skip the first if normalized)
317        let start_idx = if self.normalized { 1 } else { 0 }; // Skip constant eigenvector for normalized Laplacian
318        let mut embedding = Array2::zeros((n_samples, self.n_components));
319
320        for j in 0..self.n_components {
321            let idx = indices[start_idx + j];
322            for i in 0..n_samples {
323                embedding[[i, j]] = eigenvectors[[i, idx]];
324            }
325        }
326
327        // Store results
328        self.embedding = Some(embedding);
329        self.training_data = Some(x_f64);
330        self.affinity_matrix = Some(affinity);
331        self.eigenvectors = Some(eigenvectors);
332        self.eigenvalues = Some(eigenvalues);
333
334        Ok(())
335    }
336
337    /// Transforms the input data using the fitted SpectralEmbedding model
338    ///
339    /// # Arguments
340    /// * `x` - The input data, shape (n_samples, n_features)
341    ///
342    /// # Returns
343    /// * `Result<Array2<f64>>` - The transformed data, shape (n_samples, n_components)
344    pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
345    where
346        S: Data,
347        S::Elem: Float + NumCast,
348    {
349        if self.embedding.is_none() {
350            return Err(TransformError::NotFitted(
351                "SpectralEmbedding model has not been fitted".to_string(),
352            ));
353        }
354
355        let training_data = self
356            .training_data
357            .as_ref()
358            .ok_or_else(|| TransformError::NotFitted("Training data not available".to_string()))?;
359
360        let x_f64 = x.mapv(|v| NumCast::from(v).unwrap_or(0.0));
361
362        // Check if this is the training data
363        if self.is_same_data(&x_f64, training_data) {
364            return Ok(self.embedding.as_ref().expect("Operation failed").clone());
365        }
366
367        // For new data, use Nyström method for out-of-sample extension
368        self.nystrom_extension(&x_f64)
369    }
370
371    /// Fits the model and transforms the data in one step
372    ///
373    /// # Arguments
374    /// * `x` - The input data, shape (n_samples, n_features)
375    ///
376    /// # Returns
377    /// * `Result<Array2<f64>>` - The transformed data, shape (n_samples, n_components)
378    pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
379    where
380        S: Data,
381        S::Elem: Float + NumCast + Send + Sync,
382    {
383        self.fit(x)?;
384        self.transform(x)
385    }
386
    /// Returns the fitted embedding, shape (n_samples, n_components), or
    /// `None` if `fit` has not been called successfully.
    pub fn embedding(&self) -> Option<&Array2<f64>> {
        self.embedding.as_ref()
    }
391
    /// Returns the affinity matrix computed during fitting, or `None` if
    /// `fit` has not been called successfully.
    pub fn affinity_matrix(&self) -> Option<&Array2<f64>> {
        self.affinity_matrix.as_ref()
    }
396
    /// Returns the eigenvalues of the Laplacian (in the eigensolver's native
    /// order), or `None` if `fit` has not been called successfully.
    pub fn eigenvalues(&self) -> Option<&Array1<f64>> {
        self.eigenvalues.as_ref()
    }
401
402    /// Check if the input data is the same as training data
403    fn is_same_data(&self, x: &Array2<f64>, trainingdata: &Array2<f64>) -> bool {
404        if x.dim() != trainingdata.dim() {
405            return false;
406        }
407
408        let (n_samples, n_features) = x.dim();
409        for i in 0..n_samples {
410            for j in 0..n_features {
411                if (x[[i, j]] - trainingdata[[i, j]]).abs() > 1e-10 {
412                    return false;
413                }
414            }
415        }
416        true
417    }
418
    /// Nyström method for out-of-sample extension.
    ///
    /// Approximates embedding coordinates for unseen points from their
    /// affinities to the training points: each coordinate is
    /// `(K_new,train · v_j) / sqrt(λ_j)` (i.e. Λ^(-1/2) scaling, per the
    /// code below). Columns whose eigenvalue is near zero are left at 0.0.
    ///
    /// NOTE(review): for the Gaussian affinity this uses `self.gamma` or a
    /// default of 1.0 — NOT the median-heuristic gamma computed inside
    /// `construct_affinity_matrix` when `gamma` was `None` — so out-of-sample
    /// affinities may be on a different scale than training ones. Verify.
    ///
    /// NOTE(review): `eigenvalues[start_idx + j]` assumes the stored
    /// eigenvalues are in ascending order, but `fit` stores them in the
    /// eigensolver's native order and only sorts via an index vector.
    /// Confirm that `eigh` returns ascending eigenvalues.
    fn nystrom_extension(&self, xnew: &Array2<f64>) -> Result<Array2<f64>> {
        // Fitted state must exist: `transform` checks before calling here.
        let training_data = self.training_data.as_ref().expect("Operation failed");
        let training_embedding = self.embedding.as_ref().expect("Operation failed");
        let eigenvalues = self.eigenvalues.as_ref().expect("Operation failed");

        let (n_new, n_features) = xnew.dim();
        let (n_training_, _) = training_data.dim();

        if n_features != training_data.ncols() {
            return Err(TransformError::InvalidInput(format!(
                "Input features {} must match training features {}",
                n_features,
                training_data.ncols()
            )));
        }

        // Compute affinity between new points and training points
        let mut new_to_training_affinity = Array2::zeros((n_new, n_training_));

        for i in 0..n_new {
            for j in 0..n_training_ {
                // Squared Euclidean distance between new point i and training point j
                let mut dist_sq = 0.0;
                for k in 0..n_features {
                    let diff = xnew[[i, k]] - training_data[[j, k]];
                    dist_sq += diff * diff;
                }

                let affinity_value = match &self.affinity_method {
                    AffinityMethod::Gaussian => {
                        let gamma = self.gamma.unwrap_or(1.0); // Use stored gamma or default (see NOTE above)
                        (-gamma * dist_sq).exp()
                    }
                    AffinityMethod::KNN | AffinityMethod::Epsilon => {
                        // For discrete methods, use a smooth distance-based
                        // weighting instead of binary weights
                        let dist = dist_sq.sqrt();
                        if dist > 0.0 {
                            1.0 / (1.0 + dist)
                        } else {
                            1.0
                        }
                    }
                };

                new_to_training_affinity[[i, j]] = affinity_value;
            }
        }

        // Nyström approximation: Y_new = K_new_train * V * Λ^(-1/2)
        // (the code divides by sqrt(eigenvalue) below)
        let mut new_embedding = Array2::zeros((n_new, self.n_components));

        // Same skip-first-eigenvector convention as `fit`
        let start_idx = if self.normalized { 1 } else { 0 };

        for i in 0..n_new {
            for j in 0..self.n_components {
                let eigenvalue = eigenvalues[start_idx + j];
                if eigenvalue.abs() > 1e-10 {
                    let mut coord = 0.0;
                    for k in 0..n_training_ {
                        coord += new_to_training_affinity[[i, k]] * training_embedding[[k, j]];
                    }
                    new_embedding[[i, j]] = coord / eigenvalue.sqrt();
                }
            }
        }

        Ok(new_embedding)
    }
488}
489
#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array;

    /// Gaussian affinity on two well-separated 2-D clusters: checks the
    /// output shape and that every coordinate is finite.
    #[test]
    fn test_spectral_embedding_gaussian() {
        // Create a simple 2D dataset with two clusters
        let data = vec![1.0, 1.0, 1.1, 1.1, 1.2, 0.9, 5.0, 5.0, 5.1, 5.1, 4.9, 5.2];
        let x = Array::from_shape_vec((6, 2), data).expect("Operation failed");

        let mut spectral = SpectralEmbedding::new(2, AffinityMethod::Gaussian);
        let embedding = spectral.fit_transform(&x).expect("Operation failed");

        assert_eq!(embedding.shape(), &[6, 2]);

        // Check that values are finite
        for val in embedding.iter() {
            assert!(val.is_finite());
        }
    }

    /// KNN affinity on an identity matrix (equidistant points): checks
    /// shape and finiteness of the embedding.
    #[test]
    fn test_spectral_embedding_knn() {
        let x: Array2<f64> = Array::eye(8);

        let mut spectral = SpectralEmbedding::new(3, AffinityMethod::KNN).with_n_neighbors(3);
        let embedding = spectral.fit_transform(&x).expect("Operation failed");

        assert_eq!(embedding.shape(), &[8, 3]);

        // Check that values are finite
        for val in embedding.iter() {
            assert!(val.is_finite());
        }
    }

    /// Out-of-sample path: fit on one dataset, transform different points,
    /// which exercises the Nyström extension rather than the stored embedding.
    #[test]
    fn test_spectral_embedding_out_of_sample() {
        let x_train: Array2<f64> = Array::eye(5);
        let x_test = Array::from_shape_vec(
            (2, 5),
            vec![0.1, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.1, 0.0],
        )
        .expect("Operation failed");

        let mut spectral = SpectralEmbedding::new(2, AffinityMethod::Gaussian);
        spectral.fit(&x_train).expect("Operation failed");
        let test_embedding = spectral.transform(&x_test).expect("Operation failed");

        assert_eq!(test_embedding.shape(), &[2, 2]);

        // Check that values are finite
        for val in test_embedding.iter() {
            assert!(val.is_finite());
        }
    }
}