scirs2_transform/reduction/spectral_embedding.rs

//! Spectral Embedding for non-linear dimensionality reduction
//!
//! Spectral embedding is a non-linear dimensionality reduction technique that uses
//! the eigenvectors of a graph Laplacian to embed data points in a lower-dimensional space.
//! It's particularly effective for data that lies on a non-linear manifold.
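//!
//! The embedding is computed in four steps: pairwise Euclidean distances are
//! computed, an affinity graph is built from them, the (optionally normalized)
//! graph Laplacian is formed, and each sample is mapped to the eigenvectors
//! associated with the smallest non-trivial eigenvalues of that Laplacian.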

use scirs2_core::ndarray::{Array1, Array2, ArrayBase, Data, Ix2};
use scirs2_core::numeric::{Float, NumCast};
use scirs2_core::validation::{check_positive, checkshape};
use scirs2_linalg::eigh;

use crate::error::{Result, TransformError};

/// Affinity matrix construction methods
#[derive(Debug, Clone, PartialEq)]
pub enum AffinityMethod {
    /// Gaussian (RBF) kernel with automatic bandwidth selection
    Gaussian,
    /// K-nearest neighbors graph
    KNN,
    /// Epsilon-ball graph
    Epsilon,
}

/// Spectral Embedding dimensionality reduction
///
/// Spectral embedding uses the eigenvectors of the graph Laplacian matrix
/// to find a low-dimensional representation that preserves local neighborhood structure.
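///
/// # Examples
///
/// A minimal usage sketch; imports for `SpectralEmbedding` and `AffinityMethod`
/// are omitted because the public re-export path is not shown in this file.
///
/// ```ignore
/// use scirs2_core::ndarray::Array2;
///
/// // Two well-separated clusters in 2-D.
/// let x = Array2::from_shape_vec(
///     (6, 2),
///     vec![1.0, 1.0, 1.1, 1.1, 1.2, 0.9, 5.0, 5.0, 5.1, 5.1, 4.9, 5.2],
/// )
/// .unwrap();
///
/// let mut spectral = SpectralEmbedding::new(2, AffinityMethod::Gaussian);
/// let embedding = spectral.fit_transform(&x).unwrap();
/// assert_eq!(embedding.shape(), &[6, 2]);
/// ```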
#[derive(Debug, Clone)]
pub struct SpectralEmbedding {
    /// Number of components (dimensions) in the embedding
    n_components: usize,
    /// Method for constructing affinity matrix
    affinity_method: AffinityMethod,
    /// Number of neighbors for KNN graph
    n_neighbors: usize,
    /// Bandwidth parameter for Gaussian kernel (auto if None)
    gamma: Option<f64>,
    /// Epsilon parameter for epsilon-ball graph
    epsilon: f64,
    /// Whether to use normalized Laplacian
    normalized: bool,
    /// Random seed for reproducibility
    random_state: Option<u64>,
    /// The embedding vectors
    embedding: Option<Array2<f64>>,
    /// Training data for out-of-sample extension
    training_data: Option<Array2<f64>>,
    /// Affinity matrix computed during fitting
    affinity_matrix: Option<Array2<f64>>,
    /// Eigenvectors of the Laplacian
    eigenvectors: Option<Array2<f64>>,
    /// Eigenvalues of the Laplacian
    eigenvalues: Option<Array1<f64>>,
}

impl SpectralEmbedding {
    /// Creates a new SpectralEmbedding instance
    ///
    /// # Arguments
    /// * `n_components` - Number of dimensions in the embedding space
    /// * `affinity_method` - Method for constructing the affinity matrix
    pub fn new(n_components: usize, affinity_method: AffinityMethod) -> Self {
        SpectralEmbedding {
            n_components,
            affinity_method,
            n_neighbors: 10,
            gamma: None,
            epsilon: 1.0,
            normalized: true,
            random_state: None,
            embedding: None,
            training_data: None,
            affinity_matrix: None,
            eigenvectors: None,
            eigenvalues: None,
        }
    }

    /// Set the number of neighbors for KNN graph construction
    pub fn with_n_neighbors(mut self, n_neighbors: usize) -> Self {
        self.n_neighbors = n_neighbors;
        self
    }

    /// Set the gamma parameter for Gaussian kernel
    pub fn with_gamma(mut self, gamma: f64) -> Self {
        self.gamma = Some(gamma);
        self
    }

    /// Set the epsilon parameter for epsilon-ball graph
    pub fn with_epsilon(mut self, epsilon: f64) -> Self {
        self.epsilon = epsilon;
        self
    }

    /// Set whether to use normalized Laplacian
    pub fn with_normalized(mut self, normalized: bool) -> Self {
        self.normalized = normalized;
        self
    }

    /// Set random seed for reproducibility
    pub fn with_random_state(mut self, seed: u64) -> Self {
        self.random_state = Some(seed);
        self
    }

    /// Compute pairwise distances between data points
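    ///
    /// Returns a symmetric `(n_samples, n_samples)` matrix of Euclidean distances.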
    fn compute_distances<S>(&self, x: &ArrayBase<S, Ix2>) -> Array2<f64>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        let n_samples = x.nrows();
        let mut distances = Array2::zeros((n_samples, n_samples));

        for i in 0..n_samples {
            for j in i + 1..n_samples {
                let mut dist_sq = 0.0;
                for k in 0..x.ncols() {
                    let diff = NumCast::from(x[[i, k]]).unwrap_or(0.0)
                        - NumCast::from(x[[j, k]]).unwrap_or(0.0);
                    dist_sq += diff * diff;
                }
                let dist = dist_sq.sqrt();
                distances[[i, j]] = dist;
                distances[[j, i]] = dist;
            }
        }

        distances
    }

    /// Construct affinity matrix based on the specified method
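    ///
    /// For the Gaussian kernel, the bandwidth defaults to the median heuristic
    /// `gamma = 1 / (2 * median_distance^2)` when no explicit `gamma` is set;
    /// the KNN and epsilon-ball graphs use binary edge weights.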
    fn construct_affinity_matrix(&self, distances: &Array2<f64>) -> Result<Array2<f64>> {
        let n_samples = distances.nrows();
        let mut affinity = Array2::zeros((n_samples, n_samples));

        match &self.affinity_method {
            AffinityMethod::Gaussian => {
                // Automatic bandwidth selection if not specified
                let gamma = if let Some(g) = self.gamma {
                    g
                } else {
                    // Use median heuristic: gamma = 1 / (2 * median_distance^2)
                    let mut all_distances: Vec<f64> = Vec::new();
                    for i in 0..n_samples {
                        for j in i + 1..n_samples {
                            all_distances.push(distances[[i, j]]);
                        }
                    }
                    all_distances.sort_by(|a, b| a.partial_cmp(b).unwrap());
                    let median_dist = all_distances[all_distances.len() / 2];
                    1.0 / (2.0 * median_dist * median_dist)
                };

                // Gaussian kernel: exp(-gamma * ||x_i - x_j||^2)
                for i in 0..n_samples {
                    for j in 0..n_samples {
                        if i != j {
                            let dist_sq = distances[[i, j]] * distances[[i, j]];
                            affinity[[i, j]] = (-gamma * dist_sq).exp();
                        }
                    }
                }
            }
            AffinityMethod::KNN => {
                // K-nearest neighbors graph
                for i in 0..n_samples {
                    // Find k nearest neighbors
                    let mut neighbors_with_dist: Vec<(f64, usize)> = Vec::new();
                    for j in 0..n_samples {
                        if i != j {
                            neighbors_with_dist.push((distances[[i, j]], j));
                        }
                    }
                    neighbors_with_dist.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());

                    // Connect to k nearest neighbors
                    #[allow(clippy::needless_range_loop)]
                    for k in 0..self.n_neighbors.min(neighbors_with_dist.len()) {
                        let neighbor = neighbors_with_dist[k].1;
                        let weight = 1.0; // Binary weights for KNN
                        affinity[[i, neighbor]] = weight;
                        affinity[[neighbor, i]] = weight; // Make symmetric
                    }
                }
            }
            AffinityMethod::Epsilon => {
                // Epsilon-ball graph
                for i in 0..n_samples {
                    for j in i + 1..n_samples {
                        if distances[[i, j]] <= self.epsilon {
                            let weight = 1.0; // Binary weights for epsilon-ball
                            affinity[[i, j]] = weight;
                            affinity[[j, i]] = weight;
                        }
                    }
                }
            }
        }

        Ok(affinity)
    }

    /// Compute the graph Laplacian matrix
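    ///
    /// Builds the symmetric normalized Laplacian `L = I - D^(-1/2) A D^(-1/2)` when
    /// `normalized` is true, and the unnormalized Laplacian `L = D - A` otherwise,
    /// where `A` is the affinity matrix and `D` is its diagonal degree matrix.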
    fn compute_laplacian(&self, affinity: &Array2<f64>) -> Result<Array2<f64>> {
        let n_samples = affinity.nrows();

        // Compute degree matrix
        let mut degree = Array1::zeros(n_samples);
        for i in 0..n_samples {
            degree[i] = affinity.row(i).sum();
        }

        // Check for isolated nodes
        for i in 0..n_samples {
            if degree[i] < 1e-10 {
                return Err(TransformError::ComputationError(
                    "Graph has isolated nodes. Try increasing epsilon or n_neighbors.".to_string(),
                ));
            }
        }

        let mut laplacian = Array2::zeros((n_samples, n_samples));

        if self.normalized {
            // Normalized Laplacian: L = I - D^(-1/2) A D^(-1/2)
            for i in 0..n_samples {
                for j in 0..n_samples {
                    if i == j {
                        laplacian[[i, j]] = 1.0;
                    } else {
                        let normalized_affinity = affinity[[i, j]] / (degree[i] * degree[j]).sqrt();
                        laplacian[[i, j]] = -normalized_affinity;
                    }
                }
            }
        } else {
            // Unnormalized Laplacian: L = D - A
            for i in 0..n_samples {
                for j in 0..n_samples {
                    if i == j {
                        laplacian[[i, j]] = degree[i];
                    } else {
                        laplacian[[i, j]] = -affinity[[i, j]];
                    }
                }
            }
        }

        Ok(laplacian)
    }

    /// Fits the SpectralEmbedding model to the input data
    ///
    /// # Arguments
    /// * `x` - The input data, shape (n_samples, n_features)
    ///
    /// # Returns
    /// * `Result<()>` - Ok if successful, Err otherwise
    pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
    where
        S: Data,
        S::Elem: Float + NumCast + Send + Sync,
    {
        let (n_samples, n_features) = x.dim();

        // Validate inputs
        check_positive(self.n_components, "n_components")?;
        checkshape(x, &[n_samples, n_features], "x")?;

        if self.n_components >= n_samples {
            return Err(TransformError::InvalidInput(format!(
                "n_components={} must be < n_samples={}",
                self.n_components, n_samples
            )));
        }

        if matches!(self.affinity_method, AffinityMethod::KNN) && n_samples <= self.n_neighbors {
            return Err(TransformError::InvalidInput(format!(
                "n_neighbors={} must be < n_samples={}",
                self.n_neighbors, n_samples
            )));
        }

        // Convert input to f64
        let x_f64 = x.mapv(|v| NumCast::from(v).unwrap_or(0.0));

        // Step 1: Compute pairwise distances
        let distances = self.compute_distances(&x_f64.view());

        // Step 2: Construct affinity matrix
        let affinity = self.construct_affinity_matrix(&distances)?;

        // Step 3: Compute Laplacian matrix
        let laplacian = self.compute_laplacian(&affinity)?;

        // Step 4: Eigendecomposition of Laplacian
        let (eigenvalues, eigenvectors) = match eigh(&laplacian.view(), None) {
            Ok(result) => result,
            Err(e) => return Err(TransformError::LinalgError(e)),
        };

        // Step 5: Sort eigenvalues and eigenvectors in ascending order
        let mut indices: Vec<usize> = (0..n_samples).collect();
        indices.sort_by(|&i, &j| eigenvalues[i].partial_cmp(&eigenvalues[j]).unwrap());

        // Step 6: Select eigenvectors for embedding (skip the first if normalized)
        let start_idx = if self.normalized { 1 } else { 0 }; // Skip constant eigenvector for normalized Laplacian
        let mut embedding = Array2::zeros((n_samples, self.n_components));

        for j in 0..self.n_components {
            let idx = indices[start_idx + j];
            for i in 0..n_samples {
                embedding[[i, j]] = eigenvectors[[i, idx]];
            }
        }

        // Store results
        self.embedding = Some(embedding);
        self.training_data = Some(x_f64);
        self.affinity_matrix = Some(affinity);
        self.eigenvectors = Some(eigenvectors);
        self.eigenvalues = Some(eigenvalues);

        Ok(())
    }

    /// Transforms the input data using the fitted SpectralEmbedding model
    ///
    /// # Arguments
    /// * `x` - The input data, shape (n_samples, n_features)
    ///
    /// # Returns
    /// * `Result<Array2<f64>>` - The transformed data, shape (n_samples, n_components)
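    ///
    /// If `x` is identical to the training data, the stored embedding is returned
    /// directly; otherwise the coordinates are approximated with a Nyström-style
    /// out-of-sample extension.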
    pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
    where
        S: Data,
        S::Elem: Float + NumCast,
    {
        if self.embedding.is_none() {
            return Err(TransformError::NotFitted(
                "SpectralEmbedding model has not been fitted".to_string(),
            ));
        }

        let training_data = self
            .training_data
            .as_ref()
            .ok_or_else(|| TransformError::NotFitted("Training data not available".to_string()))?;

        let x_f64 = x.mapv(|v| NumCast::from(v).unwrap_or(0.0));

        // Check if this is the training data
        if self.is_same_data(&x_f64, training_data) {
            return Ok(self.embedding.as_ref().unwrap().clone());
        }

        // For new data, use the Nyström method for out-of-sample extension
        self.nystrom_extension(&x_f64)
    }

    /// Fits the model and transforms the data in one step
    ///
    /// # Arguments
    /// * `x` - The input data, shape (n_samples, n_features)
    ///
    /// # Returns
    /// * `Result<Array2<f64>>` - The transformed data, shape (n_samples, n_components)
    pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
    where
        S: Data,
        S::Elem: Float + NumCast + Send + Sync,
    {
        self.fit(x)?;
        self.transform(x)
    }

    /// Returns the embedding
    pub fn embedding(&self) -> Option<&Array2<f64>> {
        self.embedding.as_ref()
    }

    /// Returns the affinity matrix
    pub fn affinity_matrix(&self) -> Option<&Array2<f64>> {
        self.affinity_matrix.as_ref()
    }

    /// Returns the eigenvalues of the Laplacian
    pub fn eigenvalues(&self) -> Option<&Array1<f64>> {
        self.eigenvalues.as_ref()
    }

    /// Check if the input data is the same as the training data
    fn is_same_data(&self, x: &Array2<f64>, training_data: &Array2<f64>) -> bool {
        if x.dim() != training_data.dim() {
            return false;
        }

        let (n_samples, n_features) = x.dim();
        for i in 0..n_samples {
            for j in 0..n_features {
                if (x[[i, j]] - training_data[[i, j]]).abs() > 1e-10 {
                    return false;
                }
            }
        }
        true
    }

    /// Nyström method for out-of-sample extension
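    ///
    /// Approximates the embedding of unseen points from their affinities to the
    /// training points: each coordinate is the affinity-weighted sum over the
    /// corresponding training-embedding column, scaled by `1 / sqrt(eigenvalue)`.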
    fn nystrom_extension(&self, x_new: &Array2<f64>) -> Result<Array2<f64>> {
        let training_data = self.training_data.as_ref().unwrap();
        let training_embedding = self.embedding.as_ref().unwrap();
        let eigenvalues = self.eigenvalues.as_ref().unwrap();

        let (n_new, n_features) = x_new.dim();
        let (n_training, _) = training_data.dim();

        if n_features != training_data.ncols() {
            return Err(TransformError::InvalidInput(format!(
                "Input features {} must match training features {}",
                n_features,
                training_data.ncols()
            )));
        }

        // Compute affinity between new points and training points
        let mut new_to_training_affinity = Array2::zeros((n_new, n_training));

        for i in 0..n_new {
            for j in 0..n_training {
                let mut dist_sq = 0.0;
                for k in 0..n_features {
                    let diff = x_new[[i, k]] - training_data[[j, k]];
                    dist_sq += diff * diff;
                }

                let affinity_value = match &self.affinity_method {
                    AffinityMethod::Gaussian => {
                        // Fall back to 1.0 when gamma was auto-selected during fit,
                        // since the auto-selected value is not stored
                        let gamma = self.gamma.unwrap_or(1.0);
                        (-gamma * dist_sq).exp()
                    }
                    AffinityMethod::KNN | AffinityMethod::Epsilon => {
                        // For discrete methods, use distance-based weighting
                        let dist = dist_sq.sqrt();
                        if dist > 0.0 {
                            1.0 / (1.0 + dist)
                        } else {
                            1.0
                        }
                    }
                };

                new_to_training_affinity[[i, j]] = affinity_value;
            }
        }

        // Nyström approximation: Y_new = K_new_train * V * Λ^(-1/2),
        // where V are the training eigenvectors and Λ are the eigenvalues
        let mut new_embedding = Array2::zeros((n_new, self.n_components));

        let start_idx = if self.normalized { 1 } else { 0 };

        for i in 0..n_new {
            for j in 0..self.n_components {
                let eigenvalue = eigenvalues[start_idx + j];
                if eigenvalue.abs() > 1e-10 {
                    let mut coord = 0.0;
                    for k in 0..n_training {
                        coord += new_to_training_affinity[[i, k]] * training_embedding[[k, j]];
                    }
                    new_embedding[[i, j]] = coord / eigenvalue.sqrt();
                }
            }
        }

        Ok(new_embedding)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use scirs2_core::ndarray::Array;

    #[test]
    fn test_spectral_embedding_gaussian() {
        // Create a simple 2D dataset with two clusters
        let data = vec![1.0, 1.0, 1.1, 1.1, 1.2, 0.9, 5.0, 5.0, 5.1, 5.1, 4.9, 5.2];
        let x = Array::from_shape_vec((6, 2), data).unwrap();

        let mut spectral = SpectralEmbedding::new(2, AffinityMethod::Gaussian);
        let embedding = spectral.fit_transform(&x).unwrap();

        assert_eq!(embedding.shape(), &[6, 2]);

        // Check that values are finite
        for val in embedding.iter() {
            assert!(val.is_finite());
        }
    }

    #[test]
    fn test_spectral_embedding_knn() {
        let x: Array2<f64> = Array::eye(8);

        let mut spectral = SpectralEmbedding::new(3, AffinityMethod::KNN).with_n_neighbors(3);
        let embedding = spectral.fit_transform(&x).unwrap();

        assert_eq!(embedding.shape(), &[8, 3]);

        // Check that values are finite
        for val in embedding.iter() {
            assert!(val.is_finite());
        }
    }

    #[test]
    fn test_spectral_embedding_out_of_sample() {
        let x_train: Array2<f64> = Array::eye(5);
        let x_test = Array::from_shape_vec(
            (2, 5),
            vec![0.1, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.1, 0.0],
        )
        .unwrap();

        let mut spectral = SpectralEmbedding::new(2, AffinityMethod::Gaussian);
        spectral.fit(&x_train).unwrap();
        let test_embedding = spectral.transform(&x_test).unwrap();

        assert_eq!(test_embedding.shape(), &[2, 2]);

        // Check that values are finite
        for val in test_embedding.iter() {
            assert!(val.is_finite());
        }
    }
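
    // A sketch of an additional test for the epsilon-ball graph. It assumes that
    // epsilon = 1.5 connects every pair of rows of an identity matrix (all pairwise
    // distances are sqrt(2)), so the graph has no isolated nodes.
    #[test]
    fn test_spectral_embedding_epsilon() {
        let x: Array2<f64> = Array::eye(6);

        let mut spectral = SpectralEmbedding::new(2, AffinityMethod::Epsilon).with_epsilon(1.5);
        let embedding = spectral.fit_transform(&x).unwrap();

        assert_eq!(embedding.shape(), &[6, 2]);

        // Check that values are finite
        for val in embedding.iter() {
            assert!(val.is_finite());
        }
    }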
}