linfa_reduction/diffusion_map/
algorithms.rs

1//! Diffusion Map
2//!
3//! The diffusion map computes an embedding of the data by applying PCA on the diffusion operator
4//! of the data. It transforms the data along the direction of the largest diffusion flow and is therefore
5//! a non-linear dimensionality reduction technique. A normalized kernel describes the high dimensional
6//! diffusion graph with the (i, j) entry the probability that a diffusion happens from point i to
7//! j.
8//!
#[cfg(not(feature = "blas"))]
use linfa_linalg::{
    eigh::*,
    lobpcg::{self, LobpcgResult, Order as TruncatedOrder},
};
use ndarray::{s, Array1, Array2};
#[cfg(feature = "blas")]
use ndarray_linalg::{eigh::EighInto, lobpcg, lobpcg::LobpcgResult, Scalar, TruncatedOrder, UPLO};
use ndarray_rand::{rand_distr::Uniform, RandomExt};

use linfa::dataset::{WithLapack, WithoutLapack};
use linfa::{traits::Transformer, Float};
use linfa_kernel::Kernel;
use rand::{prelude::SmallRng, SeedableRng};

use super::hyperparams::DiffusionMapValidParams;
26/// Embedding of diffusion map technique
27///
28/// After transforming the dataset with diffusion map this structure store the embedding for
29/// further use. No straightforward prediction can be made from the embedding and the algorithm
30/// falls therefore in the class of transformers.
31///
32/// The diffusion map computes an embedding of the data by applying PCA on the diffusion operator
33/// of the data. It transforms the data along the direction of the largest diffusion flow and is therefore
34/// a non-linear dimensionality reduction technique. A normalized kernel describes the high dimensional
35/// diffusion graph with the (i, j) entry the probability that a diffusion happens from point i to
36/// j.
37///
38/// # Example
39///
40/// ```
41/// use linfa::traits::Transformer;
42/// use linfa_kernel::{Kernel, KernelType, KernelMethod};
43/// use linfa_reduction::DiffusionMap;
44///
45/// let dataset = linfa_datasets::iris();
46///
47/// // generate sparse gaussian kernel with eps = 2 and 15 neighbors
48/// let kernel = Kernel::params()
49///     .kind(KernelType::Sparse(15))
50///     .method(KernelMethod::Gaussian(2.0))
51///     .transform(dataset.records());
52///
53/// // create embedding from kernel matrix using diffusion maps
54/// let mapped_kernel = DiffusionMap::<f64>::params(2)
55///     .steps(1)
56///     .transform(&kernel)
57///     .unwrap();
58///
59/// // get embedding from the transformed kernel matrix
60/// let embedding = mapped_kernel.embedding();
61/// ```
62///
#[derive(Debug, Clone, PartialEq)]
pub struct DiffusionMap<F> {
    /// Embedding matrix: one row per observation in the input kernel, one
    /// column per retained diffusion coordinate.
    embedding: Array2<F>,
    /// Eigenvalues of the diffusion operator, one per embedding column.
    eigvals: Array1<F>,
}
68
69impl<'a, F: Float> Transformer<&'a Kernel<F>, DiffusionMap<F>> for DiffusionMapValidParams {
70    /// Project a kernel matrix to its embedding
71    ///
72    /// # Parameter
73    ///
74    /// * `kernel`: Kernel matrix
75    ///
76    /// # Returns
77    ///
78    /// Embedding for each observation in the kernel matrix
79    fn transform(&self, kernel: &'a Kernel<F>) -> DiffusionMap<F> {
80        // compute spectral embedding with diffusion map
81        let (embedding, eigvals) =
82            compute_diffusion_map(kernel, self.steps(), 0.0, self.embedding_size(), None);
83
84        DiffusionMap { embedding, eigvals }
85    }
86}
87
88impl<F: Float> DiffusionMap<F> {
89    /// Estimate the number of clusters in this embedding (very crude for now)
90    pub fn estimate_clusters(&self) -> usize {
91        let mean = self.eigvals.sum() / F::cast(self.eigvals.len());
92        self.eigvals.iter().filter(|x| *x > &mean).count() + 1
93    }
94
95    /// Return the eigenvalue of the diffusion operator
96    pub fn eigvals(&self) -> &Array1<F> {
97        &self.eigvals
98    }
99
100    /// Return the embedding
101    pub fn embedding(&self) -> &Array2<F> {
102        &self.embedding
103    }
104}
105
#[allow(unused)]
fn compute_diffusion_map<F: Float>(
    kernel: &Kernel<F>,
    steps: usize,
    alpha: f32,
    embedding_size: usize,
    guess: Option<Array2<F>>,
) -> (Array2<F>, Array1<F>) {
    // Compute the diffusion-map embedding of `kernel`.
    //
    // * `steps` - number of diffusion steps; each eigenvector is scaled by its
    //   eigenvalue raised to this power.
    // * `alpha` - density re-weighting exponent; NOTE(review): only used in the
    //   truncated (LOBPCG) branch below, the dense branch ignores it.
    // * `embedding_size` - number of diffusion coordinates to keep; must be
    //   strictly smaller than the kernel size.
    // * `guess` - optional initial eigenvector guess for LOBPCG; a seeded
    //   random matrix is used when absent.
    //
    // Returns `(embedding, eigvals)` where `embedding` has one row per
    // observation and `embedding_size` columns.
    assert!(embedding_size < kernel.size());

    // Reciprocal row sums of the kernel (inverse degree vector), used to
    // normalize the kernel into a diffusion operator.
    let d = kernel.sum().mapv(|x| x.recip());

    let (vals, vecs) = if kernel.size() < 5 * embedding_size + 1 {
        // use full eigenvalue decomposition for small problem sizes
        //
        // Build the normalized operator: right-multiply by diag(d), then
        // scale each column by d again (in place, avoiding a second
        // matrix product).
        let mut matrix = kernel.dot(&Array2::from_diag(&d).view());
        matrix
            .columns_mut()
            .into_iter()
            .zip(d.iter())
            .for_each(|(mut a, b)| a *= *b);

        // Move into the LAPACK-compatible representation for the eigensolver.
        let matrix = matrix.with_lapack();
        // Calculate the eigen decomposition sorted from largest to lowest
        #[cfg(feature = "blas")]
        let (vals, vecs) = {
            // ndarray-linalg returns eigenvalues in ascending order; reverse
            // both eigenvalues and eigenvector columns to get descending order.
            let (vals, vecs) = matrix.eigh_into(UPLO::Lower).unwrap();
            (
                vals.slice_move(s![..; -1]).mapv(Scalar::from_real),
                vecs.slice_move(s![.., ..; -1]),
            )
        };
        #[cfg(not(feature = "blas"))]
        let (vals, vecs) = matrix.eigh_into().unwrap().sort_eig_desc();
        // Drop the leading (trivial) eigenpair and keep the next
        // `embedding_size` pairs.
        (
            vals.slice_move(s![1..=embedding_size]),
            vecs.slice_move(s![.., 1..=embedding_size]),
        )
    } else {
        // Symmetrizing weights d^(1/2 + alpha) applied on both sides of the
        // kernel product inside the matrix-free operator below.
        let d2 = d.mapv(|x| x.powf(F::cast(0.5 + alpha)));
        // calculate truncated eigenvalue decomposition
        //
        // Initial subspace: caller-provided guess, or a deterministic random
        // matrix (fixed seed 31) with one extra column for the trivial
        // eigenpair that is cut away afterwards.
        let x = guess
            .unwrap_or_else(|| {
                Array2::random_using(
                    (kernel.size(), embedding_size + 1),
                    Uniform::new(0.0f64, 1.0),
                    &mut SmallRng::seed_from_u64(31),
                )
                .mapv(F::cast)
            })
            .with_lapack();

        // Matrix-free LOBPCG: the closure applies diag(d2) * K * diag(d2)
        // to a block of vectors without materializing the operator.
        let result = lobpcg::lobpcg(
            |y| {
                let mut y = y.to_owned().without_lapack();
                // Left scaling: multiply each row by the matching d2 entry.
                y.rows_mut()
                    .into_iter()
                    .zip(d2.iter())
                    .for_each(|(mut a, b)| a *= *b);
                let mut y = kernel.dot(&y.view());

                // Right scaling after the kernel product.
                y.rows_mut()
                    .into_iter()
                    .zip(d2.iter())
                    .for_each(|(mut a, b)| a *= *b);

                y.with_lapack()
            },
            x,
            |_| {},
            None,
            1e-7, // convergence tolerance
            200,  // maximum iterations
            TruncatedOrder::Largest,
        );

        // Accept both converged and partially-converged results; a result
        // without any eigenpairs is a hard failure.
        let (vals, vecs) = match result {
            #[cfg(feature = "blas")]
            LobpcgResult::Ok(vals, vecs, _) | LobpcgResult::Err(vals, vecs, _, _) => (vals, vecs),
            #[cfg(not(feature = "blas"))]
            LobpcgResult::Ok(lobpcg) | LobpcgResult::Err((_, Some(lobpcg))) => {
                (lobpcg.eigvals, lobpcg.eigvecs)
            }
            _ => panic!("Eigendecomposition failed!"),
        };

        // cut away first eigenvalue/eigenvector
        (vals.slice_move(s![1..]), vecs.slice_move(s![.., 1..]))
    };

    // Back out of the LAPACK representation into plain linfa floats.
    let (vals, mut vecs): (Array1<F>, _) = (vals.without_lapack(), vecs.without_lapack());
    // Undo the symmetric normalization: scale each eigenvector row by
    // d^(1/2) to recover eigenvectors of the (non-symmetric) diffusion
    // operator.
    let d = d.mapv(|x| x.sqrt());

    for (mut col, val) in vecs.rows_mut().into_iter().zip(d.iter()) {
        col *= *val;
    }

    // Apply `steps` diffusion steps: scale each coordinate by its
    // eigenvalue raised to the step count.
    let steps = F::cast(steps);
    for (mut vec, val) in vecs.columns_mut().into_iter().zip(vals.iter()) {
        vec *= val.powf(steps);
    }

    (vecs, vals)
}