sphereql_embed/
laplacian.rs

1//! Laplacian eigenmap projection — connectivity-preserving embedding on S².
2//!
3//! Standard PCA/kernel PCA maximize *variance* — the directions along which
4//! data spreads. On sparse, noise-heavy embeddings where signal lives in a
5//! handful of co-activated axes and the rest is low-amplitude noise, variance
6//! maximization pulls the projection toward the noise axes and collapses
7//! genuine category structure. Laplacian eigenmaps instead preserve
8//! *connectivity*: two points are "close" on the sphere if they share many
9//! active axes in the original space, regardless of how small those
10//! activations are in absolute magnitude.
11//!
12//! # Algorithm
13//!
14//! 1. For each embedding, extract the set of axes whose absolute weight
15//!    exceeds `active_threshold` (noise filter).
16//! 2. Build a pairwise Jaccard-similarity matrix W over those active sets.
17//! 3. Sparsify W to a mutual-k-nearest-neighbors graph (union semantics).
18//! 4. Compute the normalized affinity matrix W_norm = D^(-1/2) W D^(-1/2).
19//! 5. Find the top-3 *non-trivial* eigenvectors of W_norm via power iteration
20//!    with deflation against the trivial eigenvector (μ=1, ∝ sqrt(D)).
21//! 6. Project: for a new embedding, apply the Nyström out-of-sample formula
22//!    u_k(y) = (1/μ_k) Σ_j w_norm(y, j) · u_k(j), then L2-normalize the
23//!    three coordinates to land on S².
24//!
25//! # References
26//!
27//! * Belkin & Niyogi. "Laplacian Eigenmaps for Dimensionality Reduction and
28//!   Data Representation." *Neural Computation* 15 (2003) 1373–1396.
29//! * Bengio et al. "Out-of-Sample Extensions for LLE, Isomap, MDS, Eigenmaps,
30//!   and Spectral Clustering." *NIPS* 2003.
31
32use sphereql_core::SphericalPoint;
33
34use crate::projection::{
35    Projection, ProjectionError, SplitMix64, dot, normalize_vec, project_xyz_to_spherical,
36};
37use crate::types::{Embedding, ProjectedPoint, RadialStrategy};
38
39// ── Defaults ───────────────────────────────────────────────────────────
40
41/// Absolute-weight cutoff below which an axis is treated as noise and
42/// excluded from the active set used to compute Jaccard similarity.
43pub const DEFAULT_ACTIVE_THRESHOLD: f64 = 0.05;
44
45/// k in the k-NN graph sparsification step. Each point keeps edges to its
46/// top-k most similar neighbors; the graph is the *union* of those edges.
47pub const DEFAULT_K_NEIGHBORS: usize = 15;
48
49/// Floor on the degree of any graph node. Zero-degree nodes would produce
50/// NaN during D^(-1/2) normalization; this regularization keeps the spectrum
51/// well-defined on disconnected graphs.
52pub const DEGREE_REGULARIZATION: f64 = 1e-6;
53
54const MAX_POWER_ITERS: usize = 400;
55const POWER_ITER_TOL: f64 = 1e-10;
56const RNG_SEED: u64 = 0xC0FF_EECA_FE00;
57
58// ── Projection ─────────────────────────────────────────────────────────
59
60/// Laplacian-eigenmap projection: embeds concepts on S² by the spectral
61/// structure of the axis-co-activation graph rather than raw variance.
62///
63/// Fit cost: O(n² · d) to build the Jaccard graph plus O(n² · q · iters)
64/// for power iteration on the n×n affinity matrix. Memory: O(n²) during
65/// fit, O(n) per eigenvector after.
66///
67/// Project cost (Nyström out-of-sample): O(n · d) per query.
68#[derive(Clone)]
69pub struct LaplacianEigenmapProjection {
70    // ── Fit parameters ──
71    active_threshold: f64,
72    radial: RadialStrategy,
73    dim: usize,
74
75    // ── Fitted corpus state ──
76    /// For each corpus point, the sorted list of active-axis indices.
77    /// Used at query time to compute Jaccard similarity with new points.
78    corpus_active_sets: Vec<Vec<usize>>,
79    /// Sparsified degree of each corpus point: Σ_j W[i][j].
80    /// Used in the Nyström extension's D^(-1/2) normalization.
81    corpus_degrees: Vec<f64>,
82
83    /// Top-3 non-trivial eigenvectors of the normalized affinity W_norm,
84    /// in descending eigenvalue order. Each vector has length n.
85    eigenvectors: [Vec<f64>; 3],
86    /// Corresponding eigenvalues μ_1, μ_2, μ_3 ∈ (−1, 1). The trivial
87    /// μ_0 = 1 is excluded.
88    eigenvalues: [f64; 3],
89
90    /// Mean of |μ_k| across the three retained eigenvalues. A proxy for
91    /// how much "low-frequency" connectivity structure the embedding
92    /// captured. Reported in place of PCA's explained_variance_ratio for
93    /// API compatibility; see [`Self::connectivity_ratio`].
94    connectivity_ratio: f64,
95}
96
97impl LaplacianEigenmapProjection {
98    /// Fit with default parameters (`k=15`, `active_threshold=0.05`).
99    pub fn fit(embeddings: &[Embedding], radial: RadialStrategy) -> Result<Self, ProjectionError> {
100        Self::fit_with_params(
101            embeddings,
102            DEFAULT_K_NEIGHBORS,
103            DEFAULT_ACTIVE_THRESHOLD,
104            radial,
105        )
106    }
107
108    /// Fit with explicit graph parameters.
109    ///
110    /// - `k` — k-NN graph density. Higher k = more edges = smoother embedding,
111    ///   less sensitive to local noise, but also more blurred category
112    ///   boundaries. Typical range: 10–30.
113    /// - `active_threshold` — absolute-weight cutoff for the active-axis
114    ///   filter. Axes with |v_i| ≤ threshold contribute nothing to the
115    ///   similarity metric.
116    pub fn fit_with_params(
117        embeddings: &[Embedding],
118        k: usize,
119        active_threshold: f64,
120        radial: RadialStrategy,
121    ) -> Result<Self, ProjectionError> {
122        let n = embeddings.len();
123        if n < 4 {
124            return Err(ProjectionError::TooFewEmbeddings {
125                got: n,
126                required: 4,
127            });
128        }
129        let dim = embeddings[0].dimension();
130        if dim == 0 {
131            return Err(ProjectionError::DimensionTooLow {
132                got: dim,
133                required: 1,
134            });
135        }
136        let k = k.min(n - 1).max(1);
137
138        // 1. Active-axis sets (sorted for merge-style intersection).
139        let corpus_active_sets: Vec<Vec<usize>> = embeddings
140            .iter()
141            .map(|e| {
142                let mut idxs: Vec<usize> = e
143                    .values
144                    .iter()
145                    .enumerate()
146                    .filter_map(|(i, v)| (v.abs() > active_threshold).then_some(i))
147                    .collect();
148                idxs.sort_unstable();
149                idxs
150            })
151            .collect();
152
153        // 2–4. Sparse top-k graph construction.
154        //
155        // Previously this pass materialized four separate n×n matrices
156        // (`sim`, `keep`, `w`, then `w_norm`) totaling ~3.2 GB at
157        // n = 10 000. The only buffer the eigensolver actually reads
158        // is `w_norm`, so every intermediate above was pure RAM tax.
159        //
160        // New shape:
161        //   per-row top-k edge list  (≈ n · k entries, O(n · k) memory)
162        //     → union-symmetrize into a `HashSet<(i, j)>`
163        //     → compute degrees from the set
164        //     → scatter directly into one dense `w_norm` the
165        //       eigensolver can consume
166        //
167        // Memory: from 4 · n² · 8 ≈ 3.2 GB down to ~1 · n² · 8 + O(n·k)
168        // ≈ 800 MB + a few MB of scratch.
169        //
170        // Jaccard is computed once per unordered pair; the upper-
171        // triangular scan fills both rows' candidate lists in one step.
172
173        // Per-row top-k min-heap keyed on (sim, j) — keeps the
174        // *largest* k similarities by evicting the current minimum.
175        // Using `std::cmp::Reverse` converts the default max-heap into
176        // the min-heap we need.
177        use std::cmp::Reverse;
178        use std::collections::BinaryHeap;
179        #[derive(PartialEq)]
180        struct TopKEntry(f64, usize);
181        impl Eq for TopKEntry {}
182        impl PartialOrd for TopKEntry {
183            fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
184                Some(self.cmp(other))
185            }
186        }
187        impl Ord for TopKEntry {
188            fn cmp(&self, other: &Self) -> std::cmp::Ordering {
189                // Compare on sim first (total_cmp — NaN-safe), break
190                // ties on index so the order is deterministic.
191                self.0
192                    .total_cmp(&other.0)
193                    .then_with(|| self.1.cmp(&other.1))
194            }
195        }
196
197        let mut top_k_heaps: Vec<BinaryHeap<Reverse<TopKEntry>>> =
198            (0..n).map(|_| BinaryHeap::with_capacity(k + 1)).collect();
199
200        for i in 0..n {
201            for j in (i + 1)..n {
202                let s = jaccard_sorted(&corpus_active_sets[i], &corpus_active_sets[j]);
203                if s <= 0.0 {
204                    continue;
205                }
206                for (owner, other) in [(i, j), (j, i)] {
207                    let h = &mut top_k_heaps[owner];
208                    h.push(Reverse(TopKEntry(s, other)));
209                    if h.len() > k {
210                        h.pop();
211                    }
212                }
213            }
214        }
215
216        // Union-symmetrize the per-row top-k candidates into a dedup
217        // edge set. Jaccard is symmetric by construction, so we can
218        // recover the edge weight from the heap entry.
219        let mut edges: std::collections::HashMap<(usize, usize), f64> =
220            std::collections::HashMap::with_capacity(n * k);
221        for (i, heap) in top_k_heaps.into_iter().enumerate() {
222            for Reverse(TopKEntry(s, j)) in heap.into_iter() {
223                let key = if i < j { (i, j) } else { (j, i) };
224                edges.insert(key, s);
225            }
226        }
227
228        // Degrees: sum of weights on each node's incident edges.
229        let mut degrees = vec![0.0f64; n];
230        for (&(a, b), &s) in edges.iter() {
231            degrees[a] += s;
232            degrees[b] += s;
233        }
234
235        // Regularize isolated/near-isolated nodes so D^(-1/2) stays finite.
236        let safe_degrees: Vec<f64> = degrees
237            .iter()
238            .map(|&d| d.max(DEGREE_REGULARIZATION))
239            .collect();
240        let d_inv_sqrt: Vec<f64> = safe_degrees.iter().map(|&d| 1.0 / d.sqrt()).collect();
241
242        // 5. Scatter the symmetric W_norm = D^(-1/2) · W · D^(-1/2)
243        //    directly from the edge map. Cells with no edge stay 0.
244        let mut w_norm = vec![0.0f64; n * n];
245        for (&(a, b), &s) in edges.iter() {
246            let v = s * d_inv_sqrt[a] * d_inv_sqrt[b];
247            w_norm[a * n + b] = v;
248            w_norm[b * n + a] = v;
249        }
250
251        // 6. Trivial eigenvector of W_norm at μ=1 is proportional to sqrt(D).
252        //    Explicitly constructed so deflation against it pulls it out of
253        //    the subsequent power-iteration subspace.
254        let sum_d: f64 = safe_degrees.iter().sum();
255        let trivial_ev: Vec<f64> = safe_degrees
256            .iter()
257            .map(|&d| d.sqrt() / sum_d.sqrt())
258            .collect();
259
260        // 7. Top-3 non-trivial eigenvectors + eigenvalues.
261        let (eigenvectors_vec, eigenvalues_vec) =
262            top_k_symmetric_excluding(&w_norm, n, 3, &[trivial_ev]);
263
264        let eigenvalues: [f64; 3] = [eigenvalues_vec[0], eigenvalues_vec[1], eigenvalues_vec[2]];
265        let eigenvectors: [Vec<f64>; 3] = [
266            eigenvectors_vec[0].clone(),
267            eigenvectors_vec[1].clone(),
268            eigenvectors_vec[2].clone(),
269        ];
270
271        let connectivity_ratio =
272            (eigenvalues[0].abs() + eigenvalues[1].abs() + eigenvalues[2].abs()) / 3.0;
273
274        Ok(Self {
275            active_threshold,
276            radial,
277            dim,
278            corpus_active_sets,
279            corpus_degrees: safe_degrees,
280            eigenvectors,
281            eigenvalues,
282            connectivity_ratio,
283        })
284    }
285
286    /// Top-3 non-trivial eigenvalues of the normalized affinity matrix
287    /// (in descending order). Values close to 1.0 indicate strong block
288    /// structure along that spectral direction.
289    pub fn eigenvalues(&self) -> [f64; 3] {
290        self.eigenvalues
291    }
292
293    /// Scalar quality proxy analogous to PCA's explained-variance ratio.
294    ///
295    /// Defined as the mean of `|μ_k|` across the three retained eigenvalues;
296    /// bounded in [0, 1]. Higher = stronger low-frequency community structure
297    /// captured by the 3D embedding.
298    ///
299    /// *Not* directly comparable to PCA EVR — the quantities have different
300    /// mathematical meaning. Use as a signal of projection health for this
301    /// projection family specifically.
302    pub fn connectivity_ratio(&self) -> f64 {
303        self.connectivity_ratio
304    }
305
306    /// Returned for API compatibility with `PcaProjection::explained_variance_ratio`.
307    /// See [`Self::connectivity_ratio`] for semantics — the two are not the
308    /// same metric, but both live in [0, 1] and can feed EVR-adaptive
309    /// thresholds downstream.
310    pub fn explained_variance_ratio(&self) -> f64 {
311        self.connectivity_ratio
312    }
313
314    /// Nyström-extend the three retained eigenvectors to a new embedding y,
315    /// returning raw (unnormalized) 3D coordinates.
316    fn project_to_3d(&self, embedding: &Embedding) -> (f64, f64, f64) {
317        let mut active_y: Vec<usize> = embedding
318            .values
319            .iter()
320            .enumerate()
321            .filter_map(|(i, v)| (v.abs() > self.active_threshold).then_some(i))
322            .collect();
323        active_y.sort_unstable();
324
325        let n = self.corpus_active_sets.len();
326        let sims: Vec<f64> = self
327            .corpus_active_sets
328            .iter()
329            .map(|s| jaccard_sorted(&active_y, s))
330            .collect();
331
332        // Treat the query as a new graph node whose similarities to corpus
333        // points are the sims vector. Its degree is the sum of those sims.
334        let degree_y = sims.iter().sum::<f64>().max(DEGREE_REGULARIZATION);
335        let inv_sqrt_dy = 1.0 / degree_y.sqrt();
336
337        // Nyström extension: u_k(y) = (1/μ_k) Σ_j w_norm(y, j) · u_k(j).
338        // w_norm(y, j) = sims[j] / (sqrt(d_y) · sqrt(d_j)).
339        let mut coords = [0.0f64; 3];
340        for (k, (ev, &mu)) in self
341            .eigenvectors
342            .iter()
343            .zip(self.eigenvalues.iter())
344            .enumerate()
345        {
346            if mu.abs() < 1e-10 {
347                continue;
348            }
349            let mut s = 0.0;
350            for j in 0..n {
351                let w_norm_yj = sims[j] * inv_sqrt_dy / self.corpus_degrees[j].sqrt();
352                s += w_norm_yj * ev[j];
353            }
354            coords[k] = s / mu;
355        }
356        (coords[0], coords[1], coords[2])
357    }
358}
359
360impl Projection for LaplacianEigenmapProjection {
361    fn project(&self, embedding: &Embedding) -> SphericalPoint {
362        let (x, y, z) = self.project_to_3d(embedding);
363        let r = self.radial.compute(embedding.magnitude());
364        project_xyz_to_spherical(x, y, z, r)
365    }
366
367    fn project_rich(&self, embedding: &Embedding) -> ProjectedPoint {
368        let (x, y, z) = self.project_to_3d(embedding);
369        let r = self.radial.compute(embedding.magnitude());
370        let position = project_xyz_to_spherical(x, y, z, r);
371        let proj_mag = (x * x + y * y + z * z).sqrt();
372        // Certainty reflects whether the Nyström extension produced a
373        // non-degenerate coordinate. A zero-magnitude projection means the
374        // query had no active axes in common with the corpus — the placement
375        // is effectively arbitrary, so certainty should be low.
376        let certainty = proj_mag.tanh();
377        ProjectedPoint {
378            position,
379            certainty,
380            intensity: embedding.magnitude(),
381            projection_magnitude: proj_mag,
382        }
383    }
384
385    fn dimensionality(&self) -> usize {
386        self.dim
387    }
388}
389
390// ── Helpers ────────────────────────────────────────────────────────────
391
392/// Jaccard similarity between two sorted, deduplicated index sets.
393/// Merge-intersect in O(|a| + |b|).
394fn jaccard_sorted(a: &[usize], b: &[usize]) -> f64 {
395    if a.is_empty() && b.is_empty() {
396        return 0.0;
397    }
398    let mut ia = 0;
399    let mut ib = 0;
400    let mut intersection: usize = 0;
401    while ia < a.len() && ib < b.len() {
402        match a[ia].cmp(&b[ib]) {
403            std::cmp::Ordering::Equal => {
404                intersection += 1;
405                ia += 1;
406                ib += 1;
407            }
408            std::cmp::Ordering::Less => ia += 1,
409            std::cmp::Ordering::Greater => ib += 1,
410        }
411    }
412    let union = a.len() + b.len() - intersection;
413    if union == 0 {
414        0.0
415    } else {
416        intersection as f64 / union as f64
417    }
418}
419
420/// Power iteration with deflation on a symmetric n×n matrix (row-major).
421/// Finds the top-k eigenvectors orthogonal to everything in `exclude`.
422///
423/// Returns (vectors, values) in descending eigenvalue order.
424fn top_k_symmetric_excluding(
425    matrix: &[f64],
426    n: usize,
427    k: usize,
428    exclude: &[Vec<f64>],
429) -> (Vec<Vec<f64>>, Vec<f64>) {
430    let mut vectors: Vec<Vec<f64>> = Vec::with_capacity(k);
431    let mut values: Vec<f64> = Vec::with_capacity(k);
432    let mut rng = SplitMix64::new(RNG_SEED);
433
434    let matvec = |dst: &mut [f64], src: &[f64]| {
435        for (i, dst_i) in dst.iter_mut().enumerate() {
436            let row = i * n;
437            let mut s = 0.0;
438            for j in 0..n {
439                s += matrix[row + j] * src[j];
440            }
441            *dst_i = s;
442        }
443    };
444
445    for _ in 0..k {
446        let mut v: Vec<f64> = (0..n).map(|_| rng.normal()).collect();
447        // Initial orthogonalization against exclude + previously-found vectors.
448        for prev in exclude.iter().chain(vectors.iter()) {
449            let proj = dot(&v, prev);
450            for (vi, &pi) in v.iter_mut().zip(prev.iter()) {
451                *vi -= proj * pi;
452            }
453        }
454        normalize_vec(&mut v);
455        let mut eigenvalue = 0.0;
456
457        // Cached `matrix · v` from the previous iteration's Rayleigh
458        // step. Each iteration's `matrix · v_new = matrix · u_previous`,
459        // so we can skip one mat-vec per step after the first.
460        let mut mv_cache: Option<Vec<f64>> = None;
461
462        for _ in 0..MAX_POWER_ITERS {
463            let mut u = mv_cache.take().unwrap_or_else(|| {
464                let mut u = vec![0.0f64; n];
465                matvec(&mut u, &v);
466                u
467            });
468            // Deflate each iteration so drift back into the excluded subspace
469            // is continuously removed.
470            for prev in exclude.iter().chain(vectors.iter()) {
471                let proj = dot(&u, prev);
472                for (ui, &pi) in u.iter_mut().zip(prev.iter()) {
473                    *ui -= proj * pi;
474                }
475            }
476            let mag = normalize_vec(&mut u);
477            if mag < f64::EPSILON {
478                break;
479            }
480
481            // Rayleigh quotient: λ ≈ uᵀ · (matrix · u). Cache
482            // `matrix · u` for next iteration's start — halves the
483            // steady-state mat-vec cost.
484            let mut mv_next = vec![0.0f64; n];
485            matvec(&mut mv_next, &u);
486            eigenvalue = dot(&u, &mv_next);
487
488            let change = (1.0 - dot(&u, &v).abs()).max(0.0);
489            v = u;
490            mv_cache = Some(mv_next);
491            if change < POWER_ITER_TOL {
492                break;
493            }
494        }
495
496        vectors.push(v);
497        values.push(eigenvalue);
498    }
499
500    // Sort in descending eigenvalue order (power iteration usually finds
501    // them in that order already, but deflation drift can swap adjacent
502    // pairs on near-degenerate spectra).
503    let mut order: Vec<usize> = (0..vectors.len()).collect();
504    order.sort_by(|&a, &b| {
505        values[b]
506            .partial_cmp(&values[a])
507            .unwrap_or(std::cmp::Ordering::Equal)
508    });
509    let sorted_vectors: Vec<Vec<f64>> = order.iter().map(|&i| vectors[i].clone()).collect();
510    let sorted_values: Vec<f64> = order.iter().map(|&i| values[i]).collect();
511    (sorted_vectors, sorted_values)
512}
513
514// ── Tests ──────────────────────────────────────────────────────────────
515
516#[cfg(test)]
517mod tests {
518    use super::*;
519    use sphereql_core::angular_distance;
520
521    fn emb(vals: &[f64]) -> Embedding {
522        Embedding::new(vals.to_vec())
523    }
524
525    /// Three well-separated 8-dim clusters. Each cluster activates 3
526    /// distinct axes strongly; the remaining 5 dims carry ±0.02 noise.
527    fn three_cluster_corpus() -> Vec<Embedding> {
528        let noise = 0.02;
529        let mut corpus = Vec::new();
530
531        // Cluster A: axes 0, 1, 2
532        for i in 0..5 {
533            let delta = i as f64 * 0.01;
534            corpus.push(emb(&[
535                1.0 + delta,
536                0.8 - delta,
537                0.7 + delta,
538                noise,
539                -noise,
540                noise,
541                -noise,
542                noise,
543            ]));
544        }
545        // Cluster B: axes 3, 4, 5
546        for i in 0..5 {
547            let delta = i as f64 * 0.01;
548            corpus.push(emb(&[
549                noise,
550                -noise,
551                noise,
552                1.0 + delta,
553                0.8 - delta,
554                0.7 + delta,
555                -noise,
556                noise,
557            ]));
558        }
559        // Cluster C: axes 5, 6, 7 (shares axis 5 with B to exercise a bridge)
560        for i in 0..5 {
561            let delta = i as f64 * 0.01;
562            corpus.push(emb(&[
563                -noise,
564                noise,
565                -noise,
566                noise,
567                -noise,
568                0.9 + delta,
569                1.0 - delta,
570                0.8 + delta,
571            ]));
572        }
573        corpus
574    }
575
576    #[test]
577    fn jaccard_empty_sets_zero() {
578        assert_eq!(jaccard_sorted(&[], &[]), 0.0);
579        assert_eq!(jaccard_sorted(&[1, 2], &[]), 0.0);
580    }
581
582    #[test]
583    fn jaccard_identical_sets_one() {
584        assert!((jaccard_sorted(&[1, 2, 3], &[1, 2, 3]) - 1.0).abs() < 1e-12);
585    }
586
587    #[test]
588    fn jaccard_disjoint_sets_zero() {
589        assert_eq!(jaccard_sorted(&[1, 2], &[3, 4]), 0.0);
590    }
591
592    #[test]
593    fn jaccard_partial_overlap() {
594        // {1,2,3} ∩ {2,3,4} = {2,3} (2), union = {1,2,3,4} (4) → 0.5
595        assert!((jaccard_sorted(&[1, 2, 3], &[2, 3, 4]) - 0.5).abs() < 1e-12);
596    }
597
598    #[test]
599    fn fit_produces_non_trivial_eigenvalues() {
600        let corpus = three_cluster_corpus();
601        let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
602        let [m1, m2, m3] = lap.eigenvalues();
603        // All retained eigenvalues must be strictly below 1 (trivial excluded)
604        // and above 0 (the 3-cluster structure should produce meaningful
605        // non-zero connectivity eigenvalues).
606        assert!(m1 < 1.0 && m1 > 0.0, "μ_1 = {m1}");
607        assert!(m2 < 1.0 && m2 > -1.0, "μ_2 = {m2}");
608        assert!(m3 < 1.0 && m3 > -1.0, "μ_3 = {m3}");
609        // Descending order.
610        assert!(m1 >= m2 - 1e-10);
611        assert!(m2 >= m3 - 1e-10);
612    }
613
614    #[test]
615    fn connectivity_ratio_in_unit_interval() {
616        let corpus = three_cluster_corpus();
617        let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
618        let r = lap.connectivity_ratio();
619        assert!((0.0..=1.0).contains(&r), "connectivity_ratio = {r}");
620        assert_eq!(r, lap.explained_variance_ratio());
621    }
622
623    #[test]
624    fn projection_lands_on_unit_sphere() {
625        let corpus = three_cluster_corpus();
626        let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
627        for e in &corpus {
628            let sp = lap.project(e);
629            assert!((sp.r - 1.0).abs() < 1e-9, "r = {}", sp.r);
630            assert!(sp.theta >= 0.0 && sp.theta <= std::f64::consts::TAU);
631            assert!(sp.phi >= 0.0 && sp.phi <= std::f64::consts::PI);
632        }
633    }
634
635    #[test]
636    fn same_cluster_points_closer_than_cross_cluster() {
637        let corpus = three_cluster_corpus();
638        let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
639        let positions: Vec<SphericalPoint> = corpus.iter().map(|e| lap.project(e)).collect();
640
641        // Within cluster A (indices 0..5): max pairwise distance.
642        let mut max_within_a = 0.0f64;
643        for i in 0..5 {
644            for j in (i + 1)..5 {
645                max_within_a = max_within_a.max(angular_distance(&positions[i], &positions[j]));
646            }
647        }
648        // A vs C (indices 0..5 vs 10..15): min pairwise distance. A and C
649        // share no axes, so C should be the farthest-out of the three
650        // clusters from A.
651        let mut min_a_to_c = f64::INFINITY;
652        for i in 0..5 {
653            for j in 10..15 {
654                min_a_to_c = min_a_to_c.min(angular_distance(&positions[i], &positions[j]));
655            }
656        }
657        assert!(
658            max_within_a < min_a_to_c,
659            "within-A max ({max_within_a}) should be less than A-to-C min ({min_a_to_c})"
660        );
661    }
662
663    #[test]
664    fn nystrom_roundtrip_approximates_training_position() {
665        // A corpus point fed back through project() should land near its
666        // Nyström-implied position. We don't demand exact recovery (the
667        // Nyström extension differs subtly from the in-sample eigenvector
668        // coord because Jaccard(x, x) = 1 but W[i][i] = 0 during fit), but
669        // the same corpus point should project consistently across calls.
670        let corpus = three_cluster_corpus();
671        let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
672        let first = lap.project(&corpus[0]);
673        let again = lap.project(&corpus[0]);
674        assert!((first.theta - again.theta).abs() < 1e-12);
675        assert!((first.phi - again.phi).abs() < 1e-12);
676    }
677
678    #[test]
679    fn new_point_in_known_cluster_routes_to_cluster_region() {
680        let corpus = three_cluster_corpus();
681        let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
682        // Synthesize a fresh "cluster-A" point and check it lands closer to
683        // an existing cluster-A member than to a cluster-C member.
684        let query = emb(&[1.0, 0.8, 0.7, 0.02, -0.02, 0.02, -0.02, 0.02]);
685        let q = lap.project(&query);
686        let a_member = lap.project(&corpus[0]);
687        let c_member = lap.project(&corpus[10]);
688        let d_to_a = angular_distance(&q, &a_member);
689        let d_to_c = angular_distance(&q, &c_member);
690        assert!(
691            d_to_a < d_to_c,
692            "query→A {d_to_a} should be less than query→C {d_to_c}"
693        );
694    }
695
696    #[test]
697    fn dimensionality_matches_input() {
698        let corpus = three_cluster_corpus();
699        let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
700        assert_eq!(lap.dimensionality(), 8);
701    }
702
703    #[test]
704    fn fit_rejects_tiny_corpus() {
705        let corpus = vec![emb(&[1.0, 0.0]), emb(&[0.0, 1.0])];
706        assert!(matches!(
707            LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)),
708            Err(ProjectionError::TooFewEmbeddings {
709                got: 2,
710                required: 4
711            })
712        ));
713    }
714
715    #[test]
716    fn project_rich_certainty_in_range() {
717        let corpus = three_cluster_corpus();
718        let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
719        for e in &corpus {
720            let pp = lap.project_rich(e);
721            assert!(pp.certainty >= 0.0 && pp.certainty <= 1.0);
722        }
723    }
724
725    #[test]
726    fn disconnected_query_gracefully_placed() {
727        // A query with no active axes (all below threshold) has no similarity
728        // to the corpus. It should still produce a valid SphericalPoint
729        // rather than NaN.
730        let corpus = three_cluster_corpus();
731        let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
732        let query = emb(&[0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001]);
733        let sp = lap.project(&query);
734        assert!(sp.r.is_finite());
735        assert!(sp.theta.is_finite());
736        assert!(sp.phi.is_finite());
737    }
738}
sphereql_embed/laplacian.rs

sphereql_embed/
laplacian.rs