sphereql_embed/laplacian.rs
1//! Laplacian eigenmap projection — connectivity-preserving embedding on S².
2//!
3//! Standard PCA/kernel PCA maximize *variance* — the directions along which
4//! data spreads. On sparse, noise-heavy embeddings where signal lives in a
5//! handful of co-activated axes and the rest is low-amplitude noise, variance
6//! maximization pulls the projection toward the noise axes and collapses
7//! genuine category structure. Laplacian eigenmaps instead preserve
8//! *connectivity*: two points are "close" on the sphere if they share many
9//! active axes in the original space, regardless of how small those
10//! activations are in absolute magnitude.
11//!
12//! # Algorithm
13//!
14//! 1. For each embedding, extract the set of axes whose absolute weight
15//! exceeds `active_threshold` (noise filter).
16//! 2. Build a pairwise Jaccard-similarity matrix W over those active sets.
17//! 3. Sparsify W to a mutual-k-nearest-neighbors graph (union semantics).
18//! 4. Compute the normalized affinity matrix W_norm = D^(-1/2) W D^(-1/2).
19//! 5. Find the top-3 *non-trivial* eigenvectors of W_norm via power iteration
20//! with deflation against the trivial eigenvector (μ=1, ∝ sqrt(D)).
21//! 6. Project: for a new embedding, apply the Nyström out-of-sample formula
22//! u_k(y) = (1/μ_k) Σ_j w_norm(y, j) · u_k(j), then L2-normalize the
23//! three coordinates to land on S².
24//!
25//! # References
26//!
27//! * Belkin & Niyogi. "Laplacian Eigenmaps for Dimensionality Reduction and
28//! Data Representation." *Neural Computation* 15 (2003) 1373–1396.
29//! * Bengio et al. "Out-of-Sample Extensions for LLE, Isomap, MDS, Eigenmaps,
30//! and Spectral Clustering." *NIPS* 2003.
31
32use sphereql_core::SphericalPoint;
33
34use crate::projection::{
35 Projection, ProjectionError, SplitMix64, dot, normalize_vec, project_xyz_to_spherical,
36};
37use crate::types::{Embedding, ProjectedPoint, RadialStrategy};
38
39// ── Defaults ───────────────────────────────────────────────────────────
40
41/// Absolute-weight cutoff below which an axis is treated as noise and
42/// excluded from the active set used to compute Jaccard similarity.
43pub const DEFAULT_ACTIVE_THRESHOLD: f64 = 0.05;
44
45/// k in the k-NN graph sparsification step. Each point keeps edges to its
46/// top-k most similar neighbors; the graph is the *union* of those edges.
47pub const DEFAULT_K_NEIGHBORS: usize = 15;
48
49/// Floor on the degree of any graph node. Zero-degree nodes would produce
50/// NaN during D^(-1/2) normalization; this regularization keeps the spectrum
51/// well-defined on disconnected graphs.
52pub const DEGREE_REGULARIZATION: f64 = 1e-6;
53
54const MAX_POWER_ITERS: usize = 400;
55const POWER_ITER_TOL: f64 = 1e-10;
56const RNG_SEED: u64 = 0xC0FF_EECA_FE00;
57
58// ── Projection ─────────────────────────────────────────────────────────
59
60/// Laplacian-eigenmap projection: embeds concepts on S² by the spectral
61/// structure of the axis-co-activation graph rather than raw variance.
62///
63/// Fit cost: O(n² · d) to build the Jaccard graph plus O(n² · q · iters)
64/// for power iteration on the n×n affinity matrix. Memory: O(n²) during
65/// fit, O(n) per eigenvector after.
66///
67/// Project cost (Nyström out-of-sample): O(n · d) per query.
68#[derive(Clone)]
69pub struct LaplacianEigenmapProjection {
70 // ── Fit parameters ──
71 active_threshold: f64,
72 radial: RadialStrategy,
73 dim: usize,
74
75 // ── Fitted corpus state ──
76 /// For each corpus point, the sorted list of active-axis indices.
77 /// Used at query time to compute Jaccard similarity with new points.
78 corpus_active_sets: Vec<Vec<usize>>,
79 /// Sparsified degree of each corpus point: Σ_j W[i][j].
80 /// Used in the Nyström extension's D^(-1/2) normalization.
81 corpus_degrees: Vec<f64>,
82
83 /// Top-3 non-trivial eigenvectors of the normalized affinity W_norm,
84 /// in descending eigenvalue order. Each vector has length n.
85 eigenvectors: [Vec<f64>; 3],
86 /// Corresponding eigenvalues μ_1, μ_2, μ_3 ∈ (−1, 1). The trivial
87 /// μ_0 = 1 is excluded.
88 eigenvalues: [f64; 3],
89
90 /// Mean of |μ_k| across the three retained eigenvalues. A proxy for
91 /// how much "low-frequency" connectivity structure the embedding
92 /// captured. Reported in place of PCA's explained_variance_ratio for
93 /// API compatibility; see [`Self::connectivity_ratio`].
94 connectivity_ratio: f64,
95}
96
97impl LaplacianEigenmapProjection {
98 /// Fit with default parameters (`k=15`, `active_threshold=0.05`).
99 pub fn fit(embeddings: &[Embedding], radial: RadialStrategy) -> Result<Self, ProjectionError> {
100 Self::fit_with_params(
101 embeddings,
102 DEFAULT_K_NEIGHBORS,
103 DEFAULT_ACTIVE_THRESHOLD,
104 radial,
105 )
106 }
107
108 /// Fit with explicit graph parameters.
109 ///
110 /// - `k` — k-NN graph density. Higher k = more edges = smoother embedding,
111 /// less sensitive to local noise, but also more blurred category
112 /// boundaries. Typical range: 10–30.
113 /// - `active_threshold` — absolute-weight cutoff for the active-axis
114 /// filter. Axes with |v_i| ≤ threshold contribute nothing to the
115 /// similarity metric.
116 pub fn fit_with_params(
117 embeddings: &[Embedding],
118 k: usize,
119 active_threshold: f64,
120 radial: RadialStrategy,
121 ) -> Result<Self, ProjectionError> {
122 let n = embeddings.len();
123 if n < 4 {
124 return Err(ProjectionError::TooFewEmbeddings {
125 got: n,
126 required: 4,
127 });
128 }
129 let dim = embeddings[0].dimension();
130 if dim == 0 {
131 return Err(ProjectionError::DimensionTooLow {
132 got: dim,
133 required: 1,
134 });
135 }
136 let k = k.min(n - 1).max(1);
137
138 // 1. Active-axis sets (sorted for merge-style intersection).
139 let corpus_active_sets: Vec<Vec<usize>> = embeddings
140 .iter()
141 .map(|e| {
142 let mut idxs: Vec<usize> = e
143 .values
144 .iter()
145 .enumerate()
146 .filter_map(|(i, v)| (v.abs() > active_threshold).then_some(i))
147 .collect();
148 idxs.sort_unstable();
149 idxs
150 })
151 .collect();
152
153 // 2–4. Sparse top-k graph construction.
154 //
155 // Previously this pass materialized four separate n×n matrices
156 // (`sim`, `keep`, `w`, then `w_norm`) totaling ~3.2 GB at
157 // n = 10 000. The only buffer the eigensolver actually reads
158 // is `w_norm`, so every intermediate above was pure RAM tax.
159 //
160 // New shape:
161 // per-row top-k edge list (≈ n · k entries, O(n · k) memory)
162 // → union-symmetrize into a `HashSet<(i, j)>`
163 // → compute degrees from the set
164 // → scatter directly into one dense `w_norm` the
165 // eigensolver can consume
166 //
167 // Memory: from 4 · n² · 8 ≈ 3.2 GB down to ~1 · n² · 8 + O(n·k)
168 // ≈ 800 MB + a few MB of scratch.
169 //
170 // Jaccard is computed once per unordered pair; the upper-
171 // triangular scan fills both rows' candidate lists in one step.
172
173 // Per-row top-k min-heap keyed on (sim, j) — keeps the
174 // *largest* k similarities by evicting the current minimum.
175 // Using `std::cmp::Reverse` converts the default max-heap into
176 // the min-heap we need.
177 use std::cmp::Reverse;
178 use std::collections::BinaryHeap;
179 #[derive(PartialEq)]
180 struct TopKEntry(f64, usize);
181 impl Eq for TopKEntry {}
182 impl PartialOrd for TopKEntry {
183 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
184 Some(self.cmp(other))
185 }
186 }
187 impl Ord for TopKEntry {
188 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
189 // Compare on sim first (total_cmp — NaN-safe), break
190 // ties on index so the order is deterministic.
191 self.0
192 .total_cmp(&other.0)
193 .then_with(|| self.1.cmp(&other.1))
194 }
195 }
196
197 let mut top_k_heaps: Vec<BinaryHeap<Reverse<TopKEntry>>> =
198 (0..n).map(|_| BinaryHeap::with_capacity(k + 1)).collect();
199
200 for i in 0..n {
201 for j in (i + 1)..n {
202 let s = jaccard_sorted(&corpus_active_sets[i], &corpus_active_sets[j]);
203 if s <= 0.0 {
204 continue;
205 }
206 for (owner, other) in [(i, j), (j, i)] {
207 let h = &mut top_k_heaps[owner];
208 h.push(Reverse(TopKEntry(s, other)));
209 if h.len() > k {
210 h.pop();
211 }
212 }
213 }
214 }
215
216 // Union-symmetrize the per-row top-k candidates into a dedup
217 // edge set. Jaccard is symmetric by construction, so we can
218 // recover the edge weight from the heap entry.
219 let mut edges: std::collections::HashMap<(usize, usize), f64> =
220 std::collections::HashMap::with_capacity(n * k);
221 for (i, heap) in top_k_heaps.into_iter().enumerate() {
222 for Reverse(TopKEntry(s, j)) in heap.into_iter() {
223 let key = if i < j { (i, j) } else { (j, i) };
224 edges.insert(key, s);
225 }
226 }
227
228 // Degrees: sum of weights on each node's incident edges.
229 let mut degrees = vec![0.0f64; n];
230 for (&(a, b), &s) in edges.iter() {
231 degrees[a] += s;
232 degrees[b] += s;
233 }
234
235 // Regularize isolated/near-isolated nodes so D^(-1/2) stays finite.
236 let safe_degrees: Vec<f64> = degrees
237 .iter()
238 .map(|&d| d.max(DEGREE_REGULARIZATION))
239 .collect();
240 let d_inv_sqrt: Vec<f64> = safe_degrees.iter().map(|&d| 1.0 / d.sqrt()).collect();
241
242 // 5. Scatter the symmetric W_norm = D^(-1/2) · W · D^(-1/2)
243 // directly from the edge map. Cells with no edge stay 0.
244 let mut w_norm = vec![0.0f64; n * n];
245 for (&(a, b), &s) in edges.iter() {
246 let v = s * d_inv_sqrt[a] * d_inv_sqrt[b];
247 w_norm[a * n + b] = v;
248 w_norm[b * n + a] = v;
249 }
250
251 // 6. Trivial eigenvector of W_norm at μ=1 is proportional to sqrt(D).
252 // Explicitly constructed so deflation against it pulls it out of
253 // the subsequent power-iteration subspace.
254 let sum_d: f64 = safe_degrees.iter().sum();
255 let trivial_ev: Vec<f64> = safe_degrees
256 .iter()
257 .map(|&d| d.sqrt() / sum_d.sqrt())
258 .collect();
259
260 // 7. Top-3 non-trivial eigenvectors + eigenvalues.
261 let (eigenvectors_vec, eigenvalues_vec) =
262 top_k_symmetric_excluding(&w_norm, n, 3, &[trivial_ev]);
263
264 let eigenvalues: [f64; 3] = [eigenvalues_vec[0], eigenvalues_vec[1], eigenvalues_vec[2]];
265 let eigenvectors: [Vec<f64>; 3] = [
266 eigenvectors_vec[0].clone(),
267 eigenvectors_vec[1].clone(),
268 eigenvectors_vec[2].clone(),
269 ];
270
271 let connectivity_ratio =
272 (eigenvalues[0].abs() + eigenvalues[1].abs() + eigenvalues[2].abs()) / 3.0;
273
274 Ok(Self {
275 active_threshold,
276 radial,
277 dim,
278 corpus_active_sets,
279 corpus_degrees: safe_degrees,
280 eigenvectors,
281 eigenvalues,
282 connectivity_ratio,
283 })
284 }
285
286 /// Top-3 non-trivial eigenvalues of the normalized affinity matrix
287 /// (in descending order). Values close to 1.0 indicate strong block
288 /// structure along that spectral direction.
289 pub fn eigenvalues(&self) -> [f64; 3] {
290 self.eigenvalues
291 }
292
293 /// Scalar quality proxy analogous to PCA's explained-variance ratio.
294 ///
295 /// Defined as the mean of `|μ_k|` across the three retained eigenvalues;
296 /// bounded in [0, 1]. Higher = stronger low-frequency community structure
297 /// captured by the 3D embedding.
298 ///
299 /// *Not* directly comparable to PCA EVR — the quantities have different
300 /// mathematical meaning. Use as a signal of projection health for this
301 /// projection family specifically.
302 pub fn connectivity_ratio(&self) -> f64 {
303 self.connectivity_ratio
304 }
305
306 /// Returned for API compatibility with `PcaProjection::explained_variance_ratio`.
307 /// See [`Self::connectivity_ratio`] for semantics — the two are not the
308 /// same metric, but both live in [0, 1] and can feed EVR-adaptive
309 /// thresholds downstream.
310 pub fn explained_variance_ratio(&self) -> f64 {
311 self.connectivity_ratio
312 }
313
314 /// Nyström-extend the three retained eigenvectors to a new embedding y,
315 /// returning raw (unnormalized) 3D coordinates.
316 fn project_to_3d(&self, embedding: &Embedding) -> (f64, f64, f64) {
317 let mut active_y: Vec<usize> = embedding
318 .values
319 .iter()
320 .enumerate()
321 .filter_map(|(i, v)| (v.abs() > self.active_threshold).then_some(i))
322 .collect();
323 active_y.sort_unstable();
324
325 let n = self.corpus_active_sets.len();
326 let sims: Vec<f64> = self
327 .corpus_active_sets
328 .iter()
329 .map(|s| jaccard_sorted(&active_y, s))
330 .collect();
331
332 // Treat the query as a new graph node whose similarities to corpus
333 // points are the sims vector. Its degree is the sum of those sims.
334 let degree_y = sims.iter().sum::<f64>().max(DEGREE_REGULARIZATION);
335 let inv_sqrt_dy = 1.0 / degree_y.sqrt();
336
337 // Nyström extension: u_k(y) = (1/μ_k) Σ_j w_norm(y, j) · u_k(j).
338 // w_norm(y, j) = sims[j] / (sqrt(d_y) · sqrt(d_j)).
339 let mut coords = [0.0f64; 3];
340 for (k, (ev, &mu)) in self
341 .eigenvectors
342 .iter()
343 .zip(self.eigenvalues.iter())
344 .enumerate()
345 {
346 if mu.abs() < 1e-10 {
347 continue;
348 }
349 let mut s = 0.0;
350 for j in 0..n {
351 let w_norm_yj = sims[j] * inv_sqrt_dy / self.corpus_degrees[j].sqrt();
352 s += w_norm_yj * ev[j];
353 }
354 coords[k] = s / mu;
355 }
356 (coords[0], coords[1], coords[2])
357 }
358}
359
360impl Projection for LaplacianEigenmapProjection {
361 fn project(&self, embedding: &Embedding) -> SphericalPoint {
362 let (x, y, z) = self.project_to_3d(embedding);
363 let r = self.radial.compute(embedding.magnitude());
364 project_xyz_to_spherical(x, y, z, r)
365 }
366
367 fn project_rich(&self, embedding: &Embedding) -> ProjectedPoint {
368 let (x, y, z) = self.project_to_3d(embedding);
369 let r = self.radial.compute(embedding.magnitude());
370 let position = project_xyz_to_spherical(x, y, z, r);
371 let proj_mag = (x * x + y * y + z * z).sqrt();
372 // Certainty reflects whether the Nyström extension produced a
373 // non-degenerate coordinate. A zero-magnitude projection means the
374 // query had no active axes in common with the corpus — the placement
375 // is effectively arbitrary, so certainty should be low.
376 let certainty = proj_mag.tanh();
377 ProjectedPoint {
378 position,
379 certainty,
380 intensity: embedding.magnitude(),
381 projection_magnitude: proj_mag,
382 }
383 }
384
385 fn dimensionality(&self) -> usize {
386 self.dim
387 }
388}
389
390// ── Helpers ────────────────────────────────────────────────────────────
391
392/// Jaccard similarity between two sorted, deduplicated index sets.
393/// Merge-intersect in O(|a| + |b|).
394fn jaccard_sorted(a: &[usize], b: &[usize]) -> f64 {
395 if a.is_empty() && b.is_empty() {
396 return 0.0;
397 }
398 let mut ia = 0;
399 let mut ib = 0;
400 let mut intersection: usize = 0;
401 while ia < a.len() && ib < b.len() {
402 match a[ia].cmp(&b[ib]) {
403 std::cmp::Ordering::Equal => {
404 intersection += 1;
405 ia += 1;
406 ib += 1;
407 }
408 std::cmp::Ordering::Less => ia += 1,
409 std::cmp::Ordering::Greater => ib += 1,
410 }
411 }
412 let union = a.len() + b.len() - intersection;
413 if union == 0 {
414 0.0
415 } else {
416 intersection as f64 / union as f64
417 }
418}
419
420/// Power iteration with deflation on a symmetric n×n matrix (row-major).
421/// Finds the top-k eigenvectors orthogonal to everything in `exclude`.
422///
423/// Returns (vectors, values) in descending eigenvalue order.
424fn top_k_symmetric_excluding(
425 matrix: &[f64],
426 n: usize,
427 k: usize,
428 exclude: &[Vec<f64>],
429) -> (Vec<Vec<f64>>, Vec<f64>) {
430 let mut vectors: Vec<Vec<f64>> = Vec::with_capacity(k);
431 let mut values: Vec<f64> = Vec::with_capacity(k);
432 let mut rng = SplitMix64::new(RNG_SEED);
433
434 let matvec = |dst: &mut [f64], src: &[f64]| {
435 for (i, dst_i) in dst.iter_mut().enumerate() {
436 let row = i * n;
437 let mut s = 0.0;
438 for j in 0..n {
439 s += matrix[row + j] * src[j];
440 }
441 *dst_i = s;
442 }
443 };
444
445 for _ in 0..k {
446 let mut v: Vec<f64> = (0..n).map(|_| rng.normal()).collect();
447 // Initial orthogonalization against exclude + previously-found vectors.
448 for prev in exclude.iter().chain(vectors.iter()) {
449 let proj = dot(&v, prev);
450 for (vi, &pi) in v.iter_mut().zip(prev.iter()) {
451 *vi -= proj * pi;
452 }
453 }
454 normalize_vec(&mut v);
455 let mut eigenvalue = 0.0;
456
457 // Cached `matrix · v` from the previous iteration's Rayleigh
458 // step. Each iteration's `matrix · v_new = matrix · u_previous`,
459 // so we can skip one mat-vec per step after the first.
460 let mut mv_cache: Option<Vec<f64>> = None;
461
462 for _ in 0..MAX_POWER_ITERS {
463 let mut u = mv_cache.take().unwrap_or_else(|| {
464 let mut u = vec![0.0f64; n];
465 matvec(&mut u, &v);
466 u
467 });
468 // Deflate each iteration so drift back into the excluded subspace
469 // is continuously removed.
470 for prev in exclude.iter().chain(vectors.iter()) {
471 let proj = dot(&u, prev);
472 for (ui, &pi) in u.iter_mut().zip(prev.iter()) {
473 *ui -= proj * pi;
474 }
475 }
476 let mag = normalize_vec(&mut u);
477 if mag < f64::EPSILON {
478 break;
479 }
480
481 // Rayleigh quotient: λ ≈ uᵀ · (matrix · u). Cache
482 // `matrix · u` for next iteration's start — halves the
483 // steady-state mat-vec cost.
484 let mut mv_next = vec![0.0f64; n];
485 matvec(&mut mv_next, &u);
486 eigenvalue = dot(&u, &mv_next);
487
488 let change = (1.0 - dot(&u, &v).abs()).max(0.0);
489 v = u;
490 mv_cache = Some(mv_next);
491 if change < POWER_ITER_TOL {
492 break;
493 }
494 }
495
496 vectors.push(v);
497 values.push(eigenvalue);
498 }
499
500 // Sort in descending eigenvalue order (power iteration usually finds
501 // them in that order already, but deflation drift can swap adjacent
502 // pairs on near-degenerate spectra).
503 let mut order: Vec<usize> = (0..vectors.len()).collect();
504 order.sort_by(|&a, &b| {
505 values[b]
506 .partial_cmp(&values[a])
507 .unwrap_or(std::cmp::Ordering::Equal)
508 });
509 let sorted_vectors: Vec<Vec<f64>> = order.iter().map(|&i| vectors[i].clone()).collect();
510 let sorted_values: Vec<f64> = order.iter().map(|&i| values[i]).collect();
511 (sorted_vectors, sorted_values)
512}
513
514// ── Tests ──────────────────────────────────────────────────────────────
515
516#[cfg(test)]
517mod tests {
518 use super::*;
519 use sphereql_core::angular_distance;
520
521 fn emb(vals: &[f64]) -> Embedding {
522 Embedding::new(vals.to_vec())
523 }
524
525 /// Three well-separated 8-dim clusters. Each cluster activates 3
526 /// distinct axes strongly; the remaining 5 dims carry ±0.02 noise.
527 fn three_cluster_corpus() -> Vec<Embedding> {
528 let noise = 0.02;
529 let mut corpus = Vec::new();
530
531 // Cluster A: axes 0, 1, 2
532 for i in 0..5 {
533 let delta = i as f64 * 0.01;
534 corpus.push(emb(&[
535 1.0 + delta,
536 0.8 - delta,
537 0.7 + delta,
538 noise,
539 -noise,
540 noise,
541 -noise,
542 noise,
543 ]));
544 }
545 // Cluster B: axes 3, 4, 5
546 for i in 0..5 {
547 let delta = i as f64 * 0.01;
548 corpus.push(emb(&[
549 noise,
550 -noise,
551 noise,
552 1.0 + delta,
553 0.8 - delta,
554 0.7 + delta,
555 -noise,
556 noise,
557 ]));
558 }
559 // Cluster C: axes 5, 6, 7 (shares axis 5 with B to exercise a bridge)
560 for i in 0..5 {
561 let delta = i as f64 * 0.01;
562 corpus.push(emb(&[
563 -noise,
564 noise,
565 -noise,
566 noise,
567 -noise,
568 0.9 + delta,
569 1.0 - delta,
570 0.8 + delta,
571 ]));
572 }
573 corpus
574 }
575
576 #[test]
577 fn jaccard_empty_sets_zero() {
578 assert_eq!(jaccard_sorted(&[], &[]), 0.0);
579 assert_eq!(jaccard_sorted(&[1, 2], &[]), 0.0);
580 }
581
582 #[test]
583 fn jaccard_identical_sets_one() {
584 assert!((jaccard_sorted(&[1, 2, 3], &[1, 2, 3]) - 1.0).abs() < 1e-12);
585 }
586
587 #[test]
588 fn jaccard_disjoint_sets_zero() {
589 assert_eq!(jaccard_sorted(&[1, 2], &[3, 4]), 0.0);
590 }
591
592 #[test]
593 fn jaccard_partial_overlap() {
594 // {1,2,3} ∩ {2,3,4} = {2,3} (2), union = {1,2,3,4} (4) → 0.5
595 assert!((jaccard_sorted(&[1, 2, 3], &[2, 3, 4]) - 0.5).abs() < 1e-12);
596 }
597
598 #[test]
599 fn fit_produces_non_trivial_eigenvalues() {
600 let corpus = three_cluster_corpus();
601 let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
602 let [m1, m2, m3] = lap.eigenvalues();
603 // All retained eigenvalues must be strictly below 1 (trivial excluded)
604 // and above 0 (the 3-cluster structure should produce meaningful
605 // non-zero connectivity eigenvalues).
606 assert!(m1 < 1.0 && m1 > 0.0, "μ_1 = {m1}");
607 assert!(m2 < 1.0 && m2 > -1.0, "μ_2 = {m2}");
608 assert!(m3 < 1.0 && m3 > -1.0, "μ_3 = {m3}");
609 // Descending order.
610 assert!(m1 >= m2 - 1e-10);
611 assert!(m2 >= m3 - 1e-10);
612 }
613
614 #[test]
615 fn connectivity_ratio_in_unit_interval() {
616 let corpus = three_cluster_corpus();
617 let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
618 let r = lap.connectivity_ratio();
619 assert!((0.0..=1.0).contains(&r), "connectivity_ratio = {r}");
620 assert_eq!(r, lap.explained_variance_ratio());
621 }
622
623 #[test]
624 fn projection_lands_on_unit_sphere() {
625 let corpus = three_cluster_corpus();
626 let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
627 for e in &corpus {
628 let sp = lap.project(e);
629 assert!((sp.r - 1.0).abs() < 1e-9, "r = {}", sp.r);
630 assert!(sp.theta >= 0.0 && sp.theta <= std::f64::consts::TAU);
631 assert!(sp.phi >= 0.0 && sp.phi <= std::f64::consts::PI);
632 }
633 }
634
635 #[test]
636 fn same_cluster_points_closer_than_cross_cluster() {
637 let corpus = three_cluster_corpus();
638 let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
639 let positions: Vec<SphericalPoint> = corpus.iter().map(|e| lap.project(e)).collect();
640
641 // Within cluster A (indices 0..5): max pairwise distance.
642 let mut max_within_a = 0.0f64;
643 for i in 0..5 {
644 for j in (i + 1)..5 {
645 max_within_a = max_within_a.max(angular_distance(&positions[i], &positions[j]));
646 }
647 }
648 // A vs C (indices 0..5 vs 10..15): min pairwise distance. A and C
649 // share no axes, so C should be the farthest-out of the three
650 // clusters from A.
651 let mut min_a_to_c = f64::INFINITY;
652 for i in 0..5 {
653 for j in 10..15 {
654 min_a_to_c = min_a_to_c.min(angular_distance(&positions[i], &positions[j]));
655 }
656 }
657 assert!(
658 max_within_a < min_a_to_c,
659 "within-A max ({max_within_a}) should be less than A-to-C min ({min_a_to_c})"
660 );
661 }
662
663 #[test]
664 fn nystrom_roundtrip_approximates_training_position() {
665 // A corpus point fed back through project() should land near its
666 // Nyström-implied position. We don't demand exact recovery (the
667 // Nyström extension differs subtly from the in-sample eigenvector
668 // coord because Jaccard(x, x) = 1 but W[i][i] = 0 during fit), but
669 // the same corpus point should project consistently across calls.
670 let corpus = three_cluster_corpus();
671 let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
672 let first = lap.project(&corpus[0]);
673 let again = lap.project(&corpus[0]);
674 assert!((first.theta - again.theta).abs() < 1e-12);
675 assert!((first.phi - again.phi).abs() < 1e-12);
676 }
677
678 #[test]
679 fn new_point_in_known_cluster_routes_to_cluster_region() {
680 let corpus = three_cluster_corpus();
681 let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
682 // Synthesize a fresh "cluster-A" point and check it lands closer to
683 // an existing cluster-A member than to a cluster-C member.
684 let query = emb(&[1.0, 0.8, 0.7, 0.02, -0.02, 0.02, -0.02, 0.02]);
685 let q = lap.project(&query);
686 let a_member = lap.project(&corpus[0]);
687 let c_member = lap.project(&corpus[10]);
688 let d_to_a = angular_distance(&q, &a_member);
689 let d_to_c = angular_distance(&q, &c_member);
690 assert!(
691 d_to_a < d_to_c,
692 "query→A {d_to_a} should be less than query→C {d_to_c}"
693 );
694 }
695
696 #[test]
697 fn dimensionality_matches_input() {
698 let corpus = three_cluster_corpus();
699 let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
700 assert_eq!(lap.dimensionality(), 8);
701 }
702
703 #[test]
704 fn fit_rejects_tiny_corpus() {
705 let corpus = vec![emb(&[1.0, 0.0]), emb(&[0.0, 1.0])];
706 assert!(matches!(
707 LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)),
708 Err(ProjectionError::TooFewEmbeddings {
709 got: 2,
710 required: 4
711 })
712 ));
713 }
714
715 #[test]
716 fn project_rich_certainty_in_range() {
717 let corpus = three_cluster_corpus();
718 let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
719 for e in &corpus {
720 let pp = lap.project_rich(e);
721 assert!(pp.certainty >= 0.0 && pp.certainty <= 1.0);
722 }
723 }
724
725 #[test]
726 fn disconnected_query_gracefully_placed() {
727 // A query with no active axes (all below threshold) has no similarity
728 // to the corpus. It should still produce a valid SphericalPoint
729 // rather than NaN.
730 let corpus = three_cluster_corpus();
731 let lap = LaplacianEigenmapProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
732 let query = emb(&[0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001]);
733 let sp = lap.project(&query);
734 assert!(sp.r.is_finite());
735 assert!(sp.theta.is_finite());
736 assert!(sp.phi.is_finite());
737 }
738}