sphereql_embed/
kernel_pca.rs

1use sphereql_core::{CartesianPoint, SphericalPoint, cartesian_to_spherical};
2
3use crate::projection::{
4    ProjectionError, SplitMix64, dot, normalize_vec, project_xyz_to_spherical,
5};
6use crate::types::{Embedding, ProjectedPoint, RadialStrategy};
7
8use crate::projection::Projection;
9
10/// Corpus-fitted projection via kernel PCA with a Gaussian (RBF) kernel.
11///
12/// # Mathematical background
13///
14/// Standard PCA finds the 3 directions of maximum *linear* variance.
15/// Kernel PCA first maps data into an infinite-dimensional feature space
16/// **F** via the kernel trick, then performs PCA there. With the Gaussian
17/// kernel k(**x**, **y**) = exp(−‖**x**−**y**‖²/(2σ²)), every data point
18/// Φ(**x**) lies on a hypersphere **S** in **F** (since k(**x**,**x**) = 1
19/// for all **x**). This is a natural fit for SphereQL's spherical geometry.
20///
21/// The key advantage over linear PCA: kernel PCA captures non-linear
22/// manifold structure (curved clusters, rings, spirals) that linear PCA
23/// crushes flat. For embedding spaces with complex semantic geometry,
24/// this preserves more meaningful neighborhood relationships.
25///
26/// # Limit behaviour
27///
28/// * **σ → ∞**: kernel PCA converges to standard PCA (Hoffmann, Appendix A).
29/// * **σ → 0**: all points become orthogonal in **F**; PCA is meaningless.
30///
31/// # Complexity
32///
33/// * **Fitting**: O(n²·d) to build the kernel matrix + O(n²·q·iters)
34///   for power iteration on the n×n centered kernel matrix.
35/// * **Projection**: O(n·d) per embedding (n kernel evaluations).
36/// * **Memory**: O(n·d) for training data + O(n) per eigenvector.
37///
38/// # References
39///
40/// * Schölkopf, Smola, Müller. "Nonlinear component analysis as a kernel
41///   eigenvalue problem." *Neural Computation* 10 (1998) 1299–1319.
42/// * Hoffmann. "Kernel PCA for novelty detection." *Pattern Recognition*
43///   40 (2007) 863–874.
44#[derive(Clone)]
45pub struct KernelPcaProjection {
46    /// Normalized training data — needed at query time for kernel evaluations.
47    /// Each inner Vec has length `dim`.
48    training_data: Vec<Vec<f64>>,
49
50    /// Gaussian kernel width.
51    sigma: f64,
52
53    /// Precomputed 1/(2σ²) to avoid repeated division.
54    inv_two_sigma_sq: f64,
55
56    /// Top-3 eigenvector coefficients in ℝⁿ, scaled so ‖α^l‖² = 1/λ_l.
57    /// Each Vec has length n = training_data.len().
58    alphas: [Vec<f64>; 3],
59
60    /// Top-3 eigenvalues of the centered kernel matrix (descending).
61    eigenvalues: [f64; 3],
62
63    /// Column means of the **raw** kernel matrix K.
64    /// col_means\[i\] = (1/n) Σⱼ K_{ij}.
65    col_means: Vec<f64>,
66
67    /// Grand mean of the **raw** kernel matrix K.
68    /// (1/n²) Σ_{i,j} K_{ij}.
69    grand_mean: f64,
70
71    /// Total variance in centred feature space = trace(K̃)/n.
72    total_variance: f64,
73
74    /// Embedding dimensionality.
75    dim: usize,
76
77    /// Controls the radial coordinate.
78    radial: RadialStrategy,
79
80    /// When true, r comes from the projection magnitude rather than
81    /// the embedding magnitude.
82    volumetric: bool,
83}
84
85impl KernelPcaProjection {
86    /// Fit kernel PCA with automatic σ selection.
87    ///
88    /// σ is set to the median pairwise Euclidean distance on the normalised
89    /// embeddings divided by √2, so that the kernel value at the median
90    /// distance is exp(−1) ≈ 0.37. This is a standard heuristic in the
91    /// kernel methods literature.
92    pub fn fit(embeddings: &[Embedding], radial: RadialStrategy) -> Result<Self, ProjectionError> {
93        Self::fit_impl(embeddings, None, radial)
94    }
95
96    /// Fit kernel PCA with an explicit kernel width σ.
97    ///
98    /// Use this when you have domain knowledge about the appropriate scale,
99    /// or when benchmarking different σ values. Returns
100    /// [`ProjectionError::InvalidSigma`] if `sigma <= 0.0`.
101    pub fn fit_with_sigma(
102        embeddings: &[Embedding],
103        sigma: f64,
104        radial: RadialStrategy,
105    ) -> Result<Self, ProjectionError> {
106        if sigma <= 0.0 {
107            return Err(ProjectionError::InvalidSigma { got: sigma });
108        }
109        Self::fit_impl(embeddings, Some(sigma), radial)
110    }
111
112    /// Convenience: fit with default radial strategy and auto σ.
113    pub fn fit_default(embeddings: &[Embedding]) -> Result<Self, ProjectionError> {
114        Self::fit(embeddings, RadialStrategy::default())
115    }
116
117    /// Enable volumetric mode: r comes from the kernel PCA projection
118    /// magnitude instead of the embedding magnitude.
119    pub fn with_volumetric(mut self, enabled: bool) -> Self {
120        self.volumetric = enabled;
121        self
122    }
123
124    /// The Gaussian kernel width used for this projection.
125    pub fn sigma(&self) -> f64 {
126        self.sigma
127    }
128
129    /// Number of training points stored (needed for kernel evaluations).
130    pub fn num_training_points(&self) -> usize {
131        self.training_data.len()
132    }
133
134    /// The fraction of total feature-space variance captured by the top-3
135    /// kernel principal components.
136    ///
137    /// Analogous to `PcaProjection::explained_variance_ratio()` but in the
138    /// (infinite-dimensional) Gaussian feature space.
139    pub fn explained_variance_ratio(&self) -> f64 {
140        if self.total_variance < f64::EPSILON {
141            return 1.0;
142        }
143        let explained: f64 = self.eigenvalues.iter().sum();
144        // eigenvalues from power iteration on K̃ are the raw eigenvalues;
145        // total_variance = trace(K̃)/n, and eigenvalues relate to variance
146        // via λ_l/n (since K̃ = n·Cov in feature space). Divide both by n.
147        (explained / (self.total_variance * self.training_data.len() as f64)).clamp(0.0, 1.0)
148    }
149
150    /// The top-3 eigenvalues of the centred kernel matrix.
151    pub fn eigenvalues(&self) -> [f64; 3] {
152        self.eigenvalues
153    }
154
155    // ── Implementation ─────────────────────────────────────────────────
156
157    fn fit_impl(
158        embeddings: &[Embedding],
159        sigma: Option<f64>,
160        radial: RadialStrategy,
161    ) -> Result<Self, ProjectionError> {
162        if embeddings.is_empty() {
163            return Err(ProjectionError::EmptyCorpus);
164        }
165        let dim = embeddings[0].dimension();
166        if dim < 3 {
167            return Err(ProjectionError::DimensionTooLow {
168                got: dim,
169                required: 3,
170            });
171        }
172        for (i, e) in embeddings.iter().enumerate() {
173            if e.dimension() != dim {
174                return Err(ProjectionError::InconsistentDimension {
175                    index: i,
176                    expected: dim,
177                    got: e.dimension(),
178                });
179            }
180        }
181
182        // 1. Normalise to unit sphere (angular similarity only)
183        let normalized: Vec<Vec<f64>> = embeddings.iter().map(|e| e.normalized()).collect();
184        let n = normalized.len();
185
186        // 2. Choose σ
187        let sigma = sigma.unwrap_or_else(|| auto_sigma(&normalized));
188        let inv_two_sigma_sq = 0.5 / (sigma * sigma);
189
190        // 3. Build the n×n kernel matrix K
191        //    K_{ij} = exp(−‖x_i − x_j‖² / (2σ²))
192        //    For normalised data, ‖x_i − x_j‖² = 2 − 2(x_i · x_j)
193        //    so K_{ij} = exp(−(1 − x_i·x_j) / σ²)
194        let mut kernel_flat = vec![0.0_f64; n * n];
195        for i in 0..n {
196            kernel_flat[i * n + i] = 1.0; // k(x,x) = 1
197            for j in (i + 1)..n {
198                let val = gaussian_kernel(&normalized[i], &normalized[j], inv_two_sigma_sq);
199                kernel_flat[i * n + j] = val;
200                kernel_flat[j * n + i] = val;
201            }
202        }
203
204        // 4. Centering statistics on the raw kernel matrix
205        //    col_means[i] = (1/n) Σ_j K_{ij}
206        //    grand_mean   = (1/n²) Σ_{i,j} K_{ij}
207        let mut col_means = vec![0.0; n];
208        let mut grand_sum = 0.0;
209        for i in 0..n {
210            let mut row_sum = 0.0;
211            for j in 0..n {
212                row_sum += kernel_flat[i * n + j];
213            }
214            col_means[i] = row_sum / n as f64;
215            grand_sum += row_sum;
216        }
217        let grand_mean = grand_sum / (n * n) as f64;
218
219        // 5. Centre the kernel matrix (Schölkopf eq. 4):
220        //    K̃_{ij} = K_{ij} − col_means[i] − col_means[j] + grand_mean
221        //
222        //    We overwrite kernel_flat in place to save memory. K̃ is
223        //    symmetric in closed form, but `col_means[i] + col_means[j]
224        //    - grand_mean` accumulates FP rounding such that K̃[i,j] can
225        //    drift from K̃[j,i] by ~1e-15. Across 300 power-iteration
226        //    steps on n ≈ 1000 that drift compounds into eigenvector
227        //    error; symmetrize once here to eliminate the source.
228        for i in 0..n {
229            for j in i..n {
230                let v = kernel_flat[i * n + j] - col_means[i] - col_means[j] + grand_mean;
231                kernel_flat[i * n + j] = v;
232                kernel_flat[j * n + i] = v;
233            }
234        }
235
236        // 6. Total variance in feature space = trace(K̃) / n
237        let trace: f64 = (0..n).map(|i| kernel_flat[i * n + i]).sum();
238        let total_variance = trace / n as f64;
239
240        // 7. Top-3 eigenvectors via power iteration on K̃
241        let (raw_vectors, raw_values) = top_k_symmetric(&kernel_flat, n, 3);
242
243        // 8. Scale eigenvectors: ‖α^l‖² = 1/λ_l so that principal
244        //    components V^l in F have unit norm.
245        let mut alphas: [Vec<f64>; 3] = [Vec::new(), Vec::new(), Vec::new()];
246        let mut eigenvalues = [0.0_f64; 3];
247        for l in 0..3 {
248            let lambda = raw_values.get(l).copied().unwrap_or(0.0);
249            eigenvalues[l] = lambda;
250            if lambda > f64::EPSILON {
251                let scale = 1.0 / lambda.sqrt();
252                alphas[l] = raw_vectors[l].iter().map(|&a| a * scale).collect();
253            } else {
254                alphas[l] = vec![0.0; n];
255            }
256        }
257
258        Ok(Self {
259            training_data: normalized,
260            sigma,
261            inv_two_sigma_sq,
262            alphas,
263            eigenvalues,
264            col_means,
265            grand_mean,
266            total_variance,
267            dim,
268            radial,
269            volumetric: false,
270        })
271    }
272
273    /// Compute centred kernel vector and project onto top-3 components.
274    ///
275    /// Returns (f₁, f₂, f₃, spherical_potential).
276    fn kernel_project(&self, normalized: &[f64]) -> (f64, f64, f64, f64) {
277        let n = self.training_data.len();
278
279        // k_z[i] = k(z, x_i)
280        let k_z: Vec<f64> = self
281            .training_data
282            .iter()
283            .map(|x_i| gaussian_kernel(normalized, x_i, self.inv_two_sigma_sq))
284            .collect();
285
286        // mean(k_z)
287        let mean_k_z: f64 = k_z.iter().sum::<f64>() / n as f64;
288
289        // Centre: k̃_z[i] = k_z[i] − mean(k_z) − col_means[i] + grand_mean
290        let k_z_centered: Vec<f64> = (0..n)
291            .map(|i| k_z[i] - mean_k_z - self.col_means[i] + self.grand_mean)
292            .collect();
293
294        // Project: f_l = Σ_i α^l_i · k̃_z[i]
295        let f1 = dot(&self.alphas[0], &k_z_centered);
296        let f2 = dot(&self.alphas[1], &k_z_centered);
297        let f3 = dot(&self.alphas[2], &k_z_centered);
298
299        // Spherical potential (Hoffmann eq. 6):
300        //   p_S(z) = k(z,z) − (2/n)Σ k(z,x_i) + (1/n²)ΣΣ k(x_i,x_j)
301        //          = 1 − 2·mean_k_z + grand_mean
302        //
303        // This is ‖Φ̃(z)‖² — the total "energy" of the centred point.
304        let spherical_potential = (1.0 - 2.0 * mean_k_z + self.grand_mean).max(0.0);
305
306        (f1, f2, f3, spherical_potential)
307    }
308}
309
310impl Projection for KernelPcaProjection {
311    fn project(&self, embedding: &Embedding) -> SphericalPoint {
312        assert_eq!(
313            embedding.dimension(),
314            self.dim,
315            "expected dimension {}, got {}",
316            self.dim,
317            embedding.dimension()
318        );
319
320        let normalized = embedding.normalized();
321        let (x, y, z, _) = self.kernel_project(&normalized);
322
323        if self.volumetric {
324            let sp = cartesian_to_spherical(&CartesianPoint::new(x, y, z));
325            if sp.r < f64::EPSILON {
326                return SphericalPoint::new_unchecked(0.0, 0.0, 0.0);
327            }
328            SphericalPoint::new_unchecked(sp.r, sp.theta, sp.phi)
329        } else {
330            let r = self.radial.compute(embedding.magnitude());
331            project_xyz_to_spherical(x, y, z, r)
332        }
333    }
334
335    fn project_rich(&self, embedding: &Embedding) -> ProjectedPoint {
336        assert_eq!(
337            embedding.dimension(),
338            self.dim,
339            "expected dimension {}, got {}",
340            self.dim,
341            embedding.dimension()
342        );
343
344        let intensity = embedding.magnitude();
345        let normalized = embedding.normalized();
346        let (x, y, z, spherical_potential) = self.kernel_project(&normalized);
347
348        let projection_magnitude = (x * x + y * y + z * z).sqrt();
349
350        // Per-point certainty from reconstruction error (Hoffmann eq. 11):
351        //   p(z)  = p_S(z) − Σ f_l(z)²
352        //   certainty = 1 − p(z)/p_S(z) = (Σ f_l²) / p_S(z)
353        let projection_sq = x * x + y * y + z * z;
354        let certainty = if spherical_potential > f64::EPSILON {
355            (projection_sq / spherical_potential).clamp(0.0, 1.0)
356        } else {
357            0.0
358        };
359
360        let position = if self.volumetric {
361            let sp = cartesian_to_spherical(&CartesianPoint::new(x, y, z));
362            if sp.r < f64::EPSILON {
363                SphericalPoint::new_unchecked(0.0, 0.0, 0.0)
364            } else {
365                SphericalPoint::new_unchecked(sp.r, sp.theta, sp.phi)
366            }
367        } else {
368            let r = self.radial.compute(intensity);
369            project_xyz_to_spherical(x, y, z, r)
370        };
371
372        ProjectedPoint::new(position, certainty, intensity, projection_magnitude)
373    }
374
375    fn dimensionality(&self) -> usize {
376        self.dim
377    }
378}
379
380// ── Kernel function ────────────────────────────────────────────────────
381
382/// Gaussian (RBF) kernel: k(**x**, **y**) = exp(−‖**x**−**y**‖² · inv_two_sigma_sq)
383#[inline]
384fn gaussian_kernel(a: &[f64], b: &[f64], inv_two_sigma_sq: f64) -> f64 {
385    let sq_dist: f64 = a
386        .iter()
387        .zip(b.iter())
388        .map(|(&x, &y)| {
389            let d = x - y;
390            d * d
391        })
392        .sum();
393    (-sq_dist * inv_two_sigma_sq).exp()
394}
395
396// ── Automatic σ selection ──────────────────────────────────────────────
397
398/// Median-heuristic σ: σ = median(pairwise distances) / √2.
399///
400/// At the median distance, k = exp(−median²/(2σ²)) = exp(−1) ≈ 0.37.
401/// For small datasets (n ≤ 100) all pairs are used; otherwise 5 000
402/// random pairs are sampled.
403fn auto_sigma(data: &[Vec<f64>]) -> f64 {
404    let n = data.len();
405    if n < 2 {
406        return 1.0;
407    }
408
409    let mut distances = Vec::new();
410    let mut rng = SplitMix64::new(0xCAFE_BABE);
411
412    if n <= 100 {
413        distances.reserve(n * (n - 1) / 2);
414        for i in 0..n {
415            for j in (i + 1)..n {
416                distances.push(euclidean_dist(&data[i], &data[j]));
417            }
418        }
419    } else {
420        let num_pairs = 5000.min(n * (n - 1) / 2);
421        distances.reserve(num_pairs);
422        for _ in 0..num_pairs {
423            let i = (rng.next_u64() as usize) % n;
424            let mut j = (rng.next_u64() as usize) % n;
425            if j == i {
426                j = (j + 1) % n;
427            }
428            distances.push(euclidean_dist(&data[i], &data[j]));
429        }
430    }
431
432    // Median via `select_nth_unstable_by` is O(n) vs the O(n log n)
433    // full sort — for the 5000-pair sampled branch above that's the
434    // difference between ~65k and ~5k comparisons. `total_cmp` is
435    // NaN-safe (previously `.partial_cmp().unwrap()` panicked on
436    // degenerate zero-distance input).
437    let mid = distances.len() / 2;
438    let (_, median_ref, _) = distances.select_nth_unstable_by(mid, |a, b| a.total_cmp(b));
439    let median = *median_ref;
440
441    (median * std::f64::consts::FRAC_1_SQRT_2).max(1e-8)
442}
443
444fn euclidean_dist(a: &[f64], b: &[f64]) -> f64 {
445    a.iter()
446        .zip(b.iter())
447        .map(|(&x, &y)| {
448            let d = x - y;
449            d * d
450        })
451        .sum::<f64>()
452        .sqrt()
453}
454
455// ── Eigensolver for symmetric n×n matrix ───────────────────────────────
456
457/// Power iteration with deflation for the top-k eigenvectors of a
458/// symmetric n×n matrix stored as a flat row-major array.
459///
460/// Returns (eigenvectors, eigenvalues) sorted by decreasing eigenvalue.
461fn top_k_symmetric(matrix: &[f64], n: usize, k: usize) -> (Vec<Vec<f64>>, Vec<f64>) {
462    let max_iters = 300;
463    let tol = 1e-10;
464    let mut vectors: Vec<Vec<f64>> = Vec::with_capacity(k);
465    let mut values: Vec<f64> = Vec::with_capacity(k);
466    let mut rng = SplitMix64::new(0xBEEF_CAFE);
467
468    // Closure: write `matrix · src` into `dst`. Extracted because the
469    // inner loop needs this twice — once for the iterate and once for
470    // the Rayleigh quotient — and we cache the second call's result
471    // into the next iteration's iterate.
472    let matvec = |dst: &mut [f64], src: &[f64]| {
473        for (i, dst_i) in dst.iter_mut().enumerate() {
474            let row_start = i * n;
475            let mut s = 0.0;
476            for j in 0..n {
477                s += matrix[row_start + j] * src[j];
478            }
479            *dst_i = s;
480        }
481    };
482
483    for _ in 0..k {
484        let mut v: Vec<f64> = (0..n).map(|_| rng.normal()).collect();
485        normalize_vec(&mut v);
486        let mut eigenvalue = 0.0;
487
488        // Cached `matrix · v` from the previous iteration's Rayleigh
489        // step. `None` on first pass; populated afterward. Saves one
490        // full mat-vec per iteration.
491        //
492        // Note: a `.skip(1)` bug in earlier versions left `u[0] = 0`;
493        // see the `first_coordinate_is_nonzero` regression test.
494        let mut mv_cache: Option<Vec<f64>> = None;
495
496        for _ in 0..max_iters {
497            let mut u = mv_cache.take().unwrap_or_else(|| {
498                let mut u = vec![0.0; n];
499                matvec(&mut u, &v);
500                u
501            });
502
503            for prev in &vectors {
504                let proj = dot(&u, prev);
505                for (ui, &pi) in u.iter_mut().zip(prev.iter()) {
506                    *ui -= proj * pi;
507                }
508            }
509
510            let mag = normalize_vec(&mut u);
511            if mag < f64::EPSILON {
512                break;
513            }
514
515            // Rayleigh quotient: λ ≈ uᵀ · (matrix · u). Save the
516            // `matrix · u` vector — next iteration starts with
517            // `u_next = matrix · v_next = matrix · u_current`.
518            let mut mv_next = vec![0.0; n];
519            matvec(&mut mv_next, &u);
520            eigenvalue = u.iter().zip(mv_next.iter()).map(|(a, b)| a * b).sum();
521
522            // Convergence: |⟨u, v⟩| → 1 as u aligns with v.
523            // `.max(0.0)` clamps the FP noise that briefly pushes the
524            // value below zero near convergence.
525            let change = (1.0 - dot(&u, &v).abs()).max(0.0);
526            v = u;
527            mv_cache = Some(mv_next);
528
529            if change < tol {
530                break;
531            }
532        }
533
534        vectors.push(v);
535        values.push(eigenvalue);
536    }
537
538    while vectors.len() < k {
539        let mut v: Vec<f64> = (0..n).map(|_| rng.normal()).collect();
540        for prev in &vectors {
541            let proj = dot(&v, prev);
542            for (vi, &pi) in v.iter_mut().zip(prev.iter()) {
543                *vi -= proj * pi;
544            }
545        }
546        normalize_vec(&mut v);
547        vectors.push(v);
548        values.push(0.0);
549    }
550
551    (vectors, values)
552}
553
554#[cfg(test)]
555mod tests {
556    use super::*;
557    use sphereql_core::angular_distance;
558    use std::f64::consts::{PI, TAU};
559
560    fn emb(vals: &[f64]) -> Embedding {
561        Embedding::new(vals.to_vec())
562    }
563
564    fn corpus_10d() -> Vec<Embedding> {
565        vec![
566            emb(&[1.0, 0.0, 0.0, 0.1, 0.05, -0.02, 0.03, -0.01, 0.04, 0.02]),
567            emb(&[0.0, 1.0, 0.0, -0.05, 0.1, 0.03, -0.02, 0.01, -0.03, 0.04]),
568            emb(&[0.0, 0.0, 1.0, 0.02, -0.03, 0.1, 0.05, 0.02, -0.01, -0.04]),
569            emb(&[1.0, 1.0, 0.0, 0.05, 0.08, 0.01, 0.01, -0.02, 0.02, 0.03]),
570            emb(&[0.0, 1.0, 1.0, -0.02, 0.07, 0.07, 0.01, 0.02, -0.02, 0.01]),
571            emb(&[1.0, 0.0, 1.0, 0.06, 0.01, 0.05, -0.03, -0.01, 0.03, -0.02]),
572            emb(&[-1.0, 0.0, 0.0, -0.08, 0.02, 0.01, 0.02, 0.03, -0.02, 0.01]),
573            emb(&[0.0, -1.0, 0.0, 0.03, -0.09, -0.02, 0.01, -0.01, 0.02, -0.03]),
574        ]
575    }
576
577    fn assert_valid_spherical(sp: &SphericalPoint) {
578        assert!(sp.r >= 0.0, "r must be >= 0, got {}", sp.r);
579        assert!(
580            sp.theta >= 0.0 && sp.theta < TAU,
581            "theta must be in [0, 2π), got {}",
582            sp.theta
583        );
584        assert!(
585            sp.phi >= 0.0 && sp.phi <= PI,
586            "phi must be in [0, π], got {}",
587            sp.phi
588        );
589    }
590
591    /// Regression test for the `.skip(1)` bug in `top_k_symmetric`:
592    /// the mat-vec loop used to start at row 1, leaving `u[0] = 0.0`
593    /// on every power-iteration step. That silently zeroed the first
594    /// coordinate of every kernel PCA eigenvector. This test fails
595    /// loudly if the skip comes back.
596    #[test]
597    fn top_k_symmetric_first_coordinate_nonzero() {
598        // Symmetric, dominant-diagonal 4×4 matrix whose top eigenvector
599        // is approximately the all-ones vector (each coordinate ≈ 0.5).
600        let n = 4;
601        #[rustfmt::skip]
602        let matrix = vec![
603            4.0, 1.0, 0.5, 0.2,
604            1.0, 4.0, 1.0, 0.5,
605            0.5, 1.0, 4.0, 1.0,
606            0.2, 0.5, 1.0, 4.0,
607        ];
608        let (vectors, values) = top_k_symmetric(&matrix, n, 2);
609        assert_eq!(vectors.len(), 2);
610        for (idx, v) in vectors.iter().enumerate() {
611            assert!(
612                v[0].abs() > 1e-6,
613                "eigenvector {idx} has a suspiciously small first \
614                 coordinate ({:.2e}) — the .skip(1) bug is back",
615                v[0]
616            );
617        }
618        assert!(values[0] > values[1], "eigenvalues must be sorted");
619    }
620
621    #[test]
622    fn kernel_pca_fit_auto_sigma() {
623        let corpus = corpus_10d();
624        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
625        assert_eq!(kpca.dimensionality(), 10);
626        assert!(kpca.sigma() > 0.0);
627        assert_eq!(kpca.num_training_points(), 8);
628    }
629
630    #[test]
631    fn kernel_pca_fit_explicit_sigma() {
632        let corpus = corpus_10d();
633        let kpca =
634            KernelPcaProjection::fit_with_sigma(&corpus, 0.5, RadialStrategy::Fixed(1.0)).unwrap();
635        assert!((kpca.sigma() - 0.5).abs() < 1e-12);
636    }
637
638    #[test]
639    fn kernel_pca_fit_default() {
640        let corpus = corpus_10d();
641        let kpca = KernelPcaProjection::fit_default(&corpus).unwrap();
642        assert_eq!(kpca.dimensionality(), 10);
643    }
644
645    #[test]
646    fn kernel_pca_empty_corpus_returns_err() {
647        assert!(matches!(
648            KernelPcaProjection::fit(&[], RadialStrategy::Fixed(1.0)),
649            Err(ProjectionError::EmptyCorpus)
650        ));
651    }
652
653    #[test]
654    fn kernel_pca_too_few_dims_returns_err() {
655        assert!(matches!(
656            KernelPcaProjection::fit(&[emb(&[1.0, 2.0])], RadialStrategy::Fixed(1.0)),
657            Err(ProjectionError::DimensionTooLow {
658                got: 2,
659                required: 3
660            })
661        ));
662    }
663
664    #[test]
665    fn kernel_pca_zero_sigma_returns_err() {
666        let corpus = corpus_10d();
667        assert!(matches!(
668            KernelPcaProjection::fit_with_sigma(&corpus, 0.0, RadialStrategy::Fixed(1.0)),
669            Err(ProjectionError::InvalidSigma { got }) if got == 0.0
670        ));
671    }
672
673    #[test]
674    fn kernel_pca_produces_valid_spherical_points() {
675        let corpus = corpus_10d();
676        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
677        for e in &corpus {
678            assert_valid_spherical(&kpca.project(e));
679        }
680    }
681
682    #[test]
683    fn kernel_pca_out_of_sample_produces_valid_points() {
684        let corpus = corpus_10d();
685        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
686        let novel = emb(&[0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
687        assert_valid_spherical(&kpca.project(&novel));
688    }
689
690    #[test]
691    fn kernel_pca_preserves_angular_ordering() {
692        let corpus = corpus_10d();
693        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
694        let a = emb(&[1.0, 0.1, 0.0, 0.05, 0.02, -0.01, 0.01, 0.0, 0.02, 0.01]);
695        let b = emb(&[0.9, 0.2, 0.1, 0.04, 0.03, 0.0, 0.02, -0.01, 0.01, 0.02]);
696        let c = emb(&[-1.0, -0.1, 0.0, -0.04, 0.01, 0.02, 0.01, 0.02, -0.01, 0.01]);
697        let pa = kpca.project(&a);
698        let pb = kpca.project(&b);
699        let pc = kpca.project(&c);
700        let d_ab = angular_distance(&pa, &pb);
701        let d_ac = angular_distance(&pa, &pc);
702        assert!(
703            d_ab < d_ac,
704            "similar items should be closer: d(a,b)={d_ab:.4} < d(a,c)={d_ac:.4}"
705        );
706    }
707
708    #[test]
709    fn kernel_pca_fixed_radial() {
710        let corpus = corpus_10d();
711        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(2.5)).unwrap();
712        let sp = kpca.project(&corpus[0]);
713        assert!((sp.r - 2.5).abs() < 1e-12);
714    }
715
716    #[test]
717    fn kernel_pca_magnitude_radial() {
718        let corpus = corpus_10d();
719        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Magnitude).unwrap();
720        let short = emb(&[0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
721        let long = emb(&[10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
722        let ps = kpca.project(&short);
723        let pl = kpca.project(&long);
724        assert!(ps.r < pl.r, "longer vector should have larger radius");
725        assert!((ps.r - 0.1).abs() < 1e-10);
726        assert!((pl.r - 10.0).abs() < 1e-10);
727    }
728
729    #[test]
730    fn kernel_pca_volumetric() {
731        let corpus = corpus_10d();
732        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0))
733            .unwrap()
734            .with_volumetric(true);
735        let sp = kpca.project(&corpus[0]);
736        assert!(sp.r >= 0.0);
737    }
738
739    #[test]
740    fn kernel_pca_project_rich_has_valid_certainty() {
741        let corpus = corpus_10d();
742        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
743        for e in &corpus {
744            let rich = kpca.project_rich(e);
745            assert!(
746                rich.certainty >= 0.0 && rich.certainty <= 1.0,
747                "certainty must be in [0,1], got {}",
748                rich.certainty
749            );
750            assert!(rich.intensity > 0.0);
751            assert!(rich.projection_magnitude >= 0.0);
752        }
753    }
754
755    #[test]
756    fn kernel_pca_certainty_is_meaningful() {
757        let corpus = corpus_10d();
758        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
759        let total_certainty: f64 = corpus.iter().map(|e| kpca.project_rich(e).certainty).sum();
760        let mean_certainty = total_certainty / corpus.len() as f64;
761        assert!(
762            mean_certainty > 0.01,
763            "mean certainty of training data should be non-trivial, got {mean_certainty}"
764        );
765    }
766
767    #[test]
768    fn kernel_pca_explained_variance_in_range() {
769        let corpus = corpus_10d();
770        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
771        let ratio = kpca.explained_variance_ratio();
772        assert!(
773            ratio > 0.0 && ratio <= 1.0,
774            "explained variance ratio should be in (0, 1], got {ratio}"
775        );
776    }
777
778    #[test]
779    fn kernel_pca_eigenvalues_descending() {
780        let corpus = corpus_10d();
781        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
782        let ev = kpca.eigenvalues();
783        assert!(ev[0] >= ev[1], "eigenvalues must descend: {ev:?}");
784        assert!(ev[1] >= ev[2], "eigenvalues must descend: {ev:?}");
785        assert!(ev[0] > 0.0, "first eigenvalue should be positive");
786    }
787
788    #[test]
789    fn gaussian_kernel_self_similarity() {
790        let x = vec![1.0, 0.0, 0.0, 0.0, 0.0];
791        let inv = 0.5;
792        assert!((gaussian_kernel(&x, &x, inv) - 1.0).abs() < 1e-12);
793    }
794
795    #[test]
796    fn gaussian_kernel_symmetry() {
797        let a = vec![1.0, 0.0, 0.5];
798        let b = vec![0.0, 1.0, -0.3];
799        let inv = 1.0;
800        assert!((gaussian_kernel(&a, &b, inv) - gaussian_kernel(&b, &a, inv)).abs() < 1e-15);
801    }
802
803    #[test]
804    fn gaussian_kernel_range() {
805        let a = vec![1.0, 0.0, 0.0];
806        let b = vec![0.0, 1.0, 0.0];
807        let inv = 0.5;
808        let k = gaussian_kernel(&a, &b, inv);
809        assert!(
810            k > 0.0 && k <= 1.0,
811            "Gaussian kernel must be in (0, 1], got {k}"
812        );
813    }
814
815    #[test]
816    fn auto_sigma_is_positive() {
817        let corpus = corpus_10d();
818        let normalized: Vec<Vec<f64>> = corpus.iter().map(|e| e.normalized()).collect();
819        let sigma = auto_sigma(&normalized);
820        assert!(sigma > 0.0);
821    }
822
823    #[test]
824    fn auto_sigma_single_point() {
825        let data = vec![vec![1.0, 0.0, 0.0]];
826        assert!((auto_sigma(&data) - 1.0).abs() < 1e-12);
827    }
828
829    #[test]
830    #[should_panic(expected = "expected dimension 10")]
831    fn kernel_pca_dimension_mismatch_panics() {
832        let corpus = corpus_10d();
833        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
834        let _ = kpca.project(&emb(&[1.0, 2.0, 3.0]));
835    }
836
837    #[test]
838    fn kernel_pca_clone_produces_identical_results() {
839        let corpus = corpus_10d();
840        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
841        let kpca2 = kpca.clone();
842        for e in &corpus {
843            let sp1 = kpca.project(e);
844            let sp2 = kpca2.project(e);
845            assert!((sp1.theta - sp2.theta).abs() < 1e-12);
846            assert!((sp1.phi - sp2.phi).abs() < 1e-12);
847        }
848    }
849
850    #[test]
851    fn kernel_pca_works_with_embedding_index() {
852        use crate::query::EmbeddingIndex;
853        let corpus = corpus_10d();
854        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
855        let mut idx = EmbeddingIndex::builder(kpca)
856            .theta_divisions(4)
857            .phi_divisions(3)
858            .build();
859        for (i, e) in corpus.iter().enumerate() {
860            idx.insert(format!("item-{i}"), e);
861        }
862        assert_eq!(idx.len(), corpus.len());
863        let query = emb(&[0.9, 0.1, 0.0, 0.05, 0.02, -0.01, 0.01, 0.0, 0.02, 0.01]);
864        let results = idx.search_nearest(&query, 3);
865        assert_eq!(results.len(), 3);
866        assert!(results[0].distance <= results[1].distance);
867    }
868
869    #[test]
870    fn large_sigma_approaches_pca_ordering() {
871        use crate::projection::PcaProjection;
872        let corpus = corpus_10d();
873        let pca = PcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
874        let kpca = KernelPcaProjection::fit_with_sigma(&corpus, 100.0, RadialStrategy::Fixed(1.0))
875            .unwrap();
876        let query = emb(&[1.0, 0.1, 0.0, 0.05, 0.02, -0.01, 0.01, 0.0, 0.02, 0.01]);
877        let pca_pt = pca.project(&query);
878        let kpca_pt = kpca.project(&query);
879        let mut pca_dists: Vec<(usize, f64)> = corpus
880            .iter()
881            .enumerate()
882            .map(|(i, e)| (i, angular_distance(&pca_pt, &pca.project(e))))
883            .collect();
884        let mut kpca_dists: Vec<(usize, f64)> = corpus
885            .iter()
886            .enumerate()
887            .map(|(i, e)| (i, angular_distance(&kpca_pt, &kpca.project(e))))
888            .collect();
889        pca_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
890        kpca_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
891        assert_eq!(
892            pca_dists[0].0, kpca_dists[0].0,
893            "nearest neighbour should match between PCA and kernel PCA with large σ"
894        );
895    }
896}
sphereql_embed/kernel_pca.rs

sphereql_embed/
kernel_pca.rs