sphereql_embed/
kernel_pca.rs

1use sphereql_core::{CartesianPoint, SphericalPoint, cartesian_to_spherical};
2
3use crate::projection::{
4    ProjectionError, SplitMix64, dot, normalize_vec, project_xyz_to_spherical,
5};
6use crate::types::{Embedding, ProjectedPoint, RadialStrategy};
7
8use crate::projection::Projection;
9
10/// Corpus-fitted projection via kernel PCA with a Gaussian (RBF) kernel.
11///
12/// # Mathematical background
13///
14/// Standard PCA finds the 3 directions of maximum *linear* variance.
15/// Kernel PCA first maps data into an infinite-dimensional feature space
16/// **F** via the kernel trick, then performs PCA there. With the Gaussian
17/// kernel k(**x**, **y**) = exp(−‖**x**−**y**‖²/(2σ²)), every data point
18/// Φ(**x**) lies on a hypersphere **S** in **F** (since k(**x**,**x**) = 1
19/// for all **x**). This is a natural fit for SphereQL's spherical geometry.
20///
21/// The key advantage over linear PCA: kernel PCA captures non-linear
22/// manifold structure (curved clusters, rings, spirals) that linear PCA
23/// crushes flat. For embedding spaces with complex semantic geometry,
24/// this preserves more meaningful neighborhood relationships.
25///
26/// # Limit behaviour
27///
28/// * **σ → ∞**: kernel PCA converges to standard PCA (Hoffmann, Appendix A).
29/// * **σ → 0**: all points become orthogonal in **F**; PCA is meaningless.
30///
31/// # Complexity
32///
33/// * **Fitting**: O(n²·d) to build the kernel matrix + O(n²·q·iters)
34///   for power iteration on the n×n centered kernel matrix.
35/// * **Projection**: O(n·d) per embedding (n kernel evaluations).
36/// * **Memory**: O(n·d) for training data + O(n) per eigenvector.
37///
38/// # References
39///
40/// * Schölkopf, Smola, Müller. "Nonlinear component analysis as a kernel
41///   eigenvalue problem." *Neural Computation* 10 (1998) 1299–1319.
42/// * Hoffmann. "Kernel PCA for novelty detection." *Pattern Recognition*
43///   40 (2007) 863–874.
44#[derive(Clone)]
45pub struct KernelPcaProjection {
46    /// Normalized training data — needed at query time for kernel evaluations.
47    /// Each inner Vec has length `dim`.
48    training_data: Vec<Vec<f64>>,
49
50    /// Gaussian kernel width.
51    sigma: f64,
52
53    /// Precomputed 1/(2σ²) to avoid repeated division.
54    inv_two_sigma_sq: f64,
55
56    /// Top-3 eigenvector coefficients in ℝⁿ, scaled so ‖α^l‖² = 1/λ_l.
57    /// Each Vec has length n = training_data.len().
58    alphas: [Vec<f64>; 3],
59
60    /// Top-3 eigenvalues of the centered kernel matrix (descending).
61    eigenvalues: [f64; 3],
62
63    /// Column means of the **raw** kernel matrix K.
64    /// col_means\[i\] = (1/n) Σⱼ K_{ij}.
65    col_means: Vec<f64>,
66
67    /// Grand mean of the **raw** kernel matrix K.
68    /// (1/n²) Σ_{i,j} K_{ij}.
69    grand_mean: f64,
70
71    /// Total variance in centred feature space = trace(K̃)/n.
72    total_variance: f64,
73
74    /// Embedding dimensionality.
75    dim: usize,
76
77    /// Controls the radial coordinate.
78    radial: RadialStrategy,
79
80    /// When true, r comes from the projection magnitude rather than
81    /// the embedding magnitude.
82    volumetric: bool,
83}
84
85/// At or above this corpus size, [`KernelPcaProjection::fit`] emits a
86/// stderr warning. Power iteration on the n×n centered kernel matrix
87/// becomes an interactive-development hazard around this scale.
88const KPCA_FIT_WARN_THRESHOLD: usize = 2_000;
89
90impl KernelPcaProjection {
91    /// Fit kernel PCA with automatic σ selection.
92    ///
93    /// σ is set to the median pairwise Euclidean distance on the normalised
94    /// embeddings divided by √2, so that the kernel value at the median
95    /// distance is exp(−1) ≈ 0.37. This is a standard heuristic in the
96    /// kernel methods literature.
97    pub fn fit(embeddings: &[Embedding], radial: RadialStrategy) -> Result<Self, ProjectionError> {
98        Self::fit_impl(embeddings, None, radial)
99    }
100
101    /// Fit kernel PCA with an explicit kernel width σ.
102    ///
103    /// Use this when you have domain knowledge about the appropriate scale,
104    /// or when benchmarking different σ values. Returns
105    /// [`ProjectionError::InvalidSigma`] if `sigma <= 0.0`.
106    pub fn fit_with_sigma(
107        embeddings: &[Embedding],
108        sigma: f64,
109        radial: RadialStrategy,
110    ) -> Result<Self, ProjectionError> {
111        if sigma <= 0.0 {
112            return Err(ProjectionError::InvalidSigma { got: sigma });
113        }
114        Self::fit_impl(embeddings, Some(sigma), radial)
115    }
116
117    /// Convenience: fit with default radial strategy and auto σ.
118    pub fn fit_default(embeddings: &[Embedding]) -> Result<Self, ProjectionError> {
119        Self::fit(embeddings, RadialStrategy::default())
120    }
121
122    /// Enable volumetric mode: r comes from the kernel PCA projection
123    /// magnitude instead of the embedding magnitude.
124    pub fn with_volumetric(mut self, enabled: bool) -> Self {
125        self.volumetric = enabled;
126        self
127    }
128
129    /// The Gaussian kernel width used for this projection.
130    pub fn sigma(&self) -> f64 {
131        self.sigma
132    }
133
134    /// Number of training points stored (needed for kernel evaluations).
135    pub fn num_training_points(&self) -> usize {
136        self.training_data.len()
137    }
138
139    /// The fraction of total feature-space variance captured by the top-3
140    /// kernel principal components.
141    ///
142    /// Analogous to `PcaProjection::explained_variance_ratio()` but in the
143    /// (infinite-dimensional) Gaussian feature space.
144    pub fn explained_variance_ratio(&self) -> f64 {
145        if self.total_variance < f64::EPSILON {
146            return 1.0;
147        }
148        let explained: f64 = self.eigenvalues.iter().sum();
149        // eigenvalues from power iteration on K̃ are the raw eigenvalues;
150        // total_variance = trace(K̃)/n, and eigenvalues relate to variance
151        // via λ_l/n (since K̃ = n·Cov in feature space). Divide both by n.
152        (explained / (self.total_variance * self.training_data.len() as f64)).clamp(0.0, 1.0)
153    }
154
155    /// The top-3 eigenvalues of the centred kernel matrix.
156    pub fn eigenvalues(&self) -> [f64; 3] {
157        self.eigenvalues
158    }
159
160    // ── Implementation ─────────────────────────────────────────────────
161
162    fn fit_impl(
163        embeddings: &[Embedding],
164        sigma: Option<f64>,
165        radial: RadialStrategy,
166    ) -> Result<Self, ProjectionError> {
167        if embeddings.is_empty() {
168            return Err(ProjectionError::EmptyCorpus);
169        }
170        let dim = embeddings[0].dimension();
171        if dim < 3 {
172            return Err(ProjectionError::DimensionTooLow {
173                got: dim,
174                required: 3,
175            });
176        }
177        // O(n²·d) kernel matrix + O(n²·q·iters) power iteration. At
178        // n ≈ 10k this approaches an hour; warn callers who didn't
179        // see the doc comment so they can switch to PCA or downsample.
180        let n = embeddings.len();
181        if n >= KPCA_FIT_WARN_THRESHOLD {
182            tracing::warn!(
183                n,
184                dim,
185                "KernelPcaProjection::fit: O(n²·d) cost, expect minutes-to-hours. \
186                 Consider PCA or downsampling."
187            );
188        }
189        for (i, e) in embeddings.iter().enumerate() {
190            if e.dimension() != dim {
191                return Err(ProjectionError::InconsistentDimension {
192                    index: i,
193                    expected: dim,
194                    got: e.dimension(),
195                });
196            }
197        }
198
199        // 1. Normalise to unit sphere (angular similarity only)
200        let normalized: Vec<Vec<f64>> = embeddings.iter().map(|e| e.normalized()).collect();
201        let n = normalized.len();
202
203        // 2. Choose σ
204        let sigma = sigma.unwrap_or_else(|| auto_sigma(&normalized));
205        let inv_two_sigma_sq = 0.5 / (sigma * sigma);
206
207        // 3. Build the n×n kernel matrix K
208        //    K_{ij} = exp(−‖x_i − x_j‖² / (2σ²))
209        //    For normalised data, ‖x_i − x_j‖² = 2 − 2(x_i · x_j)
210        //    so K_{ij} = exp(−(1 − x_i·x_j) / σ²)
211        let mut kernel_flat = vec![0.0_f64; n * n];
212        for i in 0..n {
213            kernel_flat[i * n + i] = 1.0; // k(x,x) = 1
214            for j in (i + 1)..n {
215                let val = gaussian_kernel(&normalized[i], &normalized[j], inv_two_sigma_sq);
216                kernel_flat[i * n + j] = val;
217                kernel_flat[j * n + i] = val;
218            }
219        }
220
221        // 4. Centering statistics on the raw kernel matrix
222        //    col_means[i] = (1/n) Σ_j K_{ij}
223        //    grand_mean   = (1/n²) Σ_{i,j} K_{ij}
224        let mut col_means = vec![0.0; n];
225        let mut grand_sum = 0.0;
226        for i in 0..n {
227            let mut row_sum = 0.0;
228            for j in 0..n {
229                row_sum += kernel_flat[i * n + j];
230            }
231            col_means[i] = row_sum / n as f64;
232            grand_sum += row_sum;
233        }
234        let grand_mean = grand_sum / (n * n) as f64;
235
236        // 5. Centre the kernel matrix (Schölkopf eq. 4):
237        //    K̃_{ij} = K_{ij} − col_means[i] − col_means[j] + grand_mean
238        //
239        //    We overwrite kernel_flat in place to save memory. K̃ is
240        //    symmetric in closed form, but `col_means[i] + col_means[j]
241        //    - grand_mean` accumulates FP rounding such that K̃[i,j] can
242        //    drift from K̃[j,i] by ~1e-15. Across 300 power-iteration
243        //    steps on n ≈ 1000 that drift compounds into eigenvector
244        //    error; symmetrize once here to eliminate the source.
245        for i in 0..n {
246            for j in i..n {
247                let v = kernel_flat[i * n + j] - col_means[i] - col_means[j] + grand_mean;
248                kernel_flat[i * n + j] = v;
249                kernel_flat[j * n + i] = v;
250            }
251        }
252
253        // 6. Total variance in feature space = trace(K̃) / n
254        let trace: f64 = (0..n).map(|i| kernel_flat[i * n + i]).sum();
255        let total_variance = trace / n as f64;
256
257        // 7. Top-3 eigenvectors via power iteration on K̃
258        let (raw_vectors, raw_values) = top_k_symmetric(&kernel_flat, n, 3);
259
260        // 8. Scale eigenvectors: ‖α^l‖² = 1/λ_l so that principal
261        //    components V^l in F have unit norm.
262        let mut alphas: [Vec<f64>; 3] = [Vec::new(), Vec::new(), Vec::new()];
263        let mut eigenvalues = [0.0_f64; 3];
264        for l in 0..3 {
265            let lambda = raw_values.get(l).copied().unwrap_or(0.0);
266            eigenvalues[l] = lambda;
267            if lambda > f64::EPSILON {
268                let scale = 1.0 / lambda.sqrt();
269                alphas[l] = raw_vectors[l].iter().map(|&a| a * scale).collect();
270            } else {
271                alphas[l] = vec![0.0; n];
272            }
273        }
274
275        Ok(Self {
276            training_data: normalized,
277            sigma,
278            inv_two_sigma_sq,
279            alphas,
280            eigenvalues,
281            col_means,
282            grand_mean,
283            total_variance,
284            dim,
285            radial,
286            volumetric: false,
287        })
288    }
289
290    /// Compute centred kernel vector and project onto top-3 components.
291    ///
292    /// Returns (f₁, f₂, f₃, spherical_potential).
293    fn kernel_project(&self, normalized: &[f64]) -> (f64, f64, f64, f64) {
294        let n = self.training_data.len();
295
296        // k_z[i] = k(z, x_i)
297        let k_z: Vec<f64> = self
298            .training_data
299            .iter()
300            .map(|x_i| gaussian_kernel(normalized, x_i, self.inv_two_sigma_sq))
301            .collect();
302
303        // mean(k_z)
304        let mean_k_z: f64 = k_z.iter().sum::<f64>() / n as f64;
305
306        // Centre: k̃_z[i] = k_z[i] − mean(k_z) − col_means[i] + grand_mean
307        let k_z_centered: Vec<f64> = (0..n)
308            .map(|i| k_z[i] - mean_k_z - self.col_means[i] + self.grand_mean)
309            .collect();
310
311        // Project: f_l = Σ_i α^l_i · k̃_z[i]
312        let f1 = dot(&self.alphas[0], &k_z_centered);
313        let f2 = dot(&self.alphas[1], &k_z_centered);
314        let f3 = dot(&self.alphas[2], &k_z_centered);
315
316        // Spherical potential (Hoffmann eq. 6):
317        //   p_S(z) = k(z,z) − (2/n)Σ k(z,x_i) + (1/n²)ΣΣ k(x_i,x_j)
318        //          = 1 − 2·mean_k_z + grand_mean
319        //
320        // This is ‖Φ̃(z)‖² — the total "energy" of the centred point.
321        let spherical_potential = (1.0 - 2.0 * mean_k_z + self.grand_mean).max(0.0);
322
323        (f1, f2, f3, spherical_potential)
324    }
325}
326
327impl Projection for KernelPcaProjection {
328    fn project(&self, embedding: &Embedding) -> SphericalPoint {
329        // Caller contract: dimension must match the fitted projection.
330        assert_eq!(
331            embedding.dimension(),
332            self.dim,
333            "expected dimension {}, got {}",
334            self.dim,
335            embedding.dimension()
336        );
337
338        let normalized = embedding.normalized();
339        let (x, y, z, spherical_potential) = self.kernel_project(&normalized);
340
341        if self.volumetric {
342            let sp = cartesian_to_spherical(&CartesianPoint::new(x, y, z));
343            if sp.r < f64::EPSILON {
344                return SphericalPoint::origin();
345            }
346            SphericalPoint::new_unchecked(sp.r, sp.theta, sp.phi)
347        } else {
348            let projection_sq = x * x + y * y + z * z;
349            let projection_magnitude = projection_sq.sqrt();
350            let certainty = if spherical_potential > f64::EPSILON {
351                (projection_sq / spherical_potential).clamp(0.0, 1.0)
352            } else {
353                0.0
354            };
355            let r = self.radial.compute_rich(&crate::types::RadialContext::full(
356                embedding.magnitude(),
357                projection_magnitude,
358                certainty,
359            ));
360            project_xyz_to_spherical(x, y, z, r)
361        }
362    }
363
364    fn project_rich(&self, embedding: &Embedding) -> ProjectedPoint {
365        // Caller contract: dimension must match the fitted projection.
366        assert_eq!(
367            embedding.dimension(),
368            self.dim,
369            "expected dimension {}, got {}",
370            self.dim,
371            embedding.dimension()
372        );
373
374        let intensity = embedding.magnitude();
375        let normalized = embedding.normalized();
376        let (x, y, z, spherical_potential) = self.kernel_project(&normalized);
377
378        let projection_magnitude = (x * x + y * y + z * z).sqrt();
379
380        // Per-point certainty from reconstruction error (Hoffmann eq. 11):
381        //   p(z)  = p_S(z) − Σ f_l(z)²
382        //   certainty = 1 − p(z)/p_S(z) = (Σ f_l²) / p_S(z)
383        let projection_sq = x * x + y * y + z * z;
384        let certainty = if spherical_potential > f64::EPSILON {
385            (projection_sq / spherical_potential).clamp(0.0, 1.0)
386        } else {
387            0.0
388        };
389
390        let position = if self.volumetric {
391            let sp = cartesian_to_spherical(&CartesianPoint::new(x, y, z));
392            if sp.r < f64::EPSILON {
393                SphericalPoint::origin()
394            } else {
395                SphericalPoint::new_unchecked(sp.r, sp.theta, sp.phi)
396            }
397        } else {
398            let r = self.radial.compute_rich(&crate::types::RadialContext::full(
399                intensity,
400                projection_magnitude,
401                certainty,
402            ));
403            project_xyz_to_spherical(x, y, z, r)
404        };
405
406        ProjectedPoint::new(position, certainty, intensity, projection_magnitude)
407    }
408
409    fn dimensionality(&self) -> usize {
410        self.dim
411    }
412}
413
414// ── Kernel function ────────────────────────────────────────────────────
415
416/// Gaussian (RBF) kernel: k(**x**, **y**) = exp(−‖**x**−**y**‖² · inv_two_sigma_sq)
417#[inline]
418fn gaussian_kernel(a: &[f64], b: &[f64], inv_two_sigma_sq: f64) -> f64 {
419    let sq_dist: f64 = a
420        .iter()
421        .zip(b.iter())
422        .map(|(&x, &y)| {
423            let d = x - y;
424            d * d
425        })
426        .sum();
427    (-sq_dist * inv_two_sigma_sq).exp()
428}
429
430// ── Automatic σ selection ──────────────────────────────────────────────
431
432/// Median-heuristic σ: σ = median(pairwise distances) / √2.
433///
434/// At the median distance, k = exp(−median²/(2σ²)) = exp(−1) ≈ 0.37.
435/// For small datasets (n ≤ 100) all pairs are used; otherwise 5 000
436/// random pairs are sampled.
437fn auto_sigma(data: &[Vec<f64>]) -> f64 {
438    let n = data.len();
439    if n < 2 {
440        return 1.0;
441    }
442
443    let mut distances = Vec::new();
444    let mut rng = SplitMix64::new(0xCAFE_BABE);
445
446    if n <= 100 {
447        distances.reserve(n * (n - 1) / 2);
448        for i in 0..n {
449            for j in (i + 1)..n {
450                distances.push(euclidean_dist(&data[i], &data[j]));
451            }
452        }
453    } else {
454        let num_pairs = 5000.min(n * (n - 1) / 2);
455        distances.reserve(num_pairs);
456        for _ in 0..num_pairs {
457            let i = (rng.next_u64() as usize) % n;
458            let mut j = (rng.next_u64() as usize) % n;
459            if j == i {
460                j = (j + 1) % n;
461            }
462            distances.push(euclidean_dist(&data[i], &data[j]));
463        }
464    }
465
466    // Median via `select_nth_unstable_by` is O(n) vs the O(n log n)
467    // full sort — for the 5000-pair sampled branch above that's the
468    // difference between ~65k and ~5k comparisons. `total_cmp` is
469    // NaN-safe (previously `.partial_cmp().unwrap()` panicked on
470    // degenerate zero-distance input).
471    let mid = distances.len() / 2;
472    let (_, median_ref, _) = distances.select_nth_unstable_by(mid, |a, b| a.total_cmp(b));
473    let median = *median_ref;
474
475    (median * std::f64::consts::FRAC_1_SQRT_2).max(1e-8)
476}
477
478fn euclidean_dist(a: &[f64], b: &[f64]) -> f64 {
479    a.iter()
480        .zip(b.iter())
481        .map(|(&x, &y)| {
482            let d = x - y;
483            d * d
484        })
485        .sum::<f64>()
486        .sqrt()
487}
488
489// ── Eigensolver for symmetric n×n matrix ───────────────────────────────
490
491/// Power iteration with deflation for the top-k eigenvectors of a
492/// symmetric n×n matrix stored as a flat row-major array.
493///
494/// Returns (eigenvectors, eigenvalues) sorted by decreasing eigenvalue.
495fn top_k_symmetric(matrix: &[f64], n: usize, k: usize) -> (Vec<Vec<f64>>, Vec<f64>) {
496    let max_iters = 300;
497    let tol = 1e-10;
498    let mut vectors: Vec<Vec<f64>> = Vec::with_capacity(k);
499    let mut values: Vec<f64> = Vec::with_capacity(k);
500    let mut rng = SplitMix64::new(0xBEEF_CAFE);
501
502    // Closure: write `matrix · src` into `dst`. Extracted because the
503    // inner loop needs this twice — once for the iterate and once for
504    // the Rayleigh quotient — and we cache the second call's result
505    // into the next iteration's iterate.
506    let matvec = |dst: &mut [f64], src: &[f64]| {
507        for (i, dst_i) in dst.iter_mut().enumerate() {
508            let row_start = i * n;
509            let mut s = 0.0;
510            for j in 0..n {
511                s += matrix[row_start + j] * src[j];
512            }
513            *dst_i = s;
514        }
515    };
516
517    for _ in 0..k {
518        let mut v: Vec<f64> = (0..n).map(|_| rng.normal()).collect();
519        normalize_vec(&mut v);
520        let mut eigenvalue = 0.0;
521
522        // Cached `matrix · v` from the previous iteration's Rayleigh
523        // step. `None` on first pass; populated afterward. Saves one
524        // full mat-vec per iteration.
525        //
526        // Note: a `.skip(1)` bug in earlier versions left `u[0] = 0`;
527        // see the `first_coordinate_is_nonzero` regression test.
528        let mut mv_cache: Option<Vec<f64>> = None;
529
530        for _ in 0..max_iters {
531            let mut u = mv_cache.take().unwrap_or_else(|| {
532                let mut u = vec![0.0; n];
533                matvec(&mut u, &v);
534                u
535            });
536
537            for prev in &vectors {
538                let proj = dot(&u, prev);
539                for (ui, &pi) in u.iter_mut().zip(prev.iter()) {
540                    *ui -= proj * pi;
541                }
542            }
543
544            let mag = normalize_vec(&mut u);
545            if mag < f64::EPSILON {
546                break;
547            }
548
549            // Rayleigh quotient: λ ≈ uᵀ · (matrix · u). Save the
550            // `matrix · u` vector — next iteration starts with
551            // `u_next = matrix · v_next = matrix · u_current`.
552            let mut mv_next = vec![0.0; n];
553            matvec(&mut mv_next, &u);
554            eigenvalue = u.iter().zip(mv_next.iter()).map(|(a, b)| a * b).sum();
555
556            // Convergence: |⟨u, v⟩| → 1 as u aligns with v.
557            // `.max(0.0)` clamps the FP noise that briefly pushes the
558            // value below zero near convergence.
559            let change = (1.0 - dot(&u, &v).abs()).max(0.0);
560            v = u;
561            mv_cache = Some(mv_next);
562
563            if change < tol {
564                break;
565            }
566        }
567
568        vectors.push(v);
569        values.push(eigenvalue);
570    }
571
572    while vectors.len() < k {
573        let mut v: Vec<f64> = (0..n).map(|_| rng.normal()).collect();
574        for prev in &vectors {
575            let proj = dot(&v, prev);
576            for (vi, &pi) in v.iter_mut().zip(prev.iter()) {
577                *vi -= proj * pi;
578            }
579        }
580        normalize_vec(&mut v);
581        vectors.push(v);
582        values.push(0.0);
583    }
584
585    (vectors, values)
586}
587
588#[cfg(test)]
589mod tests {
590    use super::*;
591    use sphereql_core::angular_distance;
592    use std::f64::consts::{PI, TAU};
593
594    fn emb(vals: &[f64]) -> Embedding {
595        Embedding::new(vals.to_vec())
596    }
597
598    fn corpus_10d() -> Vec<Embedding> {
599        vec![
600            emb(&[1.0, 0.0, 0.0, 0.1, 0.05, -0.02, 0.03, -0.01, 0.04, 0.02]),
601            emb(&[0.0, 1.0, 0.0, -0.05, 0.1, 0.03, -0.02, 0.01, -0.03, 0.04]),
602            emb(&[0.0, 0.0, 1.0, 0.02, -0.03, 0.1, 0.05, 0.02, -0.01, -0.04]),
603            emb(&[1.0, 1.0, 0.0, 0.05, 0.08, 0.01, 0.01, -0.02, 0.02, 0.03]),
604            emb(&[0.0, 1.0, 1.0, -0.02, 0.07, 0.07, 0.01, 0.02, -0.02, 0.01]),
605            emb(&[1.0, 0.0, 1.0, 0.06, 0.01, 0.05, -0.03, -0.01, 0.03, -0.02]),
606            emb(&[-1.0, 0.0, 0.0, -0.08, 0.02, 0.01, 0.02, 0.03, -0.02, 0.01]),
607            emb(&[0.0, -1.0, 0.0, 0.03, -0.09, -0.02, 0.01, -0.01, 0.02, -0.03]),
608        ]
609    }
610
611    fn assert_valid_spherical(sp: &SphericalPoint) {
612        assert!(sp.r >= 0.0, "r must be >= 0, got {}", sp.r);
613        assert!(
614            sp.theta >= 0.0 && sp.theta < TAU,
615            "theta must be in [0, 2π), got {}",
616            sp.theta
617        );
618        assert!(
619            sp.phi >= 0.0 && sp.phi <= PI,
620            "phi must be in [0, π], got {}",
621            sp.phi
622        );
623    }
624
625    /// Regression test for the `.skip(1)` bug in `top_k_symmetric`:
626    /// the mat-vec loop used to start at row 1, leaving `u[0] = 0.0`
627    /// on every power-iteration step. That silently zeroed the first
628    /// coordinate of every kernel PCA eigenvector. This test fails
629    /// loudly if the skip comes back.
630    #[test]
631    fn top_k_symmetric_first_coordinate_nonzero() {
632        // Symmetric, dominant-diagonal 4×4 matrix whose top eigenvector
633        // is approximately the all-ones vector (each coordinate ≈ 0.5).
634        let n = 4;
635        #[rustfmt::skip]
636        let matrix = vec![
637            4.0, 1.0, 0.5, 0.2,
638            1.0, 4.0, 1.0, 0.5,
639            0.5, 1.0, 4.0, 1.0,
640            0.2, 0.5, 1.0, 4.0,
641        ];
642        let (vectors, values) = top_k_symmetric(&matrix, n, 2);
643        assert_eq!(vectors.len(), 2);
644        for (idx, v) in vectors.iter().enumerate() {
645            assert!(
646                v[0].abs() > 1e-6,
647                "eigenvector {idx} has a suspiciously small first \
648                 coordinate ({:.2e}) — the .skip(1) bug is back",
649                v[0]
650            );
651        }
652        assert!(values[0] > values[1], "eigenvalues must be sorted");
653    }
654
655    #[test]
656    fn kernel_pca_fit_auto_sigma() {
657        let corpus = corpus_10d();
658        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
659        assert_eq!(kpca.dimensionality(), 10);
660        assert!(kpca.sigma() > 0.0);
661        assert_eq!(kpca.num_training_points(), 8);
662    }
663
664    #[test]
665    fn kernel_pca_fit_explicit_sigma() {
666        let corpus = corpus_10d();
667        let kpca =
668            KernelPcaProjection::fit_with_sigma(&corpus, 0.5, RadialStrategy::Fixed(1.0)).unwrap();
669        assert!((kpca.sigma() - 0.5).abs() < 1e-12);
670    }
671
672    #[test]
673    fn kernel_pca_fit_default() {
674        let corpus = corpus_10d();
675        let kpca = KernelPcaProjection::fit_default(&corpus).unwrap();
676        assert_eq!(kpca.dimensionality(), 10);
677    }
678
679    #[test]
680    fn kernel_pca_empty_corpus_returns_err() {
681        assert!(matches!(
682            KernelPcaProjection::fit(&[], RadialStrategy::Fixed(1.0)),
683            Err(ProjectionError::EmptyCorpus)
684        ));
685    }
686
687    #[test]
688    fn kernel_pca_too_few_dims_returns_err() {
689        assert!(matches!(
690            KernelPcaProjection::fit(&[emb(&[1.0, 2.0])], RadialStrategy::Fixed(1.0)),
691            Err(ProjectionError::DimensionTooLow {
692                got: 2,
693                required: 3
694            })
695        ));
696    }
697
698    #[test]
699    fn kernel_pca_zero_sigma_returns_err() {
700        let corpus = corpus_10d();
701        assert!(matches!(
702            KernelPcaProjection::fit_with_sigma(&corpus, 0.0, RadialStrategy::Fixed(1.0)),
703            Err(ProjectionError::InvalidSigma { got }) if got == 0.0
704        ));
705    }
706
707    #[test]
708    fn kernel_pca_produces_valid_spherical_points() {
709        let corpus = corpus_10d();
710        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
711        for e in &corpus {
712            assert_valid_spherical(&kpca.project(e));
713        }
714    }
715
716    #[test]
717    fn kernel_pca_out_of_sample_produces_valid_points() {
718        let corpus = corpus_10d();
719        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
720        let novel = emb(&[0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
721        assert_valid_spherical(&kpca.project(&novel));
722    }
723
724    #[test]
725    fn kernel_pca_preserves_angular_ordering() {
726        let corpus = corpus_10d();
727        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
728        let a = emb(&[1.0, 0.1, 0.0, 0.05, 0.02, -0.01, 0.01, 0.0, 0.02, 0.01]);
729        let b = emb(&[0.9, 0.2, 0.1, 0.04, 0.03, 0.0, 0.02, -0.01, 0.01, 0.02]);
730        let c = emb(&[-1.0, -0.1, 0.0, -0.04, 0.01, 0.02, 0.01, 0.02, -0.01, 0.01]);
731        let pa = kpca.project(&a);
732        let pb = kpca.project(&b);
733        let pc = kpca.project(&c);
734        let d_ab = angular_distance(&pa, &pb);
735        let d_ac = angular_distance(&pa, &pc);
736        assert!(
737            d_ab < d_ac,
738            "similar items should be closer: d(a,b)={d_ab:.4} < d(a,c)={d_ac:.4}"
739        );
740    }
741
742    #[test]
743    fn kernel_pca_fixed_radial() {
744        let corpus = corpus_10d();
745        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(2.5)).unwrap();
746        let sp = kpca.project(&corpus[0]);
747        assert!((sp.r - 2.5).abs() < 1e-12);
748    }
749
750    #[test]
751    fn kernel_pca_magnitude_radial() {
752        let corpus = corpus_10d();
753        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Magnitude).unwrap();
754        let short = emb(&[0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
755        let long = emb(&[10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
756        let ps = kpca.project(&short);
757        let pl = kpca.project(&long);
758        assert!(ps.r < pl.r, "longer vector should have larger radius");
759        assert!((ps.r - 0.1).abs() < 1e-10);
760        assert!((pl.r - 10.0).abs() < 1e-10);
761    }
762
763    #[test]
764    fn kernel_pca_volumetric() {
765        let corpus = corpus_10d();
766        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0))
767            .unwrap()
768            .with_volumetric(true);
769        let sp = kpca.project(&corpus[0]);
770        assert!(sp.r >= 0.0);
771    }
772
773    #[test]
774    fn kernel_pca_project_rich_has_valid_certainty() {
775        let corpus = corpus_10d();
776        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
777        for e in &corpus {
778            let rich = kpca.project_rich(e);
779            assert!(
780                rich.certainty >= 0.0 && rich.certainty <= 1.0,
781                "certainty must be in [0,1], got {}",
782                rich.certainty
783            );
784            assert!(rich.intensity > 0.0);
785            assert!(rich.projection_magnitude >= 0.0);
786        }
787    }
788
789    #[test]
790    fn kernel_pca_certainty_is_meaningful() {
791        let corpus = corpus_10d();
792        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
793        let total_certainty: f64 = corpus.iter().map(|e| kpca.project_rich(e).certainty).sum();
794        let mean_certainty = total_certainty / corpus.len() as f64;
795        assert!(
796            mean_certainty > 0.01,
797            "mean certainty of training data should be non-trivial, got {mean_certainty}"
798        );
799    }
800
801    #[test]
802    fn kernel_pca_explained_variance_in_range() {
803        let corpus = corpus_10d();
804        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
805        let ratio = kpca.explained_variance_ratio();
806        assert!(
807            ratio > 0.0 && ratio <= 1.0,
808            "explained variance ratio should be in (0, 1], got {ratio}"
809        );
810    }
811
812    #[test]
813    fn kernel_pca_eigenvalues_descending() {
814        let corpus = corpus_10d();
815        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
816        let ev = kpca.eigenvalues();
817        assert!(ev[0] >= ev[1], "eigenvalues must descend: {ev:?}");
818        assert!(ev[1] >= ev[2], "eigenvalues must descend: {ev:?}");
819        assert!(ev[0] > 0.0, "first eigenvalue should be positive");
820    }
821
822    #[test]
823    fn gaussian_kernel_self_similarity() {
824        let x = vec![1.0, 0.0, 0.0, 0.0, 0.0];
825        let inv = 0.5;
826        assert!((gaussian_kernel(&x, &x, inv) - 1.0).abs() < 1e-12);
827    }
828
829    #[test]
830    fn gaussian_kernel_symmetry() {
831        let a = vec![1.0, 0.0, 0.5];
832        let b = vec![0.0, 1.0, -0.3];
833        let inv = 1.0;
834        assert!((gaussian_kernel(&a, &b, inv) - gaussian_kernel(&b, &a, inv)).abs() < 1e-15);
835    }
836
837    #[test]
838    fn gaussian_kernel_range() {
839        let a = vec![1.0, 0.0, 0.0];
840        let b = vec![0.0, 1.0, 0.0];
841        let inv = 0.5;
842        let k = gaussian_kernel(&a, &b, inv);
843        assert!(
844            k > 0.0 && k <= 1.0,
845            "Gaussian kernel must be in (0, 1], got {k}"
846        );
847    }
848
849    #[test]
850    fn auto_sigma_is_positive() {
851        let corpus = corpus_10d();
852        let normalized: Vec<Vec<f64>> = corpus.iter().map(|e| e.normalized()).collect();
853        let sigma = auto_sigma(&normalized);
854        assert!(sigma > 0.0);
855    }
856
857    #[test]
858    fn auto_sigma_single_point() {
859        let data = vec![vec![1.0, 0.0, 0.0]];
860        assert!((auto_sigma(&data) - 1.0).abs() < 1e-12);
861    }
862
863    #[test]
864    #[should_panic(expected = "expected dimension 10")]
865    fn kernel_pca_dimension_mismatch_panics() {
866        let corpus = corpus_10d();
867        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
868        let _ = kpca.project(&emb(&[1.0, 2.0, 3.0]));
869    }
870
871    #[test]
872    fn kernel_pca_clone_produces_identical_results() {
873        let corpus = corpus_10d();
874        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
875        let kpca2 = kpca.clone();
876        for e in &corpus {
877            let sp1 = kpca.project(e);
878            let sp2 = kpca2.project(e);
879            assert!((sp1.theta - sp2.theta).abs() < 1e-12);
880            assert!((sp1.phi - sp2.phi).abs() < 1e-12);
881        }
882    }
883
884    #[test]
885    fn kernel_pca_works_with_embedding_index() {
886        use crate::query::EmbeddingIndex;
887        let corpus = corpus_10d();
888        let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
889        let mut idx = EmbeddingIndex::builder(kpca)
890            .theta_divisions(4)
891            .phi_divisions(3)
892            .build();
893        for (i, e) in corpus.iter().enumerate() {
894            idx.insert(format!("item-{i}"), e);
895        }
896        assert_eq!(idx.len(), corpus.len());
897        let query = emb(&[0.9, 0.1, 0.0, 0.05, 0.02, -0.01, 0.01, 0.0, 0.02, 0.01]);
898        let results = idx.search_nearest(&query, 3);
899        assert_eq!(results.len(), 3);
900        assert!(results[0].distance <= results[1].distance);
901    }
902
903    #[test]
904    fn large_sigma_approaches_pca_ordering() {
905        use crate::projection::PcaProjection;
906        let corpus = corpus_10d();
907        let pca = PcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
908        let kpca = KernelPcaProjection::fit_with_sigma(&corpus, 100.0, RadialStrategy::Fixed(1.0))
909            .unwrap();
910        let query = emb(&[1.0, 0.1, 0.0, 0.05, 0.02, -0.01, 0.01, 0.0, 0.02, 0.01]);
911        let pca_pt = pca.project(&query);
912        let kpca_pt = kpca.project(&query);
913        let mut pca_dists: Vec<(usize, f64)> = corpus
914            .iter()
915            .enumerate()
916            .map(|(i, e)| (i, angular_distance(&pca_pt, &pca.project(e))))
917            .collect();
918        let mut kpca_dists: Vec<(usize, f64)> = corpus
919            .iter()
920            .enumerate()
921            .map(|(i, e)| (i, angular_distance(&kpca_pt, &kpca.project(e))))
922            .collect();
923        pca_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
924        kpca_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
925        assert_eq!(
926            pca_dists[0].0, kpca_dists[0].0,
927            "nearest neighbour should match between PCA and kernel PCA with large σ"
928        );
929    }
930}
sphereql_embed/kernel_pca.rs

sphereql_embed/
kernel_pca.rs