1use sphereql_core::{CartesianPoint, SphericalPoint, cartesian_to_spherical};
2
3use crate::projection::{
4 ProjectionError, SplitMix64, dot, normalize_vec, project_xyz_to_spherical,
5};
6use crate::types::{Embedding, ProjectedPoint, RadialStrategy};
7
8use crate::projection::Projection;
9
10#[derive(Clone)]
45pub struct KernelPcaProjection {
46 training_data: Vec<Vec<f64>>,
49
50 sigma: f64,
52
53 inv_two_sigma_sq: f64,
55
56 alphas: [Vec<f64>; 3],
59
60 eigenvalues: [f64; 3],
62
63 col_means: Vec<f64>,
66
67 grand_mean: f64,
70
71 total_variance: f64,
73
74 dim: usize,
76
77 radial: RadialStrategy,
79
80 volumetric: bool,
83}
84
85impl KernelPcaProjection {
86 pub fn fit(embeddings: &[Embedding], radial: RadialStrategy) -> Result<Self, ProjectionError> {
93 Self::fit_impl(embeddings, None, radial)
94 }
95
96 pub fn fit_with_sigma(
102 embeddings: &[Embedding],
103 sigma: f64,
104 radial: RadialStrategy,
105 ) -> Result<Self, ProjectionError> {
106 if sigma <= 0.0 {
107 return Err(ProjectionError::InvalidSigma { got: sigma });
108 }
109 Self::fit_impl(embeddings, Some(sigma), radial)
110 }
111
112 pub fn fit_default(embeddings: &[Embedding]) -> Result<Self, ProjectionError> {
114 Self::fit(embeddings, RadialStrategy::default())
115 }
116
117 pub fn with_volumetric(mut self, enabled: bool) -> Self {
120 self.volumetric = enabled;
121 self
122 }
123
124 pub fn sigma(&self) -> f64 {
126 self.sigma
127 }
128
129 pub fn num_training_points(&self) -> usize {
131 self.training_data.len()
132 }
133
134 pub fn explained_variance_ratio(&self) -> f64 {
140 if self.total_variance < f64::EPSILON {
141 return 1.0;
142 }
143 let explained: f64 = self.eigenvalues.iter().sum();
144 (explained / (self.total_variance * self.training_data.len() as f64)).clamp(0.0, 1.0)
148 }
149
150 pub fn eigenvalues(&self) -> [f64; 3] {
152 self.eigenvalues
153 }
154
155 fn fit_impl(
158 embeddings: &[Embedding],
159 sigma: Option<f64>,
160 radial: RadialStrategy,
161 ) -> Result<Self, ProjectionError> {
162 if embeddings.is_empty() {
163 return Err(ProjectionError::EmptyCorpus);
164 }
165 let dim = embeddings[0].dimension();
166 if dim < 3 {
167 return Err(ProjectionError::DimensionTooLow {
168 got: dim,
169 required: 3,
170 });
171 }
172 for (i, e) in embeddings.iter().enumerate() {
173 if e.dimension() != dim {
174 return Err(ProjectionError::InconsistentDimension {
175 index: i,
176 expected: dim,
177 got: e.dimension(),
178 });
179 }
180 }
181
182 let normalized: Vec<Vec<f64>> = embeddings.iter().map(|e| e.normalized()).collect();
184 let n = normalized.len();
185
186 let sigma = sigma.unwrap_or_else(|| auto_sigma(&normalized));
188 let inv_two_sigma_sq = 0.5 / (sigma * sigma);
189
190 let mut kernel_flat = vec![0.0_f64; n * n];
195 for i in 0..n {
196 kernel_flat[i * n + i] = 1.0; for j in (i + 1)..n {
198 let val = gaussian_kernel(&normalized[i], &normalized[j], inv_two_sigma_sq);
199 kernel_flat[i * n + j] = val;
200 kernel_flat[j * n + i] = val;
201 }
202 }
203
204 let mut col_means = vec![0.0; n];
208 let mut grand_sum = 0.0;
209 for i in 0..n {
210 let mut row_sum = 0.0;
211 for j in 0..n {
212 row_sum += kernel_flat[i * n + j];
213 }
214 col_means[i] = row_sum / n as f64;
215 grand_sum += row_sum;
216 }
217 let grand_mean = grand_sum / (n * n) as f64;
218
219 for i in 0..n {
229 for j in i..n {
230 let v = kernel_flat[i * n + j] - col_means[i] - col_means[j] + grand_mean;
231 kernel_flat[i * n + j] = v;
232 kernel_flat[j * n + i] = v;
233 }
234 }
235
236 let trace: f64 = (0..n).map(|i| kernel_flat[i * n + i]).sum();
238 let total_variance = trace / n as f64;
239
240 let (raw_vectors, raw_values) = top_k_symmetric(&kernel_flat, n, 3);
242
243 let mut alphas: [Vec<f64>; 3] = [Vec::new(), Vec::new(), Vec::new()];
246 let mut eigenvalues = [0.0_f64; 3];
247 for l in 0..3 {
248 let lambda = raw_values.get(l).copied().unwrap_or(0.0);
249 eigenvalues[l] = lambda;
250 if lambda > f64::EPSILON {
251 let scale = 1.0 / lambda.sqrt();
252 alphas[l] = raw_vectors[l].iter().map(|&a| a * scale).collect();
253 } else {
254 alphas[l] = vec![0.0; n];
255 }
256 }
257
258 Ok(Self {
259 training_data: normalized,
260 sigma,
261 inv_two_sigma_sq,
262 alphas,
263 eigenvalues,
264 col_means,
265 grand_mean,
266 total_variance,
267 dim,
268 radial,
269 volumetric: false,
270 })
271 }
272
273 fn kernel_project(&self, normalized: &[f64]) -> (f64, f64, f64, f64) {
277 let n = self.training_data.len();
278
279 let k_z: Vec<f64> = self
281 .training_data
282 .iter()
283 .map(|x_i| gaussian_kernel(normalized, x_i, self.inv_two_sigma_sq))
284 .collect();
285
286 let mean_k_z: f64 = k_z.iter().sum::<f64>() / n as f64;
288
289 let k_z_centered: Vec<f64> = (0..n)
291 .map(|i| k_z[i] - mean_k_z - self.col_means[i] + self.grand_mean)
292 .collect();
293
294 let f1 = dot(&self.alphas[0], &k_z_centered);
296 let f2 = dot(&self.alphas[1], &k_z_centered);
297 let f3 = dot(&self.alphas[2], &k_z_centered);
298
299 let spherical_potential = (1.0 - 2.0 * mean_k_z + self.grand_mean).max(0.0);
305
306 (f1, f2, f3, spherical_potential)
307 }
308}
309
310impl Projection for KernelPcaProjection {
311 fn project(&self, embedding: &Embedding) -> SphericalPoint {
312 assert_eq!(
313 embedding.dimension(),
314 self.dim,
315 "expected dimension {}, got {}",
316 self.dim,
317 embedding.dimension()
318 );
319
320 let normalized = embedding.normalized();
321 let (x, y, z, _) = self.kernel_project(&normalized);
322
323 if self.volumetric {
324 let sp = cartesian_to_spherical(&CartesianPoint::new(x, y, z));
325 if sp.r < f64::EPSILON {
326 return SphericalPoint::new_unchecked(0.0, 0.0, 0.0);
327 }
328 SphericalPoint::new_unchecked(sp.r, sp.theta, sp.phi)
329 } else {
330 let r = self.radial.compute(embedding.magnitude());
331 project_xyz_to_spherical(x, y, z, r)
332 }
333 }
334
335 fn project_rich(&self, embedding: &Embedding) -> ProjectedPoint {
336 assert_eq!(
337 embedding.dimension(),
338 self.dim,
339 "expected dimension {}, got {}",
340 self.dim,
341 embedding.dimension()
342 );
343
344 let intensity = embedding.magnitude();
345 let normalized = embedding.normalized();
346 let (x, y, z, spherical_potential) = self.kernel_project(&normalized);
347
348 let projection_magnitude = (x * x + y * y + z * z).sqrt();
349
350 let projection_sq = x * x + y * y + z * z;
354 let certainty = if spherical_potential > f64::EPSILON {
355 (projection_sq / spherical_potential).clamp(0.0, 1.0)
356 } else {
357 0.0
358 };
359
360 let position = if self.volumetric {
361 let sp = cartesian_to_spherical(&CartesianPoint::new(x, y, z));
362 if sp.r < f64::EPSILON {
363 SphericalPoint::new_unchecked(0.0, 0.0, 0.0)
364 } else {
365 SphericalPoint::new_unchecked(sp.r, sp.theta, sp.phi)
366 }
367 } else {
368 let r = self.radial.compute(intensity);
369 project_xyz_to_spherical(x, y, z, r)
370 };
371
372 ProjectedPoint::new(position, certainty, intensity, projection_magnitude)
373 }
374
375 fn dimensionality(&self) -> usize {
376 self.dim
377 }
378}
379
380#[inline]
384fn gaussian_kernel(a: &[f64], b: &[f64], inv_two_sigma_sq: f64) -> f64 {
385 let sq_dist: f64 = a
386 .iter()
387 .zip(b.iter())
388 .map(|(&x, &y)| {
389 let d = x - y;
390 d * d
391 })
392 .sum();
393 (-sq_dist * inv_two_sigma_sq).exp()
394}
395
396fn auto_sigma(data: &[Vec<f64>]) -> f64 {
404 let n = data.len();
405 if n < 2 {
406 return 1.0;
407 }
408
409 let mut distances = Vec::new();
410 let mut rng = SplitMix64::new(0xCAFE_BABE);
411
412 if n <= 100 {
413 distances.reserve(n * (n - 1) / 2);
414 for i in 0..n {
415 for j in (i + 1)..n {
416 distances.push(euclidean_dist(&data[i], &data[j]));
417 }
418 }
419 } else {
420 let num_pairs = 5000.min(n * (n - 1) / 2);
421 distances.reserve(num_pairs);
422 for _ in 0..num_pairs {
423 let i = (rng.next_u64() as usize) % n;
424 let mut j = (rng.next_u64() as usize) % n;
425 if j == i {
426 j = (j + 1) % n;
427 }
428 distances.push(euclidean_dist(&data[i], &data[j]));
429 }
430 }
431
432 let mid = distances.len() / 2;
438 let (_, median_ref, _) = distances.select_nth_unstable_by(mid, |a, b| a.total_cmp(b));
439 let median = *median_ref;
440
441 (median * std::f64::consts::FRAC_1_SQRT_2).max(1e-8)
442}
443
444fn euclidean_dist(a: &[f64], b: &[f64]) -> f64 {
445 a.iter()
446 .zip(b.iter())
447 .map(|(&x, &y)| {
448 let d = x - y;
449 d * d
450 })
451 .sum::<f64>()
452 .sqrt()
453}
454
455fn top_k_symmetric(matrix: &[f64], n: usize, k: usize) -> (Vec<Vec<f64>>, Vec<f64>) {
462 let max_iters = 300;
463 let tol = 1e-10;
464 let mut vectors: Vec<Vec<f64>> = Vec::with_capacity(k);
465 let mut values: Vec<f64> = Vec::with_capacity(k);
466 let mut rng = SplitMix64::new(0xBEEF_CAFE);
467
468 let matvec = |dst: &mut [f64], src: &[f64]| {
473 for (i, dst_i) in dst.iter_mut().enumerate() {
474 let row_start = i * n;
475 let mut s = 0.0;
476 for j in 0..n {
477 s += matrix[row_start + j] * src[j];
478 }
479 *dst_i = s;
480 }
481 };
482
483 for _ in 0..k {
484 let mut v: Vec<f64> = (0..n).map(|_| rng.normal()).collect();
485 normalize_vec(&mut v);
486 let mut eigenvalue = 0.0;
487
488 let mut mv_cache: Option<Vec<f64>> = None;
495
496 for _ in 0..max_iters {
497 let mut u = mv_cache.take().unwrap_or_else(|| {
498 let mut u = vec![0.0; n];
499 matvec(&mut u, &v);
500 u
501 });
502
503 for prev in &vectors {
504 let proj = dot(&u, prev);
505 for (ui, &pi) in u.iter_mut().zip(prev.iter()) {
506 *ui -= proj * pi;
507 }
508 }
509
510 let mag = normalize_vec(&mut u);
511 if mag < f64::EPSILON {
512 break;
513 }
514
515 let mut mv_next = vec![0.0; n];
519 matvec(&mut mv_next, &u);
520 eigenvalue = u.iter().zip(mv_next.iter()).map(|(a, b)| a * b).sum();
521
522 let change = (1.0 - dot(&u, &v).abs()).max(0.0);
526 v = u;
527 mv_cache = Some(mv_next);
528
529 if change < tol {
530 break;
531 }
532 }
533
534 vectors.push(v);
535 values.push(eigenvalue);
536 }
537
538 while vectors.len() < k {
539 let mut v: Vec<f64> = (0..n).map(|_| rng.normal()).collect();
540 for prev in &vectors {
541 let proj = dot(&v, prev);
542 for (vi, &pi) in v.iter_mut().zip(prev.iter()) {
543 *vi -= proj * pi;
544 }
545 }
546 normalize_vec(&mut v);
547 vectors.push(v);
548 values.push(0.0);
549 }
550
551 (vectors, values)
552}
553
554#[cfg(test)]
555mod tests {
556 use super::*;
557 use sphereql_core::angular_distance;
558 use std::f64::consts::{PI, TAU};
559
560 fn emb(vals: &[f64]) -> Embedding {
561 Embedding::new(vals.to_vec())
562 }
563
564 fn corpus_10d() -> Vec<Embedding> {
565 vec![
566 emb(&[1.0, 0.0, 0.0, 0.1, 0.05, -0.02, 0.03, -0.01, 0.04, 0.02]),
567 emb(&[0.0, 1.0, 0.0, -0.05, 0.1, 0.03, -0.02, 0.01, -0.03, 0.04]),
568 emb(&[0.0, 0.0, 1.0, 0.02, -0.03, 0.1, 0.05, 0.02, -0.01, -0.04]),
569 emb(&[1.0, 1.0, 0.0, 0.05, 0.08, 0.01, 0.01, -0.02, 0.02, 0.03]),
570 emb(&[0.0, 1.0, 1.0, -0.02, 0.07, 0.07, 0.01, 0.02, -0.02, 0.01]),
571 emb(&[1.0, 0.0, 1.0, 0.06, 0.01, 0.05, -0.03, -0.01, 0.03, -0.02]),
572 emb(&[-1.0, 0.0, 0.0, -0.08, 0.02, 0.01, 0.02, 0.03, -0.02, 0.01]),
573 emb(&[0.0, -1.0, 0.0, 0.03, -0.09, -0.02, 0.01, -0.01, 0.02, -0.03]),
574 ]
575 }
576
577 fn assert_valid_spherical(sp: &SphericalPoint) {
578 assert!(sp.r >= 0.0, "r must be >= 0, got {}", sp.r);
579 assert!(
580 sp.theta >= 0.0 && sp.theta < TAU,
581 "theta must be in [0, 2π), got {}",
582 sp.theta
583 );
584 assert!(
585 sp.phi >= 0.0 && sp.phi <= PI,
586 "phi must be in [0, π], got {}",
587 sp.phi
588 );
589 }
590
591 #[test]
597 fn top_k_symmetric_first_coordinate_nonzero() {
598 let n = 4;
601 #[rustfmt::skip]
602 let matrix = vec![
603 4.0, 1.0, 0.5, 0.2,
604 1.0, 4.0, 1.0, 0.5,
605 0.5, 1.0, 4.0, 1.0,
606 0.2, 0.5, 1.0, 4.0,
607 ];
608 let (vectors, values) = top_k_symmetric(&matrix, n, 2);
609 assert_eq!(vectors.len(), 2);
610 for (idx, v) in vectors.iter().enumerate() {
611 assert!(
612 v[0].abs() > 1e-6,
613 "eigenvector {idx} has a suspiciously small first \
614 coordinate ({:.2e}) — the .skip(1) bug is back",
615 v[0]
616 );
617 }
618 assert!(values[0] > values[1], "eigenvalues must be sorted");
619 }
620
621 #[test]
622 fn kernel_pca_fit_auto_sigma() {
623 let corpus = corpus_10d();
624 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
625 assert_eq!(kpca.dimensionality(), 10);
626 assert!(kpca.sigma() > 0.0);
627 assert_eq!(kpca.num_training_points(), 8);
628 }
629
630 #[test]
631 fn kernel_pca_fit_explicit_sigma() {
632 let corpus = corpus_10d();
633 let kpca =
634 KernelPcaProjection::fit_with_sigma(&corpus, 0.5, RadialStrategy::Fixed(1.0)).unwrap();
635 assert!((kpca.sigma() - 0.5).abs() < 1e-12);
636 }
637
638 #[test]
639 fn kernel_pca_fit_default() {
640 let corpus = corpus_10d();
641 let kpca = KernelPcaProjection::fit_default(&corpus).unwrap();
642 assert_eq!(kpca.dimensionality(), 10);
643 }
644
645 #[test]
646 fn kernel_pca_empty_corpus_returns_err() {
647 assert!(matches!(
648 KernelPcaProjection::fit(&[], RadialStrategy::Fixed(1.0)),
649 Err(ProjectionError::EmptyCorpus)
650 ));
651 }
652
653 #[test]
654 fn kernel_pca_too_few_dims_returns_err() {
655 assert!(matches!(
656 KernelPcaProjection::fit(&[emb(&[1.0, 2.0])], RadialStrategy::Fixed(1.0)),
657 Err(ProjectionError::DimensionTooLow {
658 got: 2,
659 required: 3
660 })
661 ));
662 }
663
664 #[test]
665 fn kernel_pca_zero_sigma_returns_err() {
666 let corpus = corpus_10d();
667 assert!(matches!(
668 KernelPcaProjection::fit_with_sigma(&corpus, 0.0, RadialStrategy::Fixed(1.0)),
669 Err(ProjectionError::InvalidSigma { got }) if got == 0.0
670 ));
671 }
672
673 #[test]
674 fn kernel_pca_produces_valid_spherical_points() {
675 let corpus = corpus_10d();
676 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
677 for e in &corpus {
678 assert_valid_spherical(&kpca.project(e));
679 }
680 }
681
682 #[test]
683 fn kernel_pca_out_of_sample_produces_valid_points() {
684 let corpus = corpus_10d();
685 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
686 let novel = emb(&[0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
687 assert_valid_spherical(&kpca.project(&novel));
688 }
689
690 #[test]
691 fn kernel_pca_preserves_angular_ordering() {
692 let corpus = corpus_10d();
693 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
694 let a = emb(&[1.0, 0.1, 0.0, 0.05, 0.02, -0.01, 0.01, 0.0, 0.02, 0.01]);
695 let b = emb(&[0.9, 0.2, 0.1, 0.04, 0.03, 0.0, 0.02, -0.01, 0.01, 0.02]);
696 let c = emb(&[-1.0, -0.1, 0.0, -0.04, 0.01, 0.02, 0.01, 0.02, -0.01, 0.01]);
697 let pa = kpca.project(&a);
698 let pb = kpca.project(&b);
699 let pc = kpca.project(&c);
700 let d_ab = angular_distance(&pa, &pb);
701 let d_ac = angular_distance(&pa, &pc);
702 assert!(
703 d_ab < d_ac,
704 "similar items should be closer: d(a,b)={d_ab:.4} < d(a,c)={d_ac:.4}"
705 );
706 }
707
708 #[test]
709 fn kernel_pca_fixed_radial() {
710 let corpus = corpus_10d();
711 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(2.5)).unwrap();
712 let sp = kpca.project(&corpus[0]);
713 assert!((sp.r - 2.5).abs() < 1e-12);
714 }
715
716 #[test]
717 fn kernel_pca_magnitude_radial() {
718 let corpus = corpus_10d();
719 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Magnitude).unwrap();
720 let short = emb(&[0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
721 let long = emb(&[10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
722 let ps = kpca.project(&short);
723 let pl = kpca.project(&long);
724 assert!(ps.r < pl.r, "longer vector should have larger radius");
725 assert!((ps.r - 0.1).abs() < 1e-10);
726 assert!((pl.r - 10.0).abs() < 1e-10);
727 }
728
729 #[test]
730 fn kernel_pca_volumetric() {
731 let corpus = corpus_10d();
732 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0))
733 .unwrap()
734 .with_volumetric(true);
735 let sp = kpca.project(&corpus[0]);
736 assert!(sp.r >= 0.0);
737 }
738
739 #[test]
740 fn kernel_pca_project_rich_has_valid_certainty() {
741 let corpus = corpus_10d();
742 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
743 for e in &corpus {
744 let rich = kpca.project_rich(e);
745 assert!(
746 rich.certainty >= 0.0 && rich.certainty <= 1.0,
747 "certainty must be in [0,1], got {}",
748 rich.certainty
749 );
750 assert!(rich.intensity > 0.0);
751 assert!(rich.projection_magnitude >= 0.0);
752 }
753 }
754
755 #[test]
756 fn kernel_pca_certainty_is_meaningful() {
757 let corpus = corpus_10d();
758 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
759 let total_certainty: f64 = corpus.iter().map(|e| kpca.project_rich(e).certainty).sum();
760 let mean_certainty = total_certainty / corpus.len() as f64;
761 assert!(
762 mean_certainty > 0.01,
763 "mean certainty of training data should be non-trivial, got {mean_certainty}"
764 );
765 }
766
767 #[test]
768 fn kernel_pca_explained_variance_in_range() {
769 let corpus = corpus_10d();
770 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
771 let ratio = kpca.explained_variance_ratio();
772 assert!(
773 ratio > 0.0 && ratio <= 1.0,
774 "explained variance ratio should be in (0, 1], got {ratio}"
775 );
776 }
777
778 #[test]
779 fn kernel_pca_eigenvalues_descending() {
780 let corpus = corpus_10d();
781 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
782 let ev = kpca.eigenvalues();
783 assert!(ev[0] >= ev[1], "eigenvalues must descend: {ev:?}");
784 assert!(ev[1] >= ev[2], "eigenvalues must descend: {ev:?}");
785 assert!(ev[0] > 0.0, "first eigenvalue should be positive");
786 }
787
788 #[test]
789 fn gaussian_kernel_self_similarity() {
790 let x = vec![1.0, 0.0, 0.0, 0.0, 0.0];
791 let inv = 0.5;
792 assert!((gaussian_kernel(&x, &x, inv) - 1.0).abs() < 1e-12);
793 }
794
795 #[test]
796 fn gaussian_kernel_symmetry() {
797 let a = vec![1.0, 0.0, 0.5];
798 let b = vec![0.0, 1.0, -0.3];
799 let inv = 1.0;
800 assert!((gaussian_kernel(&a, &b, inv) - gaussian_kernel(&b, &a, inv)).abs() < 1e-15);
801 }
802
803 #[test]
804 fn gaussian_kernel_range() {
805 let a = vec![1.0, 0.0, 0.0];
806 let b = vec![0.0, 1.0, 0.0];
807 let inv = 0.5;
808 let k = gaussian_kernel(&a, &b, inv);
809 assert!(
810 k > 0.0 && k <= 1.0,
811 "Gaussian kernel must be in (0, 1], got {k}"
812 );
813 }
814
815 #[test]
816 fn auto_sigma_is_positive() {
817 let corpus = corpus_10d();
818 let normalized: Vec<Vec<f64>> = corpus.iter().map(|e| e.normalized()).collect();
819 let sigma = auto_sigma(&normalized);
820 assert!(sigma > 0.0);
821 }
822
823 #[test]
824 fn auto_sigma_single_point() {
825 let data = vec![vec![1.0, 0.0, 0.0]];
826 assert!((auto_sigma(&data) - 1.0).abs() < 1e-12);
827 }
828
829 #[test]
830 #[should_panic(expected = "expected dimension 10")]
831 fn kernel_pca_dimension_mismatch_panics() {
832 let corpus = corpus_10d();
833 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
834 let _ = kpca.project(&emb(&[1.0, 2.0, 3.0]));
835 }
836
837 #[test]
838 fn kernel_pca_clone_produces_identical_results() {
839 let corpus = corpus_10d();
840 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
841 let kpca2 = kpca.clone();
842 for e in &corpus {
843 let sp1 = kpca.project(e);
844 let sp2 = kpca2.project(e);
845 assert!((sp1.theta - sp2.theta).abs() < 1e-12);
846 assert!((sp1.phi - sp2.phi).abs() < 1e-12);
847 }
848 }
849
850 #[test]
851 fn kernel_pca_works_with_embedding_index() {
852 use crate::query::EmbeddingIndex;
853 let corpus = corpus_10d();
854 let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
855 let mut idx = EmbeddingIndex::builder(kpca)
856 .theta_divisions(4)
857 .phi_divisions(3)
858 .build();
859 for (i, e) in corpus.iter().enumerate() {
860 idx.insert(format!("item-{i}"), e);
861 }
862 assert_eq!(idx.len(), corpus.len());
863 let query = emb(&[0.9, 0.1, 0.0, 0.05, 0.02, -0.01, 0.01, 0.0, 0.02, 0.01]);
864 let results = idx.search_nearest(&query, 3);
865 assert_eq!(results.len(), 3);
866 assert!(results[0].distance <= results[1].distance);
867 }
868
869 #[test]
870 fn large_sigma_approaches_pca_ordering() {
871 use crate::projection::PcaProjection;
872 let corpus = corpus_10d();
873 let pca = PcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
874 let kpca = KernelPcaProjection::fit_with_sigma(&corpus, 100.0, RadialStrategy::Fixed(1.0))
875 .unwrap();
876 let query = emb(&[1.0, 0.1, 0.0, 0.05, 0.02, -0.01, 0.01, 0.0, 0.02, 0.01]);
877 let pca_pt = pca.project(&query);
878 let kpca_pt = kpca.project(&query);
879 let mut pca_dists: Vec<(usize, f64)> = corpus
880 .iter()
881 .enumerate()
882 .map(|(i, e)| (i, angular_distance(&pca_pt, &pca.project(e))))
883 .collect();
884 let mut kpca_dists: Vec<(usize, f64)> = corpus
885 .iter()
886 .enumerate()
887 .map(|(i, e)| (i, angular_distance(&kpca_pt, &kpca.project(e))))
888 .collect();
889 pca_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
890 kpca_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
891 assert_eq!(
892 pca_dists[0].0, kpca_dists[0].0,
893 "nearest neighbour should match between PCA and kernel PCA with large σ"
894 );
895 }
896}