use sphereql_core::{CartesianPoint, SphericalPoint, cartesian_to_spherical};
use crate::projection::{SplitMix64, dot, normalize_vec, project_xyz_to_spherical};
use crate::types::{Embedding, ProjectedPoint, RadialStrategy};
use crate::projection::Projection;
#[derive(Clone)]
pub struct KernelPcaProjection {
training_data: Vec<Vec<f64>>,
sigma: f64,
inv_two_sigma_sq: f64,
alphas: [Vec<f64>; 3],
eigenvalues: [f64; 3],
col_means: Vec<f64>,
grand_mean: f64,
total_variance: f64,
dim: usize,
radial: RadialStrategy,
volumetric: bool,
}
impl KernelPcaProjection {
pub fn fit(embeddings: &[Embedding], radial: RadialStrategy) -> Self {
Self::fit_impl(embeddings, None, radial)
}
pub fn fit_with_sigma(embeddings: &[Embedding], sigma: f64, radial: RadialStrategy) -> Self {
assert!(sigma > 0.0, "σ must be positive, got {sigma}");
Self::fit_impl(embeddings, Some(sigma), radial)
}
pub fn fit_default(embeddings: &[Embedding]) -> Self {
Self::fit(embeddings, RadialStrategy::default())
}
pub fn with_volumetric(mut self, enabled: bool) -> Self {
self.volumetric = enabled;
self
}
pub fn sigma(&self) -> f64 {
self.sigma
}
pub fn num_training_points(&self) -> usize {
self.training_data.len()
}
pub fn explained_variance_ratio(&self) -> f64 {
if self.total_variance < f64::EPSILON {
return 1.0;
}
let explained: f64 = self.eigenvalues.iter().sum();
(explained / (self.total_variance * self.training_data.len() as f64)).clamp(0.0, 1.0)
}
pub fn eigenvalues(&self) -> [f64; 3] {
self.eigenvalues
}
fn fit_impl(embeddings: &[Embedding], sigma: Option<f64>, radial: RadialStrategy) -> Self {
assert!(
!embeddings.is_empty(),
"need at least one embedding to fit kernel PCA"
);
let dim = embeddings[0].dimension();
assert!(dim >= 3, "embedding dimension must be >= 3");
for (i, e) in embeddings.iter().enumerate() {
assert_eq!(
e.dimension(),
dim,
"embedding {i} has dimension {}, expected {dim}",
e.dimension()
);
}
let normalized: Vec<Vec<f64>> = embeddings.iter().map(|e| e.normalized()).collect();
let n = normalized.len();
let sigma = sigma.unwrap_or_else(|| auto_sigma(&normalized));
let inv_two_sigma_sq = 0.5 / (sigma * sigma);
let mut kernel_flat = vec![0.0_f64; n * n];
for i in 0..n {
kernel_flat[i * n + i] = 1.0; for j in (i + 1)..n {
let val = gaussian_kernel(&normalized[i], &normalized[j], inv_two_sigma_sq);
kernel_flat[i * n + j] = val;
kernel_flat[j * n + i] = val;
}
}
let mut col_means = vec![0.0; n];
let mut grand_sum = 0.0;
for i in 0..n {
let mut row_sum = 0.0;
for j in 0..n {
row_sum += kernel_flat[i * n + j];
}
col_means[i] = row_sum / n as f64;
grand_sum += row_sum;
}
let grand_mean = grand_sum / (n * n) as f64;
for i in 0..n {
for j in 0..n {
kernel_flat[i * n + j] -= col_means[i] + col_means[j] - grand_mean;
}
}
let trace: f64 = (0..n).map(|i| kernel_flat[i * n + i]).sum();
let total_variance = trace / n as f64;
let (raw_vectors, raw_values) = top_k_symmetric(&kernel_flat, n, 3);
let mut alphas: [Vec<f64>; 3] = [Vec::new(), Vec::new(), Vec::new()];
let mut eigenvalues = [0.0_f64; 3];
for l in 0..3 {
let lambda = raw_values.get(l).copied().unwrap_or(0.0);
eigenvalues[l] = lambda;
if lambda > f64::EPSILON {
let scale = 1.0 / lambda.sqrt();
alphas[l] = raw_vectors[l].iter().map(|&a| a * scale).collect();
} else {
alphas[l] = vec![0.0; n];
}
}
Self {
training_data: normalized,
sigma,
inv_two_sigma_sq,
alphas,
eigenvalues,
col_means,
grand_mean,
total_variance,
dim,
radial,
volumetric: false,
}
}
fn kernel_project(&self, normalized: &[f64]) -> (f64, f64, f64, f64) {
let n = self.training_data.len();
let k_z: Vec<f64> = self
.training_data
.iter()
.map(|x_i| gaussian_kernel(normalized, x_i, self.inv_two_sigma_sq))
.collect();
let mean_k_z: f64 = k_z.iter().sum::<f64>() / n as f64;
let k_z_centered: Vec<f64> = (0..n)
.map(|i| k_z[i] - mean_k_z - self.col_means[i] + self.grand_mean)
.collect();
let f1 = dot(&self.alphas[0], &k_z_centered);
let f2 = dot(&self.alphas[1], &k_z_centered);
let f3 = dot(&self.alphas[2], &k_z_centered);
let spherical_potential = (1.0 - 2.0 * mean_k_z + self.grand_mean).max(0.0);
(f1, f2, f3, spherical_potential)
}
}
impl Projection for KernelPcaProjection {
fn project(&self, embedding: &Embedding) -> SphericalPoint {
assert_eq!(
embedding.dimension(),
self.dim,
"expected dimension {}, got {}",
self.dim,
embedding.dimension()
);
let normalized = embedding.normalized();
let (x, y, z, _) = self.kernel_project(&normalized);
if self.volumetric {
let sp = cartesian_to_spherical(&CartesianPoint::new(x, y, z));
if sp.r < f64::EPSILON {
return SphericalPoint::new_unchecked(0.0, 0.0, 0.0);
}
SphericalPoint::new_unchecked(sp.r, sp.theta, sp.phi)
} else {
let r = self.radial.compute(embedding.magnitude());
project_xyz_to_spherical(x, y, z, r)
}
}
fn project_rich(&self, embedding: &Embedding) -> ProjectedPoint {
assert_eq!(
embedding.dimension(),
self.dim,
"expected dimension {}, got {}",
self.dim,
embedding.dimension()
);
let intensity = embedding.magnitude();
let normalized = embedding.normalized();
let (x, y, z, spherical_potential) = self.kernel_project(&normalized);
let projection_magnitude = (x * x + y * y + z * z).sqrt();
let projection_sq = x * x + y * y + z * z;
let certainty = if spherical_potential > f64::EPSILON {
(projection_sq / spherical_potential).clamp(0.0, 1.0)
} else {
0.0
};
let position = if self.volumetric {
let sp = cartesian_to_spherical(&CartesianPoint::new(x, y, z));
if sp.r < f64::EPSILON {
SphericalPoint::new_unchecked(0.0, 0.0, 0.0)
} else {
SphericalPoint::new_unchecked(sp.r, sp.theta, sp.phi)
}
} else {
let r = self.radial.compute(intensity);
project_xyz_to_spherical(x, y, z, r)
};
ProjectedPoint::new(position, certainty, intensity, projection_magnitude)
}
fn dimensionality(&self) -> usize {
self.dim
}
}
#[inline]
fn gaussian_kernel(a: &[f64], b: &[f64], inv_two_sigma_sq: f64) -> f64 {
let sq_dist: f64 = a
.iter()
.zip(b.iter())
.map(|(&x, &y)| {
let d = x - y;
d * d
})
.sum();
(-sq_dist * inv_two_sigma_sq).exp()
}
fn auto_sigma(data: &[Vec<f64>]) -> f64 {
let n = data.len();
if n < 2 {
return 1.0;
}
let mut distances = Vec::new();
let mut rng = SplitMix64::new(0xCAFE_BABE);
if n <= 100 {
distances.reserve(n * (n - 1) / 2);
for i in 0..n {
for j in (i + 1)..n {
distances.push(euclidean_dist(&data[i], &data[j]));
}
}
} else {
let num_pairs = 5000.min(n * (n - 1) / 2);
distances.reserve(num_pairs);
for _ in 0..num_pairs {
let i = (rng.next_u64() as usize) % n;
let mut j = (rng.next_u64() as usize) % n;
if j == i {
j = (j + 1) % n;
}
distances.push(euclidean_dist(&data[i], &data[j]));
}
}
distances.sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
let median = distances[distances.len() / 2];
(median * std::f64::consts::FRAC_1_SQRT_2).max(1e-8)
}
fn euclidean_dist(a: &[f64], b: &[f64]) -> f64 {
a.iter()
.zip(b.iter())
.map(|(&x, &y)| {
let d = x - y;
d * d
})
.sum::<f64>()
.sqrt()
}
fn top_k_symmetric(matrix: &[f64], n: usize, k: usize) -> (Vec<Vec<f64>>, Vec<f64>) {
let max_iters = 300;
let tol = 1e-10;
let mut vectors: Vec<Vec<f64>> = Vec::with_capacity(k);
let mut values: Vec<f64> = Vec::with_capacity(k);
let mut rng = SplitMix64::new(0xBEEF_CAFE);
for _ in 0..k {
let mut v: Vec<f64> = (0..n).map(|_| rng.normal()).collect();
normalize_vec(&mut v);
let mut eigenvalue = 0.0;
for _ in 0..max_iters {
let mut u = vec![0.0; n];
for (i, u_i) in u.iter_mut().enumerate().skip(1) {
let row_start = i * n;
let mut s = 0.0;
for j in 0..n {
s += matrix[row_start + j] * v[j];
}
*u_i = s;
}
for prev in &vectors {
let proj = dot(&u, prev);
for (ui, &pi) in u.iter_mut().zip(prev.iter()) {
*ui -= proj * pi;
}
}
let mag = normalize_vec(&mut u);
if mag < f64::EPSILON {
break;
}
eigenvalue = {
let mut s = 0.0;
for i in 0..n {
let row_start = i * n;
let mut mv_i = 0.0;
for j in 0..n {
mv_i += matrix[row_start + j] * u[j];
}
s += u[i] * mv_i;
}
s
};
let change = 1.0 - dot(&u, &v).abs();
v = u;
if change < tol {
break;
}
}
vectors.push(v);
values.push(eigenvalue);
}
while vectors.len() < k {
let mut v: Vec<f64> = (0..n).map(|_| rng.normal()).collect();
for prev in &vectors {
let proj = dot(&v, prev);
for (vi, &pi) in v.iter_mut().zip(prev.iter()) {
*vi -= proj * pi;
}
}
normalize_vec(&mut v);
vectors.push(v);
values.push(0.0);
}
(vectors, values)
}
#[cfg(test)]
mod tests {
use super::*;
use sphereql_core::angular_distance;
use std::f64::consts::{PI, TAU};
fn emb(vals: &[f64]) -> Embedding {
Embedding::new(vals.to_vec())
}
fn corpus_10d() -> Vec<Embedding> {
vec![
emb(&[1.0, 0.0, 0.0, 0.1, 0.05, -0.02, 0.03, -0.01, 0.04, 0.02]),
emb(&[0.0, 1.0, 0.0, -0.05, 0.1, 0.03, -0.02, 0.01, -0.03, 0.04]),
emb(&[0.0, 0.0, 1.0, 0.02, -0.03, 0.1, 0.05, 0.02, -0.01, -0.04]),
emb(&[1.0, 1.0, 0.0, 0.05, 0.08, 0.01, 0.01, -0.02, 0.02, 0.03]),
emb(&[0.0, 1.0, 1.0, -0.02, 0.07, 0.07, 0.01, 0.02, -0.02, 0.01]),
emb(&[1.0, 0.0, 1.0, 0.06, 0.01, 0.05, -0.03, -0.01, 0.03, -0.02]),
emb(&[-1.0, 0.0, 0.0, -0.08, 0.02, 0.01, 0.02, 0.03, -0.02, 0.01]),
emb(&[0.0, -1.0, 0.0, 0.03, -0.09, -0.02, 0.01, -0.01, 0.02, -0.03]),
]
}
fn assert_valid_spherical(sp: &SphericalPoint) {
assert!(sp.r >= 0.0, "r must be >= 0, got {}", sp.r);
assert!(
sp.theta >= 0.0 && sp.theta < TAU,
"theta must be in [0, 2π), got {}",
sp.theta
);
assert!(
sp.phi >= 0.0 && sp.phi <= PI,
"phi must be in [0, π], got {}",
sp.phi
);
}
#[test]
fn kernel_pca_fit_auto_sigma() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0));
assert_eq!(kpca.dimensionality(), 10);
assert!(kpca.sigma() > 0.0);
assert_eq!(kpca.num_training_points(), 8);
}
#[test]
fn kernel_pca_fit_explicit_sigma() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit_with_sigma(&corpus, 0.5, RadialStrategy::Fixed(1.0));
assert!((kpca.sigma() - 0.5).abs() < 1e-12);
}
#[test]
fn kernel_pca_fit_default() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit_default(&corpus);
assert_eq!(kpca.dimensionality(), 10);
}
#[test]
#[should_panic(expected = "need at least one embedding")]
fn kernel_pca_empty_corpus_panics() {
KernelPcaProjection::fit(&[], RadialStrategy::Fixed(1.0));
}
#[test]
#[should_panic(expected = "embedding dimension must be >= 3")]
fn kernel_pca_too_few_dims_panics() {
KernelPcaProjection::fit(&[emb(&[1.0, 2.0])], RadialStrategy::Fixed(1.0));
}
#[test]
#[should_panic(expected = "σ must be positive")]
fn kernel_pca_zero_sigma_panics() {
let corpus = corpus_10d();
KernelPcaProjection::fit_with_sigma(&corpus, 0.0, RadialStrategy::Fixed(1.0));
}
#[test]
fn kernel_pca_produces_valid_spherical_points() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0));
for e in &corpus {
assert_valid_spherical(&kpca.project(e));
}
}
#[test]
fn kernel_pca_out_of_sample_produces_valid_points() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0));
let novel = emb(&[0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
assert_valid_spherical(&kpca.project(&novel));
}
#[test]
fn kernel_pca_preserves_angular_ordering() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0));
let a = emb(&[1.0, 0.1, 0.0, 0.05, 0.02, -0.01, 0.01, 0.0, 0.02, 0.01]);
let b = emb(&[0.9, 0.2, 0.1, 0.04, 0.03, 0.0, 0.02, -0.01, 0.01, 0.02]);
let c = emb(&[-1.0, -0.1, 0.0, -0.04, 0.01, 0.02, 0.01, 0.02, -0.01, 0.01]);
let pa = kpca.project(&a);
let pb = kpca.project(&b);
let pc = kpca.project(&c);
let d_ab = angular_distance(&pa, &pb);
let d_ac = angular_distance(&pa, &pc);
assert!(
d_ab < d_ac,
"similar items should be closer: d(a,b)={d_ab:.4} < d(a,c)={d_ac:.4}"
);
}
#[test]
fn kernel_pca_fixed_radial() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(2.5));
let sp = kpca.project(&corpus[0]);
assert!((sp.r - 2.5).abs() < 1e-12);
}
#[test]
fn kernel_pca_magnitude_radial() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Magnitude);
let short = emb(&[0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
let long = emb(&[10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
let ps = kpca.project(&short);
let pl = kpca.project(&long);
assert!(ps.r < pl.r, "longer vector should have larger radius");
assert!((ps.r - 0.1).abs() < 1e-10);
assert!((pl.r - 10.0).abs() < 1e-10);
}
#[test]
fn kernel_pca_volumetric() {
let corpus = corpus_10d();
let kpca =
KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).with_volumetric(true);
let sp = kpca.project(&corpus[0]);
assert!(sp.r >= 0.0);
}
#[test]
fn kernel_pca_project_rich_has_valid_certainty() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0));
for e in &corpus {
let rich = kpca.project_rich(e);
assert!(
rich.certainty >= 0.0 && rich.certainty <= 1.0,
"certainty must be in [0,1], got {}",
rich.certainty
);
assert!(rich.intensity > 0.0);
assert!(rich.projection_magnitude >= 0.0);
}
}
#[test]
fn kernel_pca_certainty_is_meaningful() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0));
let total_certainty: f64 = corpus.iter().map(|e| kpca.project_rich(e).certainty).sum();
let mean_certainty = total_certainty / corpus.len() as f64;
assert!(
mean_certainty > 0.01,
"mean certainty of training data should be non-trivial, got {mean_certainty}"
);
}
#[test]
fn kernel_pca_explained_variance_in_range() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0));
let ratio = kpca.explained_variance_ratio();
assert!(
ratio > 0.0 && ratio <= 1.0,
"explained variance ratio should be in (0, 1], got {ratio}"
);
}
#[test]
fn kernel_pca_eigenvalues_descending() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0));
let ev = kpca.eigenvalues();
assert!(ev[0] >= ev[1], "eigenvalues must descend: {ev:?}");
assert!(ev[1] >= ev[2], "eigenvalues must descend: {ev:?}");
assert!(ev[0] > 0.0, "first eigenvalue should be positive");
}
#[test]
fn gaussian_kernel_self_similarity() {
let x = vec![1.0, 0.0, 0.0, 0.0, 0.0];
let inv = 0.5;
assert!((gaussian_kernel(&x, &x, inv) - 1.0).abs() < 1e-12);
}
#[test]
fn gaussian_kernel_symmetry() {
let a = vec![1.0, 0.0, 0.5];
let b = vec![0.0, 1.0, -0.3];
let inv = 1.0;
assert!((gaussian_kernel(&a, &b, inv) - gaussian_kernel(&b, &a, inv)).abs() < 1e-15);
}
#[test]
fn gaussian_kernel_range() {
let a = vec![1.0, 0.0, 0.0];
let b = vec![0.0, 1.0, 0.0];
let inv = 0.5;
let k = gaussian_kernel(&a, &b, inv);
assert!(
k > 0.0 && k <= 1.0,
"Gaussian kernel must be in (0, 1], got {k}"
);
}
#[test]
fn auto_sigma_is_positive() {
let corpus = corpus_10d();
let normalized: Vec<Vec<f64>> = corpus.iter().map(|e| e.normalized()).collect();
let sigma = auto_sigma(&normalized);
assert!(sigma > 0.0);
}
#[test]
fn auto_sigma_single_point() {
let data = vec![vec![1.0, 0.0, 0.0]];
assert!((auto_sigma(&data) - 1.0).abs() < 1e-12);
}
#[test]
#[should_panic(expected = "expected dimension 10")]
fn kernel_pca_dimension_mismatch_panics() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0));
let _ = kpca.project(&emb(&[1.0, 2.0, 3.0]));
}
#[test]
fn kernel_pca_clone_produces_identical_results() {
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0));
let kpca2 = kpca.clone();
for e in &corpus {
let sp1 = kpca.project(e);
let sp2 = kpca2.project(e);
assert!((sp1.theta - sp2.theta).abs() < 1e-12);
assert!((sp1.phi - sp2.phi).abs() < 1e-12);
}
}
#[test]
fn kernel_pca_works_with_embedding_index() {
use crate::query::EmbeddingIndex;
let corpus = corpus_10d();
let kpca = KernelPcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0));
let mut idx = EmbeddingIndex::builder(kpca)
.theta_divisions(4)
.phi_divisions(3)
.build();
for (i, e) in corpus.iter().enumerate() {
idx.insert(format!("item-{i}"), e);
}
assert_eq!(idx.len(), corpus.len());
let query = emb(&[0.9, 0.1, 0.0, 0.05, 0.02, -0.01, 0.01, 0.0, 0.02, 0.01]);
let results = idx.search_nearest(&query, 3);
assert_eq!(results.len(), 3);
assert!(results[0].distance <= results[1].distance);
}
#[test]
fn large_sigma_approaches_pca_ordering() {
use crate::projection::PcaProjection;
let corpus = corpus_10d();
let pca = PcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0));
let kpca = KernelPcaProjection::fit_with_sigma(&corpus, 100.0, RadialStrategy::Fixed(1.0));
let query = emb(&[1.0, 0.1, 0.0, 0.05, 0.02, -0.01, 0.01, 0.0, 0.02, 0.01]);
let pca_pt = pca.project(&query);
let kpca_pt = kpca.project(&query);
let mut pca_dists: Vec<(usize, f64)> = corpus
.iter()
.enumerate()
.map(|(i, e)| (i, angular_distance(&pca_pt, &pca.project(e))))
.collect();
let mut kpca_dists: Vec<(usize, f64)> = corpus
.iter()
.enumerate()
.map(|(i, e)| (i, angular_distance(&kpca_pt, &kpca.project(e))))
.collect();
pca_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
kpca_dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
assert_eq!(
pca_dists[0].0, kpca_dists[0].0,
"nearest neighbour should match between PCA and kernel PCA with large σ"
);
}
}