sphereql_embed/config.rs
1//! Configuration surface for the SphereQL pipeline.
2//!
3//! Every tunable constant that governs projection, bridge detection,
4//! inner-sphere gating, domain-group routing, and spatial-quality
5//! Monte Carlo sample counts lives here. This is the first-class knob
6//! inventory that future auto-tuning and meta-learning passes optimize
7//! over.
8//!
9//! The [`PipelineConfig::default`] values reproduce the historical
10//! hardcoded constants; the pipeline accepts any overriding config.
11
12// ── Top-level ──────────────────────────────────────────────────────────
13
14/// All tunable parameters for a SphereQL pipeline build.
15///
16/// Every field is a sub-config grouped by area. [`Self::default`] returns
17/// the values the crate shipped with before the config surface existed.
18#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
19#[serde(default)]
20pub struct PipelineConfig {
21 /// Outer-sphere projection family.
22 pub projection_kind: ProjectionKind,
23 /// Inner-sphere gating thresholds.
24 pub inner_sphere: InnerSphereConfig,
25 /// Bridge detection and classification.
26 pub bridges: BridgeConfig,
27 /// Hierarchical domain-group routing.
28 pub routing: RoutingConfig,
29 /// Laplacian eigenmap hyperparameters (only consulted if that
30 /// projection is selected).
31 pub laplacian: LaplacianConfig,
32 /// Spatial quality Monte Carlo sample counts.
33 pub spatial: SpatialConfig,
34}
35
36// ── Projection kind ────────────────────────────────────────────────────
37
38/// Which projection family the pipeline uses for the outer sphere.
39///
40/// A first-class tunable axis:
41/// [`SearchSpace::projection_kinds`](crate::tuner::SearchSpace::projection_kinds)
42/// enumerates the families the auto-tuner sweeps, and
43/// [`CorpusFeatures`](crate::corpus_features::CorpusFeatures) →
44/// [`PipelineConfig`] meta-models can map corpus profiles onto the
45/// kind that works best.
46#[derive(
47 Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, Default,
48)]
49pub enum ProjectionKind {
50 /// Linear PCA — fast, variance-maximizing. Good default for dense,
51 /// low-noise embeddings.
52 #[default]
53 Pca,
54 /// Kernel PCA with a Gaussian (RBF) kernel. Captures nonlinear
55 /// manifold structure at O(n²) fit cost.
56 KernelPca,
57 /// Laplacian eigenmap over a Jaccard-similarity graph of active
58 /// axes. Connectivity-preserving; preferred when signal lives in
59 /// the co-activation structure of a sparse embedding rather than in
60 /// coordinate variance (the typical failure mode of PCA on 128-dim
61 /// noise-heavy corpora).
62 LaplacianEigenmap,
63}
64
65impl ProjectionKind {
66 /// Short stable name for logs and tuner reports.
67 pub fn name(self) -> &'static str {
68 match self {
69 Self::Pca => "pca",
70 Self::KernelPca => "kernel_pca",
71 Self::LaplacianEigenmap => "laplacian_eigenmap",
72 }
73 }
74
75 /// All supported kinds, in a stable order.
76 pub fn all() -> &'static [ProjectionKind] {
77 &[
78 ProjectionKind::Pca,
79 ProjectionKind::KernelPca,
80 ProjectionKind::LaplacianEigenmap,
81 ]
82 }
83}
84
85// ── Inner-sphere ───────────────────────────────────────────────────────
86
87/// Thresholds governing when a category gets its own inner projection.
88#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
89#[serde(default)]
90pub struct InnerSphereConfig {
91 /// Minimum member count for a category to be considered.
92 pub min_size: usize,
93 /// Minimum EVR improvement (inner − global_subset) to justify building
94 /// an inner sphere at all.
95 pub min_evr_improvement: f64,
96 /// Minimum member count at which kernel PCA is attempted.
97 pub kernel_pca_min_size: usize,
98 /// Minimum EVR improvement of kernel PCA over linear PCA to prefer it.
99 pub min_kernel_improvement: f64,
100}
101
102impl Default for InnerSphereConfig {
103 fn default() -> Self {
104 Self {
105 min_size: 20,
106 min_evr_improvement: 0.10,
107 kernel_pca_min_size: 80,
108 min_kernel_improvement: 0.05,
109 }
110 }
111}
112
113// ── Bridges ────────────────────────────────────────────────────────────
114
115/// Parameters controlling bridge detection and classification.
116#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
117#[serde(default)]
118pub struct BridgeConfig {
119 /// Constant term in the EVR-adaptive bridge threshold
120 /// `threshold = threshold_base + (1 − evr)² · threshold_evr_penalty`.
121 pub threshold_base: f64,
122 /// EVR-penalty coefficient in the bridge threshold formula.
123 pub threshold_evr_penalty: f64,
124 /// Territorial factor below which a bridge is classified as an
125 /// `OverlapArtifact` rather than `Genuine` or `Weak`.
126 pub overlap_artifact_territorial: f64,
127}
128
129impl Default for BridgeConfig {
130 fn default() -> Self {
131 Self {
132 threshold_base: 0.5,
133 threshold_evr_penalty: 0.4,
134 overlap_artifact_territorial: 0.3,
135 }
136 }
137}
138
139impl BridgeConfig {
140 /// EVR-adaptive bridge threshold.
141 ///
142 /// Higher EVR → looser threshold (projection is more trustworthy).
143 /// At EVR=0.19: 0.5 + 0.81² × 0.4 = 0.76 (strict).
144 /// At EVR=0.90: 0.5 + 0.01 × 0.4 = 0.50 (essentially unchanged).
145 pub fn evr_adaptive_threshold(&self, evr: f64) -> f64 {
146 self.threshold_base + (1.0 - evr).powi(2) * self.threshold_evr_penalty
147 }
148}
149
150// ── Hierarchical routing ───────────────────────────────────────────────
151
152/// Parameters for hierarchical domain-group routing.
153#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
154#[serde(default)]
155pub struct RoutingConfig {
156 /// Number of domain groups detected at build time by
157 /// [`detect_domain_groups`](crate::domain_groups::detect_domain_groups).
158 pub num_domain_groups: usize,
159 /// EVR below which `hierarchical_nearest` routes through domain
160 /// groups and inner spheres instead of the outer sphere.
161 pub low_evr_threshold: f64,
162}
163
164impl Default for RoutingConfig {
165 fn default() -> Self {
166 Self {
167 num_domain_groups: 5,
168 low_evr_threshold: 0.35,
169 }
170 }
171}
172
173// ── Laplacian eigenmap ─────────────────────────────────────────────────
174
175/// Graph-construction parameters for [`LaplacianEigenmapProjection`](crate::laplacian::LaplacianEigenmapProjection).
176#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
177#[serde(default)]
178pub struct LaplacianConfig {
179 /// k in the k-NN graph sparsification step.
180 pub k_neighbors: usize,
181 /// Absolute-weight cutoff below which an axis is treated as noise.
182 pub active_threshold: f64,
183}
184
185impl Default for LaplacianConfig {
186 fn default() -> Self {
187 Self {
188 k_neighbors: 15,
189 active_threshold: 0.05,
190 }
191 }
192}
193
194// ── Spatial quality ────────────────────────────────────────────────────
195
196/// Monte Carlo sample counts for [`SpatialQuality::compute`](crate::spatial_quality::SpatialQuality::compute).
197///
198/// These run once at build time. Higher = more precise but slower.
199#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
200#[serde(default)]
201pub struct SpatialConfig {
202 /// Samples used to estimate what fraction of S² is covered by any
203 /// category's cap. Higher = tighter coverage estimate. Default
204 /// `100_000` → ~50ms at 31 categories.
205 pub coverage_samples: usize,
206 /// Samples used per category to estimate its cap exclusivity (the
207 /// fraction of its cap not overlapped by any other category).
208 /// Runs `n_categories` times so cost scales linearly with C.
209 /// Default `30_000` per category.
210 pub exclusivity_samples: usize,
211 /// Samples used to estimate the spherical Voronoi tessellation over
212 /// category centroids. Higher = tighter per-cell area estimates.
213 /// Default `100_000` → ~100ms at 31 centroids.
214 pub voronoi_samples: usize,
215}
216
217impl Default for SpatialConfig {
218 fn default() -> Self {
219 Self {
220 coverage_samples: 100_000,
221 exclusivity_samples: 30_000,
222 voronoi_samples: 100_000,
223 }
224 }
225}
226
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that two floats agree to within `1e-12`, reporting both
    /// values on failure.
    fn assert_close(actual: f64, expected: f64) {
        assert!(
            (actual - expected).abs() < 1e-12,
            "expected {expected}, got {actual}"
        );
    }

    /// The default config must reproduce every legacy hardcoded constant.
    #[test]
    fn defaults_match_legacy_constants() {
        let c = PipelineConfig::default();
        assert_eq!(c.projection_kind, ProjectionKind::Pca);

        assert_eq!(c.inner_sphere.min_size, 20);
        assert_eq!(c.inner_sphere.kernel_pca_min_size, 80);
        assert_close(c.inner_sphere.min_evr_improvement, 0.10);
        assert_close(c.inner_sphere.min_kernel_improvement, 0.05);

        assert_close(c.bridges.threshold_base, 0.5);
        assert_close(c.bridges.threshold_evr_penalty, 0.4);
        assert_close(c.bridges.overlap_artifact_territorial, 0.3);

        assert_eq!(c.routing.num_domain_groups, 5);
        assert_close(c.routing.low_evr_threshold, 0.35);

        assert_eq!(c.laplacian.k_neighbors, 15);
        assert_close(c.laplacian.active_threshold, 0.05);

        assert_eq!(c.spatial.coverage_samples, 100_000);
        assert_eq!(c.spatial.exclusivity_samples, 30_000);
        assert_eq!(c.spatial.voronoi_samples, 100_000);
    }

    /// The threshold must strictly decrease as EVR rises and approach
    /// the base term at high EVR.
    #[test]
    fn evr_adaptive_threshold_monotone_in_evr() {
        let b = BridgeConfig::default();
        let low = b.evr_adaptive_threshold(0.15);
        let mid = b.evr_adaptive_threshold(0.50);
        let high = b.evr_adaptive_threshold(0.90);
        // Higher EVR → smaller threshold
        assert!(low > mid);
        assert!(mid > high);
        assert!((high - 0.5).abs() < 0.05);
    }

    /// Cloning a config preserves its contents.
    #[test]
    fn config_is_clone() {
        let a = PipelineConfig::default();
        let b = a.clone();
        assert_eq!(a.inner_sphere.min_size, b.inner_sphere.min_size);
    }

    /// Names and the enumeration order of `all()` are part of the
    /// stable surface, so pin the exact sequence rather than only its
    /// length; a reordering or a duplicated entry must fail here.
    #[test]
    fn projection_kind_name_and_all_stable() {
        assert_eq!(ProjectionKind::Pca.name(), "pca");
        assert_eq!(ProjectionKind::KernelPca.name(), "kernel_pca");
        assert_eq!(
            ProjectionKind::LaplacianEigenmap.name(),
            "laplacian_eigenmap"
        );

        let expected = [
            ProjectionKind::Pca,
            ProjectionKind::KernelPca,
            ProjectionKind::LaplacianEigenmap,
        ];
        assert_eq!(ProjectionKind::all().len(), 3);
        assert_eq!(ProjectionKind::all(), &expected[..]);

        let names: Vec<&str> = ProjectionKind::all().iter().map(|k| k.name()).collect();
        assert_eq!(names, ["pca", "kernel_pca", "laplacian_eigenmap"]);
    }
}