sphereql_embed/config.rs
1//! Configuration surface for the SphereQL pipeline.
2//!
3//! Every tunable constant that governs projection, bridge detection,
4//! inner-sphere gating, domain-group routing, and spatial-quality
5//! Monte Carlo sample counts lives here. This is the first-class knob
6//! inventory that future auto-tuning and meta-learning passes optimize
7//! over.
8//!
9//! The [`PipelineConfig::default`] values reproduce the historical
10//! hardcoded constants; the pipeline accepts any overriding config.
11
12// ── Top-level ──────────────────────────────────────────────────────────
13
14/// All tunable parameters for a SphereQL pipeline build.
15///
16/// Every field is a sub-config grouped by area. [`Self::default`] returns
17/// the values the crate shipped with before the config surface existed.
18#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
19#[serde(default)]
20pub struct PipelineConfig {
21 /// Outer-sphere projection family.
22 pub projection_kind: ProjectionKind,
23 /// Inner-sphere gating thresholds.
24 pub inner_sphere: InnerSphereConfig,
25 /// Bridge detection and classification.
26 pub bridges: BridgeConfig,
27 /// Hierarchical domain-group routing.
28 pub routing: RoutingConfig,
29 /// Laplacian eigenmap hyperparameters (only consulted if that
30 /// projection is selected).
31 pub laplacian: LaplacianConfig,
32 /// UMAP-on-sphere hyperparameters (only consulted if that
33 /// projection is selected).
34 pub umap: UmapConfig,
35 /// Spatial quality Monte Carlo sample counts.
36 pub spatial: SpatialConfig,
37 /// Minimum number of items a category must have to participate in
38 /// category-level analysis (bridges, domain groups, spatial quality,
39 /// Voronoi tessellation). Categories below this threshold are excluded
40 /// from the enrichment layer but their items remain projected, indexed,
41 /// and queryable on the sphere.
42 ///
43 /// Default 1 (no filtering — every category participates).
44 /// Set to 5–10 for corpora with many singleton categories.
45 #[serde(default = "default_min_category_size")]
46 pub min_category_size: usize,
47}
48
/// Fallback for [`PipelineConfig::min_category_size`], used both by serde
/// and by [`PipelineConfig::default`]: one item, i.e. no category is
/// filtered out.
fn default_min_category_size() -> usize {
    const NO_FILTERING: usize = 1;
    NO_FILTERING
}
52
53impl Default for PipelineConfig {
54 fn default() -> Self {
55 Self {
56 projection_kind: ProjectionKind::default(),
57 inner_sphere: InnerSphereConfig::default(),
58 bridges: BridgeConfig::default(),
59 routing: RoutingConfig::default(),
60 laplacian: LaplacianConfig::default(),
61 umap: UmapConfig::default(),
62 spatial: SpatialConfig::default(),
63 min_category_size: default_min_category_size(),
64 }
65 }
66}
67
68// ── Projection kind ────────────────────────────────────────────────────
69
70/// Which projection family the pipeline uses for the outer sphere.
71///
72/// A first-class tunable axis:
73/// [`SearchSpace::projection_kinds`](crate::tuner::SearchSpace::projection_kinds)
74/// enumerates the families the auto-tuner sweeps, and
75/// [`CorpusFeatures`](crate::corpus_features::CorpusFeatures) →
76/// [`PipelineConfig`] meta-models can map corpus profiles onto the
77/// kind that works best.
78#[derive(
79 Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, Default,
80)]
81pub enum ProjectionKind {
82 /// Linear PCA — fast, variance-maximizing. Good default for dense,
83 /// low-noise embeddings.
84 #[default]
85 Pca,
86 /// Kernel PCA with a Gaussian (RBF) kernel. Captures nonlinear
87 /// manifold structure at O(n²) fit cost.
88 KernelPca,
89 /// Laplacian eigenmap over a Jaccard-similarity graph of active
90 /// axes. Connectivity-preserving; preferred when signal lives in
91 /// the co-activation structure of a sparse embedding rather than in
92 /// coordinate variance (the typical failure mode of PCA on 128-dim
93 /// noise-heavy corpora).
94 LaplacianEigenmap,
95 /// UMAP-on-sphere via Adam in the tangent bundle of S². PCA warm
96 /// start, kNN attractive + uniform-negative repulsive, optional
97 /// supervised category term. Preferred when angular ordering on the
98 /// sphere matters more than raw variance preservation, and when a
99 /// modest fit cost (O(n²·epochs) for the kNN graph + iterations) is
100 /// acceptable.
101 UmapSphere,
102}
103
104impl ProjectionKind {
105 /// Short stable name for logs and tuner reports.
106 pub fn name(self) -> &'static str {
107 match self {
108 Self::Pca => "pca",
109 Self::KernelPca => "kernel_pca",
110 Self::LaplacianEigenmap => "laplacian_eigenmap",
111 Self::UmapSphere => "umap_sphere",
112 }
113 }
114
115 /// All supported kinds, in a stable order.
116 pub fn all() -> &'static [ProjectionKind] {
117 &[
118 ProjectionKind::Pca,
119 ProjectionKind::KernelPca,
120 ProjectionKind::LaplacianEigenmap,
121 ProjectionKind::UmapSphere,
122 ]
123 }
124}
125
126// ── Inner-sphere ───────────────────────────────────────────────────────
127
128/// Thresholds governing when a category gets its own inner projection.
129#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
130#[serde(default)]
131pub struct InnerSphereConfig {
132 /// Minimum member count for a category to be considered.
133 pub min_size: usize,
134 /// Minimum EVR improvement (inner − global_subset) to justify building
135 /// an inner sphere at all.
136 pub min_evr_improvement: f64,
137 /// Minimum member count at which kernel PCA is attempted.
138 pub kernel_pca_min_size: usize,
139 /// Minimum EVR improvement of kernel PCA over linear PCA to prefer it.
140 pub min_kernel_improvement: f64,
141}
142
143impl Default for InnerSphereConfig {
144 fn default() -> Self {
145 Self {
146 min_size: 20,
147 min_evr_improvement: 0.10,
148 kernel_pca_min_size: 80,
149 min_kernel_improvement: 0.05,
150 }
151 }
152}
153
154// ── Bridges ────────────────────────────────────────────────────────────
155
156/// Parameters controlling bridge detection and classification.
157#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
158#[serde(default)]
159pub struct BridgeConfig {
160 /// Constant term in the EVR-adaptive bridge threshold
161 /// `threshold = threshold_base + (1 − evr)² · threshold_evr_penalty`.
162 pub threshold_base: f64,
163 /// EVR-penalty coefficient in the bridge threshold formula.
164 pub threshold_evr_penalty: f64,
165 /// Percentile of the observed territorial factor distribution below
166 /// which a bridge is classified as `OverlapArtifact` rather than
167 /// `Genuine` or `Weak`. 0.3 = the bottom 30 % of bridge pairs by
168 /// territorial separation are labeled artifacts. Expressed as a
169 /// percentile so that dense corpora (where all exclusivities collapse
170 /// toward zero) do not classify every bridge as an artifact.
171 pub overlap_artifact_territorial: f64,
172 /// Quantile of the home-affinity distribution that sets the
173 /// genuine-bridge floor. For each member item, "home affinity" is
174 /// the cosine similarity between the item's embedding and its own
175 /// category's centroid. A bridge is classified `Genuine` when
176 /// `min(affinity_to_source, affinity_to_target)` exceeds the
177 /// quantile-q of those home affinities; otherwise `Weak`.
178 ///
179 /// Why a quantile and not an absolute cosine: home affinity scale
180 /// varies with the projection layout. After stratified PCA spreads
181 /// imbalanced corpora, home affinities can drop into the 0.3–0.6
182 /// band where a fixed 0.5 cosine floor labels almost every cross-
183 /// domain item `Weak`. A quantile-based floor adapts to the
184 /// corpus's own affinity scale: tight corpora get a strict floor,
185 /// spread ones get a permissive one, without per-corpus tuning.
186 ///
187 /// Smaller q = stricter (only bridges matching the strongest
188 /// home affinities qualify). Larger q = more permissive. Default
189 /// 0.25: a bridge is `Genuine` if it has at least as much
190 /// affinity to both sides as the bottom-25% of items have to
191 /// their own home category.
192 pub balanced_affinity_quantile: f64,
193 /// EVR below which bridge classification is unreliable. When the
194 /// outer projection's EVR is below this threshold, all bridges
195 /// are labeled `Weak` (honest uncertainty) rather than attempting
196 /// territorial-factor-based classification — which collapses to
197 /// 100% `OverlapArtifact` when caps overlap everywhere on a
198 /// low-EVR projection, flattening the tuner landscape. Default
199 /// 0.20.
200 pub min_evr_for_classification: f64,
201}
202
203impl Default for BridgeConfig {
204 fn default() -> Self {
205 Self {
206 threshold_base: 0.5,
207 threshold_evr_penalty: 0.4,
208 overlap_artifact_territorial: 0.3,
209 balanced_affinity_quantile: 0.25,
210 min_evr_for_classification: 0.20,
211 }
212 }
213}
214
215impl BridgeConfig {
216 /// EVR-adaptive bridge threshold.
217 ///
218 /// Higher EVR → looser threshold (projection is more trustworthy).
219 /// At EVR=0.19: 0.5 + 0.81² × 0.4 = 0.76 (strict).
220 /// At EVR=0.90: 0.5 + 0.01 × 0.4 = 0.50 (essentially unchanged).
221 pub fn evr_adaptive_threshold(&self, evr: f64) -> f64 {
222 self.threshold_base + (1.0 - evr).powi(2) * self.threshold_evr_penalty
223 }
224}
225
226// ── Hierarchical routing ───────────────────────────────────────────────
227
228/// Parameters for hierarchical domain-group routing.
229#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
230#[serde(default)]
231pub struct RoutingConfig {
232 /// Number of domain groups detected at build time by
233 /// [`detect_domain_groups`](crate::domain_groups::detect_domain_groups).
234 pub num_domain_groups: usize,
235 /// Distance-ratio gate for the default `nearest()` path. A query
236 /// drills into the nearest group's inner sphere when
237 /// `d_to_nearest / d_to_second_nearest < group_routing_alpha`. A
238 /// smaller α is stricter (only routes when one group is clearly
239 /// closer). Default `0.8` matches the routing interview decision;
240 /// set to `0.0` to disable the default-route behavior entirely
241 /// (falls back to outer-sphere k-NN).
242 pub group_routing_alpha: f64,
243 /// EVR below which `hierarchical_nearest` historically routed
244 /// through domain groups instead of the outer sphere.
245 ///
246 /// Retained for backward-compatibility and debugging — the default
247 /// `nearest()` path now uses [`Self::group_routing_alpha`] instead.
248 /// `hierarchical_nearest()` still consults this for its EVR-gated
249 /// branch.
250 pub low_evr_threshold: f64,
251}
252
253impl Default for RoutingConfig {
254 fn default() -> Self {
255 Self {
256 num_domain_groups: 5,
257 group_routing_alpha: 0.8,
258 low_evr_threshold: 0.35,
259 }
260 }
261}
262
263// ── Laplacian eigenmap ─────────────────────────────────────────────────
264
265/// Graph-construction parameters for [`LaplacianEigenmapProjection`](crate::laplacian::LaplacianEigenmapProjection).
266#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
267#[serde(default)]
268pub struct LaplacianConfig {
269 /// k in the k-NN graph sparsification step.
270 pub k_neighbors: usize,
271 /// Absolute-weight cutoff below which an axis is treated as noise.
272 pub active_threshold: f64,
273}
274
275impl Default for LaplacianConfig {
276 fn default() -> Self {
277 Self {
278 k_neighbors: 15,
279 active_threshold: 0.05,
280 }
281 }
282}
283
284// ── UMAP-on-sphere ─────────────────────────────────────────────────────
285
286/// Hyperparameters for [`UmapSphereProjection`](crate::umap::UmapSphereProjection).
287///
288/// These are the tunable knobs exposed to the auto-tuner. Non-tunable
289/// constants (`learning_rate`, `negative_sample_rate`) stay at their
290/// canonical UMAP defaults inside [`fit_projection_for_config`].
291#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
292#[serde(default)]
293pub struct UmapConfig {
294 /// k in the kNN graph (attractive term). Higher = more global structure.
295 pub n_neighbors: usize,
296 /// Adam optimization epochs. ~200 for corpora < 10k, ~400 for 50k+.
297 pub n_epochs: usize,
298 /// Weight on the category supervision term. 0.0 = unsupervised UMAP.
299 /// Positive values pull same-category items together and push
300 /// different-category items apart. 1.0–3.0 is typical.
301 pub category_weight: f64,
302 /// How tightly neighbors may pack on the sphere. Larger values
303 /// flatten the embedding kernel near zero, so clusters claim more
304 /// territory — exactly what the territorial/cap-overlap metrics
305 /// measure. 0.0 = near-maximal clumping; 0.1 is the canonical UMAP
306 /// default.
307 pub min_dist: f64,
308 /// Weight on an attractive pull toward each point's PCA warm-start
309 /// position; 0.0 disables. Use small values (~0.01–0.1) on sparse
310 /// corpora whose kNN graphs fragment into disconnected components,
311 /// to keep the components' global arrangement from drifting under
312 /// unopposed repulsion. Intentionally not a tuner axis — it is a
313 /// data-pathology escape hatch, not a search dimension.
314 pub warm_start_anchor: f64,
315 /// PRNG seed for negative sampling and tie-breaking.
316 pub seed: u64,
317}
318
319impl Default for UmapConfig {
320 fn default() -> Self {
321 Self {
322 n_neighbors: 15,
323 n_epochs: 200,
324 category_weight: 1.5,
325 min_dist: 0.1,
326 warm_start_anchor: 0.0,
327 seed: 0xA1B2_C3D4,
328 }
329 }
330}
331
332// ── Spatial quality ────────────────────────────────────────────────────
333
334/// Monte Carlo sample counts for [`SpatialQuality::compute`](crate::spatial_quality::SpatialQuality::compute).
335///
336/// These run once at build time. Higher = more precise but slower.
337///
338/// This config only governs the build-time `SpatialQuality::compute`
339/// pass. The navigator's `run_full_analysis` uses its own
340/// `NavigatorConfig` sample counts (with different defaults) and is
341/// unaffected by these values.
342#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
343#[serde(default)]
344pub struct SpatialConfig {
345 /// Samples used to estimate what fraction of S² is covered by any
346 /// category's cap. Higher = tighter coverage estimate. Default
347 /// `100_000` → ~50ms at 31 categories.
348 pub coverage_samples: usize,
349 /// Samples used per category to estimate its cap exclusivity (the
350 /// fraction of its cap not overlapped by any other category).
351 /// Runs `n_categories` times so cost scales linearly with C.
352 /// Default `30_000` per category.
353 pub exclusivity_samples: usize,
354 /// Samples used to estimate the spherical Voronoi tessellation over
355 /// category centroids. Higher = tighter per-cell area estimates.
356 /// Default `100_000` → ~100ms at 31 centroids.
357 pub voronoi_samples: usize,
358}
359
360impl Default for SpatialConfig {
361 fn default() -> Self {
362 Self {
363 coverage_samples: 100_000,
364 exclusivity_samples: 30_000,
365 voronoi_samples: 100_000,
366 }
367 }
368}
369
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins every default to the constant that was hardcoded before the
    /// config surface existed, so an accidental change is caught here.
    #[test]
    fn defaults_match_legacy_constants() {
        let c = PipelineConfig::default();
        assert_eq!(c.projection_kind, ProjectionKind::Pca);
        assert_eq!(c.inner_sphere.min_size, 20);
        assert_eq!(c.inner_sphere.kernel_pca_min_size, 80);
        assert!((c.inner_sphere.min_evr_improvement - 0.10).abs() < 1e-12);
        assert!((c.inner_sphere.min_kernel_improvement - 0.05).abs() < 1e-12);
        assert!((c.bridges.threshold_base - 0.5).abs() < 1e-12);
        assert!((c.bridges.threshold_evr_penalty - 0.4).abs() < 1e-12);
        assert!((c.bridges.overlap_artifact_territorial - 0.3).abs() < 1e-12);
        assert!((c.bridges.balanced_affinity_quantile - 0.25).abs() < 1e-12);
        assert!((c.bridges.min_evr_for_classification - 0.20).abs() < 1e-12);
        assert_eq!(c.routing.num_domain_groups, 5);
        assert!((c.routing.group_routing_alpha - 0.8).abs() < 1e-12);
        assert!((c.routing.low_evr_threshold - 0.35).abs() < 1e-12);
        assert_eq!(c.laplacian.k_neighbors, 15);
        assert!((c.laplacian.active_threshold - 0.05).abs() < 1e-12);
        assert_eq!(c.umap.n_neighbors, 15);
        assert_eq!(c.umap.n_epochs, 200);
        assert!((c.umap.category_weight - 1.5).abs() < 1e-12);
        assert!((c.umap.min_dist - 0.1).abs() < 1e-12);
        assert_eq!(c.umap.warm_start_anchor, 0.0);
        assert_eq!(c.umap.seed, 0xA1B2_C3D4);
        assert_eq!(c.spatial.coverage_samples, 100_000);
        assert_eq!(c.spatial.exclusivity_samples, 30_000);
        assert_eq!(c.spatial.voronoi_samples, 100_000);
        assert_eq!(c.min_category_size, 1);
    }

    /// The threshold must shrink as EVR grows and approach
    /// `threshold_base` at high EVR.
    #[test]
    fn evr_adaptive_threshold_monotone_in_evr() {
        let b = BridgeConfig::default();
        let low = b.evr_adaptive_threshold(0.15);
        let mid = b.evr_adaptive_threshold(0.50);
        let high = b.evr_adaptive_threshold(0.90);
        // Higher EVR → smaller threshold
        assert!(low > mid);
        assert!(mid > high);
        assert!((high - 0.5).abs() < 0.05);
    }

    #[test]
    fn config_is_clone() {
        let a = PipelineConfig::default();
        let b = a.clone();
        assert_eq!(a.inner_sphere.min_size, b.inner_sphere.min_size);
    }

    /// Names and the order of `all()` are reported in logs and tuner
    /// output, so both are pinned — the order too, not only the length.
    #[test]
    fn projection_kind_name_and_all_stable() {
        assert_eq!(ProjectionKind::Pca.name(), "pca");
        assert_eq!(ProjectionKind::KernelPca.name(), "kernel_pca");
        assert_eq!(
            ProjectionKind::LaplacianEigenmap.name(),
            "laplacian_eigenmap"
        );
        assert_eq!(ProjectionKind::UmapSphere.name(), "umap_sphere");
        assert_eq!(ProjectionKind::all().len(), 4);
        assert_eq!(
            ProjectionKind::all(),
            &[
                ProjectionKind::Pca,
                ProjectionKind::KernelPca,
                ProjectionKind::LaplacianEigenmap,
                ProjectionKind::UmapSphere,
            ][..]
        );
    }
}