Skip to main content

sphereql_embed/
config.rs

//! Configuration surface for the SphereQL pipeline.
//!
//! Every tunable constant that governs projection, bridge detection,
//! inner-sphere gating, domain-group routing, and spatial-quality
//! Monte Carlo sample counts lives here. This is the first-class knob
//! inventory that future auto-tuning and meta-learning passes optimize
//! over.
//!
//! The [`PipelineConfig::default`] values reproduce the historical
//! hardcoded constants; the pipeline accepts any overriding config.
11
// ── Top-level ──────────────────────────────────────────────────────────
13
14/// All tunable parameters for a SphereQL pipeline build.
15///
16/// Every field is a sub-config grouped by area. [`Self::default`] returns
17/// the values the crate shipped with before the config surface existed.
18#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
19#[serde(default)]
20pub struct PipelineConfig {
21    /// Outer-sphere projection family.
22    pub projection_kind: ProjectionKind,
23    /// Inner-sphere gating thresholds.
24    pub inner_sphere: InnerSphereConfig,
25    /// Bridge detection and classification.
26    pub bridges: BridgeConfig,
27    /// Hierarchical domain-group routing.
28    pub routing: RoutingConfig,
29    /// Laplacian eigenmap hyperparameters (only consulted if that
30    /// projection is selected).
31    pub laplacian: LaplacianConfig,
32    /// UMAP-on-sphere hyperparameters (only consulted if that
33    /// projection is selected).
34    pub umap: UmapConfig,
35    /// Spatial quality Monte Carlo sample counts.
36    pub spatial: SpatialConfig,
37    /// Minimum number of items a category must have to participate in
38    /// category-level analysis (bridges, domain groups, spatial quality,
39    /// Voronoi tessellation). Categories below this threshold are excluded
40    /// from the enrichment layer but their items remain projected, indexed,
41    /// and queryable on the sphere.
42    ///
43    /// Default 1 (no filtering — every category participates).
44    /// Set to 5–10 for corpora with many singleton categories.
45    #[serde(default = "default_min_category_size")]
46    pub min_category_size: usize,
47}
48
/// Fallback for [`PipelineConfig::min_category_size`].
///
/// Returns `1`, i.e. no category is filtered out for being too small.
/// Referenced both by the field's `#[serde(default = "...")]` attribute
/// and by `PipelineConfig`'s `Default` impl, so the two cannot drift.
const fn default_min_category_size() -> usize {
    1
}
52
53impl Default for PipelineConfig {
54    fn default() -> Self {
55        Self {
56            projection_kind: ProjectionKind::default(),
57            inner_sphere: InnerSphereConfig::default(),
58            bridges: BridgeConfig::default(),
59            routing: RoutingConfig::default(),
60            laplacian: LaplacianConfig::default(),
61            umap: UmapConfig::default(),
62            spatial: SpatialConfig::default(),
63            min_category_size: default_min_category_size(),
64        }
65    }
66}
67
// ── Projection kind ────────────────────────────────────────────────────
69
70/// Which projection family the pipeline uses for the outer sphere.
71///
72/// A first-class tunable axis:
73/// [`SearchSpace::projection_kinds`](crate::tuner::SearchSpace::projection_kinds)
74/// enumerates the families the auto-tuner sweeps, and
75/// [`CorpusFeatures`](crate::corpus_features::CorpusFeatures) →
76/// [`PipelineConfig`] meta-models can map corpus profiles onto the
77/// kind that works best.
78#[derive(
79    Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, Default,
80)]
81pub enum ProjectionKind {
82    /// Linear PCA — fast, variance-maximizing. Good default for dense,
83    /// low-noise embeddings.
84    #[default]
85    Pca,
86    /// Kernel PCA with a Gaussian (RBF) kernel. Captures nonlinear
87    /// manifold structure at O(n²) fit cost.
88    KernelPca,
89    /// Laplacian eigenmap over a Jaccard-similarity graph of active
90    /// axes. Connectivity-preserving; preferred when signal lives in
91    /// the co-activation structure of a sparse embedding rather than in
92    /// coordinate variance (the typical failure mode of PCA on 128-dim
93    /// noise-heavy corpora).
94    LaplacianEigenmap,
95    /// UMAP-on-sphere via Adam in the tangent bundle of S². PCA warm
96    /// start, kNN attractive + uniform-negative repulsive, optional
97    /// supervised category term. Preferred when angular ordering on the
98    /// sphere matters more than raw variance preservation, and when a
99    /// modest fit cost (O(n²·epochs) for the kNN graph + iterations) is
100    /// acceptable.
101    UmapSphere,
102}
103
104impl ProjectionKind {
105    /// Short stable name for logs and tuner reports.
106    pub fn name(self) -> &'static str {
107        match self {
108            Self::Pca => "pca",
109            Self::KernelPca => "kernel_pca",
110            Self::LaplacianEigenmap => "laplacian_eigenmap",
111            Self::UmapSphere => "umap_sphere",
112        }
113    }
114
115    /// All supported kinds, in a stable order.
116    pub fn all() -> &'static [ProjectionKind] {
117        &[
118            ProjectionKind::Pca,
119            ProjectionKind::KernelPca,
120            ProjectionKind::LaplacianEigenmap,
121            ProjectionKind::UmapSphere,
122        ]
123    }
124}
125
// ── Inner-sphere ───────────────────────────────────────────────────────
127
128/// Thresholds governing when a category gets its own inner projection.
129#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
130#[serde(default)]
131pub struct InnerSphereConfig {
132    /// Minimum member count for a category to be considered.
133    pub min_size: usize,
134    /// Minimum EVR improvement (inner − global_subset) to justify building
135    /// an inner sphere at all.
136    pub min_evr_improvement: f64,
137    /// Minimum member count at which kernel PCA is attempted.
138    pub kernel_pca_min_size: usize,
139    /// Minimum EVR improvement of kernel PCA over linear PCA to prefer it.
140    pub min_kernel_improvement: f64,
141}
142
143impl Default for InnerSphereConfig {
144    fn default() -> Self {
145        Self {
146            min_size: 20,
147            min_evr_improvement: 0.10,
148            kernel_pca_min_size: 80,
149            min_kernel_improvement: 0.05,
150        }
151    }
152}
153
// ── Bridges ────────────────────────────────────────────────────────────
155
156/// Parameters controlling bridge detection and classification.
157#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
158#[serde(default)]
159pub struct BridgeConfig {
160    /// Constant term in the EVR-adaptive bridge threshold
161    /// `threshold = threshold_base + (1 − evr)² · threshold_evr_penalty`.
162    pub threshold_base: f64,
163    /// EVR-penalty coefficient in the bridge threshold formula.
164    pub threshold_evr_penalty: f64,
165    /// Percentile of the observed territorial factor distribution below
166    /// which a bridge is classified as `OverlapArtifact` rather than
167    /// `Genuine` or `Weak`.  0.3 = the bottom 30 % of bridge pairs by
168    /// territorial separation are labeled artifacts.  Expressed as a
169    /// percentile so that dense corpora (where all exclusivities collapse
170    /// toward zero) do not classify every bridge as an artifact.
171    pub overlap_artifact_territorial: f64,
172    /// Quantile of the home-affinity distribution that sets the
173    /// genuine-bridge floor. For each member item, "home affinity" is
174    /// the cosine similarity between the item's embedding and its own
175    /// category's centroid. A bridge is classified `Genuine` when
176    /// `min(affinity_to_source, affinity_to_target)` exceeds the
177    /// quantile-q of those home affinities; otherwise `Weak`.
178    ///
179    /// Why a quantile and not an absolute cosine: home affinity scale
180    /// varies with the projection layout. After stratified PCA spreads
181    /// imbalanced corpora, home affinities can drop into the 0.3–0.6
182    /// band where a fixed 0.5 cosine floor labels almost every cross-
183    /// domain item `Weak`. A quantile-based floor adapts to the
184    /// corpus's own affinity scale: tight corpora get a strict floor,
185    /// spread ones get a permissive one, without per-corpus tuning.
186    ///
187    /// Smaller q = stricter (only bridges matching the strongest
188    /// home affinities qualify). Larger q = more permissive. Default
189    /// 0.25: a bridge is `Genuine` if it has at least as much
190    /// affinity to both sides as the bottom-25% of items have to
191    /// their own home category.
192    pub balanced_affinity_quantile: f64,
193    /// EVR below which bridge classification is unreliable. When the
194    /// outer projection's EVR is below this threshold, all bridges
195    /// are labeled `Weak` (honest uncertainty) rather than attempting
196    /// territorial-factor-based classification — which collapses to
197    /// 100% `OverlapArtifact` when caps overlap everywhere on a
198    /// low-EVR projection, flattening the tuner landscape. Default
199    /// 0.20.
200    pub min_evr_for_classification: f64,
201}
202
203impl Default for BridgeConfig {
204    fn default() -> Self {
205        Self {
206            threshold_base: 0.5,
207            threshold_evr_penalty: 0.4,
208            overlap_artifact_territorial: 0.3,
209            balanced_affinity_quantile: 0.25,
210            min_evr_for_classification: 0.20,
211        }
212    }
213}
214
215impl BridgeConfig {
216    /// EVR-adaptive bridge threshold.
217    ///
218    /// Higher EVR → looser threshold (projection is more trustworthy).
219    /// At EVR=0.19: 0.5 + 0.81² × 0.4 = 0.76 (strict).
220    /// At EVR=0.90: 0.5 + 0.01 × 0.4 = 0.50 (essentially unchanged).
221    pub fn evr_adaptive_threshold(&self, evr: f64) -> f64 {
222        self.threshold_base + (1.0 - evr).powi(2) * self.threshold_evr_penalty
223    }
224}
225
// ── Hierarchical routing ───────────────────────────────────────────────
227
228/// Parameters for hierarchical domain-group routing.
229#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
230#[serde(default)]
231pub struct RoutingConfig {
232    /// Number of domain groups detected at build time by
233    /// [`detect_domain_groups`](crate::domain_groups::detect_domain_groups).
234    pub num_domain_groups: usize,
235    /// Distance-ratio gate for the default `nearest()` path. A query
236    /// drills into the nearest group's inner sphere when
237    /// `d_to_nearest / d_to_second_nearest < group_routing_alpha`. A
238    /// smaller α is stricter (only routes when one group is clearly
239    /// closer). Default `0.8` matches the routing interview decision;
240    /// set to `0.0` to disable the default-route behavior entirely
241    /// (falls back to outer-sphere k-NN).
242    pub group_routing_alpha: f64,
243    /// EVR below which `hierarchical_nearest` historically routed
244    /// through domain groups instead of the outer sphere.
245    ///
246    /// Retained for backward-compatibility and debugging — the default
247    /// `nearest()` path now uses [`Self::group_routing_alpha`] instead.
248    /// `hierarchical_nearest()` still consults this for its EVR-gated
249    /// branch.
250    pub low_evr_threshold: f64,
251}
252
253impl Default for RoutingConfig {
254    fn default() -> Self {
255        Self {
256            num_domain_groups: 5,
257            group_routing_alpha: 0.8,
258            low_evr_threshold: 0.35,
259        }
260    }
261}
262
// ── Laplacian eigenmap ─────────────────────────────────────────────────
264
265/// Graph-construction parameters for [`LaplacianEigenmapProjection`](crate::laplacian::LaplacianEigenmapProjection).
266#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
267#[serde(default)]
268pub struct LaplacianConfig {
269    /// k in the k-NN graph sparsification step.
270    pub k_neighbors: usize,
271    /// Absolute-weight cutoff below which an axis is treated as noise.
272    pub active_threshold: f64,
273}
274
275impl Default for LaplacianConfig {
276    fn default() -> Self {
277        Self {
278            k_neighbors: 15,
279            active_threshold: 0.05,
280        }
281    }
282}
283
// ── UMAP-on-sphere ─────────────────────────────────────────────────────
285
286/// Hyperparameters for [`UmapSphereProjection`](crate::umap::UmapSphereProjection).
287///
288/// These are the tunable knobs exposed to the auto-tuner. Non-tunable
289/// constants (`learning_rate`, `negative_sample_rate`) stay at their
290/// canonical UMAP defaults inside [`fit_projection_for_config`].
291#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
292#[serde(default)]
293pub struct UmapConfig {
294    /// k in the kNN graph (attractive term). Higher = more global structure.
295    pub n_neighbors: usize,
296    /// Adam optimization epochs. ~200 for corpora < 10k, ~400 for 50k+.
297    pub n_epochs: usize,
298    /// Weight on the category supervision term. 0.0 = unsupervised UMAP.
299    /// Positive values pull same-category items together and push
300    /// different-category items apart. 1.0–3.0 is typical.
301    pub category_weight: f64,
302    /// How tightly neighbors may pack on the sphere. Larger values
303    /// flatten the embedding kernel near zero, so clusters claim more
304    /// territory — exactly what the territorial/cap-overlap metrics
305    /// measure. 0.0 = near-maximal clumping; 0.1 is the canonical UMAP
306    /// default.
307    pub min_dist: f64,
308    /// Weight on an attractive pull toward each point's PCA warm-start
309    /// position; 0.0 disables. Use small values (~0.01–0.1) on sparse
310    /// corpora whose kNN graphs fragment into disconnected components,
311    /// to keep the components' global arrangement from drifting under
312    /// unopposed repulsion. Intentionally not a tuner axis — it is a
313    /// data-pathology escape hatch, not a search dimension.
314    pub warm_start_anchor: f64,
315    /// PRNG seed for negative sampling and tie-breaking.
316    pub seed: u64,
317}
318
319impl Default for UmapConfig {
320    fn default() -> Self {
321        Self {
322            n_neighbors: 15,
323            n_epochs: 200,
324            category_weight: 1.5,
325            min_dist: 0.1,
326            warm_start_anchor: 0.0,
327            seed: 0xA1B2_C3D4,
328        }
329    }
330}
331
// ── Spatial quality ────────────────────────────────────────────────────
333
334/// Monte Carlo sample counts for [`SpatialQuality::compute`](crate::spatial_quality::SpatialQuality::compute).
335///
336/// These run once at build time. Higher = more precise but slower.
337///
338/// This config only governs the build-time `SpatialQuality::compute`
339/// pass. The navigator's `run_full_analysis` uses its own
340/// `NavigatorConfig` sample counts (with different defaults) and is
341/// unaffected by these values.
342#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
343#[serde(default)]
344pub struct SpatialConfig {
345    /// Samples used to estimate what fraction of S² is covered by any
346    /// category's cap. Higher = tighter coverage estimate. Default
347    /// `100_000` → ~50ms at 31 categories.
348    pub coverage_samples: usize,
349    /// Samples used per category to estimate its cap exclusivity (the
350    /// fraction of its cap not overlapped by any other category).
351    /// Runs `n_categories` times so cost scales linearly with C.
352    /// Default `30_000` per category.
353    pub exclusivity_samples: usize,
354    /// Samples used to estimate the spherical Voronoi tessellation over
355    /// category centroids. Higher = tighter per-cell area estimates.
356    /// Default `100_000` → ~100ms at 31 centroids.
357    pub voronoi_samples: usize,
358}
359
360impl Default for SpatialConfig {
361    fn default() -> Self {
362        Self {
363            coverage_samples: 100_000,
364            exclusivity_samples: 30_000,
365            voronoi_samples: 100_000,
366        }
367    }
368}
369
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins every default to the constant the crate used before the
    /// config surface existed, so an accidental edit to a `Default` impl
    /// fails loudly.
    #[test]
    fn defaults_match_legacy_constants() {
        let c = PipelineConfig::default();
        assert_eq!(c.projection_kind, ProjectionKind::Pca);
        assert_eq!(c.inner_sphere.min_size, 20);
        assert_eq!(c.inner_sphere.kernel_pca_min_size, 80);
        assert!((c.inner_sphere.min_evr_improvement - 0.10).abs() < 1e-12);
        assert!((c.inner_sphere.min_kernel_improvement - 0.05).abs() < 1e-12);
        assert!((c.bridges.threshold_base - 0.5).abs() < 1e-12);
        assert!((c.bridges.threshold_evr_penalty - 0.4).abs() < 1e-12);
        assert!((c.bridges.overlap_artifact_territorial - 0.3).abs() < 1e-12);
        assert!((c.bridges.balanced_affinity_quantile - 0.25).abs() < 1e-12);
        assert!((c.bridges.min_evr_for_classification - 0.20).abs() < 1e-12);
        assert_eq!(c.routing.num_domain_groups, 5);
        assert!((c.routing.group_routing_alpha - 0.8).abs() < 1e-12);
        assert!((c.routing.low_evr_threshold - 0.35).abs() < 1e-12);
        assert_eq!(c.laplacian.k_neighbors, 15);
        assert!((c.laplacian.active_threshold - 0.05).abs() < 1e-12);
        assert_eq!(c.umap.n_neighbors, 15);
        assert_eq!(c.umap.n_epochs, 200);
        assert!((c.umap.category_weight - 1.5).abs() < 1e-12);
        assert!((c.umap.min_dist - 0.1).abs() < 1e-12);
        assert_eq!(c.umap.warm_start_anchor, 0.0);
        assert_eq!(c.umap.seed, 0xA1B2_C3D4);
        assert_eq!(c.spatial.coverage_samples, 100_000);
        assert_eq!(c.spatial.exclusivity_samples, 30_000);
        assert_eq!(c.spatial.voronoi_samples, 100_000);
        assert_eq!(c.min_category_size, 1);
    }

    /// The adaptive threshold must fall as EVR rises and approach the
    /// base term near EVR = 1.
    #[test]
    fn evr_adaptive_threshold_monotone_in_evr() {
        let b = BridgeConfig::default();
        let low = b.evr_adaptive_threshold(0.15);
        let mid = b.evr_adaptive_threshold(0.50);
        let high = b.evr_adaptive_threshold(0.90);
        // Higher EVR → smaller threshold
        assert!(low > mid);
        assert!(mid > high);
        assert!((high - 0.5).abs() < 0.05);
    }

    /// Cloning preserves field values.
    #[test]
    fn config_is_clone() {
        let a = PipelineConfig::default();
        let b = a.clone();
        assert_eq!(a.inner_sphere.min_size, b.inner_sphere.min_size);
    }

    /// Names and enumeration order are part of the logged/reported
    /// surface, so both are pinned exactly.
    #[test]
    fn projection_kind_name_and_all_stable() {
        assert_eq!(ProjectionKind::Pca.name(), "pca");
        assert_eq!(ProjectionKind::KernelPca.name(), "kernel_pca");
        assert_eq!(
            ProjectionKind::LaplacianEigenmap.name(),
            "laplacian_eigenmap"
        );
        assert_eq!(ProjectionKind::UmapSphere.name(), "umap_sphere");
        assert_eq!(ProjectionKind::all().len(), 4);
        // Pin the order itself, not just the length.
        assert_eq!(
            ProjectionKind::all(),
            [
                ProjectionKind::Pca,
                ProjectionKind::KernelPca,
                ProjectionKind::LaplacianEigenmap,
                ProjectionKind::UmapSphere,
            ]
        );
        // The default kind must be one of the enumerated kinds.
        assert!(ProjectionKind::all().contains(&ProjectionKind::default()));
        // Names must be pairwise distinct to be usable as report keys.
        let names: std::collections::HashSet<&str> =
            ProjectionKind::all().iter().map(|k| k.name()).collect();
        assert_eq!(names.len(), ProjectionKind::all().len());
    }
}