Skip to main content

sphereql_embed/
config.rs

1//! Configuration surface for the SphereQL pipeline.
2//!
3//! Every tunable constant that governs projection, bridge detection,
4//! inner-sphere gating, domain-group routing, and spatial-quality
5//! Monte Carlo sample counts lives here. This is the first-class knob
6//! inventory that future auto-tuning and meta-learning passes optimize
7//! over.
8//!
9//! The [`PipelineConfig::default`] values reproduce the historical
10//! hardcoded constants; the pipeline accepts any overriding config.
11
12// ── Top-level ──────────────────────────────────────────────────────────
13
14/// All tunable parameters for a SphereQL pipeline build.
15///
16/// Every field is a sub-config grouped by area. [`Self::default`] returns
17/// the values the crate shipped with before the config surface existed.
18#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, Default)]
19#[serde(default)]
20pub struct PipelineConfig {
21    /// Outer-sphere projection family.
22    pub projection_kind: ProjectionKind,
23    /// Inner-sphere gating thresholds.
24    pub inner_sphere: InnerSphereConfig,
25    /// Bridge detection and classification.
26    pub bridges: BridgeConfig,
27    /// Hierarchical domain-group routing.
28    pub routing: RoutingConfig,
29    /// Laplacian eigenmap hyperparameters (only consulted if that
30    /// projection is selected).
31    pub laplacian: LaplacianConfig,
32    /// Spatial quality Monte Carlo sample counts.
33    pub spatial: SpatialConfig,
34}
35
36// ── Projection kind ────────────────────────────────────────────────────
37
38/// Which projection family the pipeline uses for the outer sphere.
39///
40/// A first-class tunable axis:
41/// [`SearchSpace::projection_kinds`](crate::tuner::SearchSpace::projection_kinds)
42/// enumerates the families the auto-tuner sweeps, and
43/// [`CorpusFeatures`](crate::corpus_features::CorpusFeatures) →
44/// [`PipelineConfig`] meta-models can map corpus profiles onto the
45/// kind that works best.
46#[derive(
47    Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize, Default,
48)]
49pub enum ProjectionKind {
50    /// Linear PCA — fast, variance-maximizing. Good default for dense,
51    /// low-noise embeddings.
52    #[default]
53    Pca,
54    /// Kernel PCA with a Gaussian (RBF) kernel. Captures nonlinear
55    /// manifold structure at O(n²) fit cost.
56    KernelPca,
57    /// Laplacian eigenmap over a Jaccard-similarity graph of active
58    /// axes. Connectivity-preserving; preferred when signal lives in
59    /// the co-activation structure of a sparse embedding rather than in
60    /// coordinate variance (the typical failure mode of PCA on 128-dim
61    /// noise-heavy corpora).
62    LaplacianEigenmap,
63}
64
65impl ProjectionKind {
66    /// Short stable name for logs and tuner reports.
67    pub fn name(self) -> &'static str {
68        match self {
69            Self::Pca => "pca",
70            Self::KernelPca => "kernel_pca",
71            Self::LaplacianEigenmap => "laplacian_eigenmap",
72        }
73    }
74
75    /// All supported kinds, in a stable order.
76    pub fn all() -> &'static [ProjectionKind] {
77        &[
78            ProjectionKind::Pca,
79            ProjectionKind::KernelPca,
80            ProjectionKind::LaplacianEigenmap,
81        ]
82    }
83}
84
85// ── Inner-sphere ───────────────────────────────────────────────────────
86
87/// Thresholds governing when a category gets its own inner projection.
88#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
89#[serde(default)]
90pub struct InnerSphereConfig {
91    /// Minimum member count for a category to be considered.
92    pub min_size: usize,
93    /// Minimum EVR improvement (inner − global_subset) to justify building
94    /// an inner sphere at all.
95    pub min_evr_improvement: f64,
96    /// Minimum member count at which kernel PCA is attempted.
97    pub kernel_pca_min_size: usize,
98    /// Minimum EVR improvement of kernel PCA over linear PCA to prefer it.
99    pub min_kernel_improvement: f64,
100}
101
102impl Default for InnerSphereConfig {
103    fn default() -> Self {
104        Self {
105            min_size: 20,
106            min_evr_improvement: 0.10,
107            kernel_pca_min_size: 80,
108            min_kernel_improvement: 0.05,
109        }
110    }
111}
112
113// ── Bridges ────────────────────────────────────────────────────────────
114
115/// Parameters controlling bridge detection and classification.
116#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
117#[serde(default)]
118pub struct BridgeConfig {
119    /// Constant term in the EVR-adaptive bridge threshold
120    /// `threshold = threshold_base + (1 − evr)² · threshold_evr_penalty`.
121    pub threshold_base: f64,
122    /// EVR-penalty coefficient in the bridge threshold formula.
123    pub threshold_evr_penalty: f64,
124    /// Territorial factor below which a bridge is classified as an
125    /// `OverlapArtifact` rather than `Genuine` or `Weak`.
126    pub overlap_artifact_territorial: f64,
127}
128
129impl Default for BridgeConfig {
130    fn default() -> Self {
131        Self {
132            threshold_base: 0.5,
133            threshold_evr_penalty: 0.4,
134            overlap_artifact_territorial: 0.3,
135        }
136    }
137}
138
139impl BridgeConfig {
140    /// EVR-adaptive bridge threshold.
141    ///
142    /// Higher EVR → looser threshold (projection is more trustworthy).
143    /// At EVR=0.19: 0.5 + 0.81² × 0.4 = 0.76 (strict).
144    /// At EVR=0.90: 0.5 + 0.01 × 0.4 = 0.50 (essentially unchanged).
145    pub fn evr_adaptive_threshold(&self, evr: f64) -> f64 {
146        self.threshold_base + (1.0 - evr).powi(2) * self.threshold_evr_penalty
147    }
148}
149
150// ── Hierarchical routing ───────────────────────────────────────────────
151
152/// Parameters for hierarchical domain-group routing.
153#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
154#[serde(default)]
155pub struct RoutingConfig {
156    /// Number of domain groups detected at build time by
157    /// [`detect_domain_groups`](crate::domain_groups::detect_domain_groups).
158    pub num_domain_groups: usize,
159    /// EVR below which `hierarchical_nearest` routes through domain
160    /// groups and inner spheres instead of the outer sphere.
161    pub low_evr_threshold: f64,
162}
163
164impl Default for RoutingConfig {
165    fn default() -> Self {
166        Self {
167            num_domain_groups: 5,
168            low_evr_threshold: 0.35,
169        }
170    }
171}
172
173// ── Laplacian eigenmap ─────────────────────────────────────────────────
174
175/// Graph-construction parameters for [`LaplacianEigenmapProjection`](crate::laplacian::LaplacianEigenmapProjection).
176#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
177#[serde(default)]
178pub struct LaplacianConfig {
179    /// k in the k-NN graph sparsification step.
180    pub k_neighbors: usize,
181    /// Absolute-weight cutoff below which an axis is treated as noise.
182    pub active_threshold: f64,
183}
184
185impl Default for LaplacianConfig {
186    fn default() -> Self {
187        Self {
188            k_neighbors: 15,
189            active_threshold: 0.05,
190        }
191    }
192}
193
194// ── Spatial quality ────────────────────────────────────────────────────
195
196/// Monte Carlo sample counts for [`SpatialQuality::compute`](crate::spatial_quality::SpatialQuality::compute).
197///
198/// These run once at build time. Higher = more precise but slower.
199#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
200#[serde(default)]
201pub struct SpatialConfig {
202    /// Samples used to estimate what fraction of S² is covered by any
203    /// category's cap. Higher = tighter coverage estimate. Default
204    /// `100_000` → ~50ms at 31 categories.
205    pub coverage_samples: usize,
206    /// Samples used per category to estimate its cap exclusivity (the
207    /// fraction of its cap not overlapped by any other category).
208    /// Runs `n_categories` times so cost scales linearly with C.
209    /// Default `30_000` per category.
210    pub exclusivity_samples: usize,
211    /// Samples used to estimate the spherical Voronoi tessellation over
212    /// category centroids. Higher = tighter per-cell area estimates.
213    /// Default `100_000` → ~100ms at 31 centroids.
214    pub voronoi_samples: usize,
215}
216
217impl Default for SpatialConfig {
218    fn default() -> Self {
219        Self {
220            coverage_samples: 100_000,
221            exclusivity_samples: 30_000,
222            voronoi_samples: 100_000,
223        }
224    }
225}
226
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used when comparing floating-point defaults.
    const EPS: f64 = 1e-12;

    /// Every default must equal the constant that was hardcoded before
    /// the config surface existed.
    #[test]
    fn defaults_match_legacy_constants() {
        let c = PipelineConfig::default();
        assert_eq!(c.projection_kind, ProjectionKind::Pca);
        assert_eq!(c.inner_sphere.min_size, 20);
        assert_eq!(c.inner_sphere.kernel_pca_min_size, 80);
        assert!((c.inner_sphere.min_evr_improvement - 0.10).abs() < EPS);
        assert!((c.inner_sphere.min_kernel_improvement - 0.05).abs() < EPS);
        assert!((c.bridges.threshold_base - 0.5).abs() < EPS);
        assert!((c.bridges.threshold_evr_penalty - 0.4).abs() < EPS);
        assert!((c.bridges.overlap_artifact_territorial - 0.3).abs() < EPS);
        assert_eq!(c.routing.num_domain_groups, 5);
        assert!((c.routing.low_evr_threshold - 0.35).abs() < EPS);
        assert_eq!(c.laplacian.k_neighbors, 15);
        assert!((c.laplacian.active_threshold - 0.05).abs() < EPS);
        assert_eq!(c.spatial.coverage_samples, 100_000);
        assert_eq!(c.spatial.exclusivity_samples, 30_000);
        assert_eq!(c.spatial.voronoi_samples, 100_000);
    }

    /// The bridge threshold must shrink as EVR grows and approach the
    /// base value at high EVR.
    #[test]
    fn evr_adaptive_threshold_monotone_in_evr() {
        let b = BridgeConfig::default();
        let low = b.evr_adaptive_threshold(0.15);
        let mid = b.evr_adaptive_threshold(0.50);
        let high = b.evr_adaptive_threshold(0.90);
        // Higher EVR → smaller threshold
        assert!(low > mid);
        assert!(mid > high);
        assert!((high - 0.5).abs() < 0.05);
    }

    /// Cloning a config must reproduce every field.
    #[test]
    fn config_is_clone() {
        let a = PipelineConfig::default();
        let b = a.clone();
        assert_eq!(a.inner_sphere.min_size, b.inner_sphere.min_size);
        // The config types do not implement `PartialEq`; the derived
        // `Debug` output prints every field, so comparing it checks the
        // whole tree rather than a single value.
        assert_eq!(format!("{a:?}"), format!("{b:?}"));
    }

    /// Names and the order of `all()` are relied on by logs and tuner
    /// reports, so both are pinned exactly.
    #[test]
    fn projection_kind_name_and_all_stable() {
        assert_eq!(ProjectionKind::Pca.name(), "pca");
        assert_eq!(ProjectionKind::KernelPca.name(), "kernel_pca");
        assert_eq!(
            ProjectionKind::LaplacianEigenmap.name(),
            "laplacian_eigenmap"
        );
        assert_eq!(ProjectionKind::all().len(), 3);

        // A length check alone would still pass for a reordered or
        // duplicated list; pin the exact contents and order.
        let expected = [
            ProjectionKind::Pca,
            ProjectionKind::KernelPca,
            ProjectionKind::LaplacianEigenmap,
        ];
        assert_eq!(ProjectionKind::all(), &expected[..]);
    }
}