Skip to main content

sphereql_embed/
category.rs

1use std::collections::HashMap;
2
3use sphereql_core::{SphericalPoint, angular_distance, cosine_similarity};
4
5use crate::config::PipelineConfig;
6use crate::kernel_pca::KernelPcaProjection;
7use crate::projection::{PcaProjection, Projection};
8use crate::spatial_quality::SpatialQuality;
9use crate::types::{Embedding, RadialStrategy};
10
11// ── Category summary ───────────────────────────────────────────────────
12
13/// Aggregate statistics for a single category on the outer sphere.
14///
15/// Computed from the projected positions of all items in that category.
16/// Every category gets a summary regardless of size — this is the
17/// foundation of the Category Enrichment Layer.
18#[derive(Debug, Clone)]
19pub struct CategorySummary {
20    /// Category name (as provided by the user).
21    pub name: String,
22    /// Indices of member items in the pipeline's item list.
23    pub member_indices: Vec<usize>,
24    /// Mean embedding in high-dimensional space (pre-projection).
25    /// Length = embedding dimensionality.
26    pub centroid_embedding: Vec<f64>,
27    /// The centroid projected onto the outer sphere.
28    pub centroid_position: SphericalPoint,
29    /// Mean angular distance (radians) of members from the centroid
30    /// on the projected sphere. Measures how "spread out" the category is.
31    pub angular_spread: f64,
32    /// 1.0 / (1.0 + angular_spread). Higher = tighter cluster.
33    /// Normalized to (0, 1].
34    pub cohesion: f64,
35    /// Number of member items.
36    pub member_count: usize,
37    /// Solid angle of this category's cap on S² (steradians).
38    pub cap_area: f64,
39    /// Fraction of this category's cap not overlapped by any other. [0, 1].
40    pub exclusivity: f64,
41    /// Voronoi cell area on S² (steradians).
42    pub voronoi_area: f64,
43    /// Items per steradian of Voronoi cell.
44    pub territorial_efficiency: f64,
45    /// Mean territorial-adjusted bridge strength across all outgoing edges.
46    /// 0.0 if this category has no neighbors. Populated after the graph
47    /// is built in [`CategoryLayer::build`].
48    pub bridge_quality: f64,
49}
50
51// ── Bridge items ───────────────────────────────────────────────────────
52
53/// Semantic relation type for a bridge edge.
54#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
55pub enum RelationType {
56    Studies,
57    AppliesTo,
58    Enables,
59    CausedBy,
60    Contains,
61    SharedMethod,
62    HistoricalLink,
63}
64
65impl RelationType {
66    pub fn name(self) -> &'static str {
67        match self {
68            Self::Studies => "studies",
69            Self::AppliesTo => "applies_to",
70            Self::Enables => "enables",
71            Self::CausedBy => "caused_by",
72            Self::Contains => "contains",
73            Self::SharedMethod => "shared_method",
74            Self::HistoricalLink => "historical_link",
75        }
76    }
77}
78
79/// Infer a [`RelationType`] for a bridge from its concept label.
80///
81/// Source and target category names are accepted for future use but
82/// currently unused — keyword cues in the label drive the inference.
83fn infer_relation(
84    concept_label: &str,
85    _source_category: &str,
86    _target_category: &str,
87) -> Option<RelationType> {
88    let label = concept_label.to_lowercase();
89
90    if label.contains("history of")
91        || label.contains("historical")
92        || label.contains("origins of")
93        || label.contains("evolution of")
94    {
95        return Some(RelationType::HistoricalLink);
96    }
97
98    if label.contains("method")
99        || label.contains("technique")
100        || label.contains("analysis")
101        || label.contains("spectroscop")
102        || label.contains("microscop")
103        || label.contains("imaging")
104        || label.contains("modeling")
105        || label.contains("simulation")
106        || label.contains("measurement")
107    {
108        return Some(RelationType::SharedMethod);
109    }
110
111    if label.contains("applications")
112        || label.contains("applied")
113        || label.contains(" in ")
114        || label.contains(" for ")
115    {
116        return Some(RelationType::AppliesTo);
117    }
118
119    if label.contains("studies")
120        || label.contains("research")
121        || label.contains("science of")
122        || label.contains("theory of")
123    {
124        return Some(RelationType::Studies);
125    }
126
127    if label.starts_with("bio")
128        || label.starts_with("geo")
129        || label.starts_with("neuro")
130        || label.starts_with("astro")
131        || label.starts_with("electro")
132        || label.starts_with("psycho")
133        || label.starts_with("socio")
134        || label.starts_with("eco")
135    {
136        return Some(RelationType::Contains);
137    }
138
139    if label.contains("foundation")
140        || label.contains("prerequisite")
141        || label.contains("basis")
142        || label.contains("fundamentals")
143        || label.contains("principles")
144    {
145        return Some(RelationType::Enables);
146    }
147
148    None
149}
150
151/// Quality classification for a bridge item.
152///
153/// Assigned after all bridges are collected, comparing each bridge's
154/// strength against the corpus-wide median and the pair's territorial
155/// separation on S².
156#[derive(Debug, Clone, Copy, PartialEq, Eq)]
157pub enum BridgeClassification {
158    /// Strength at or above the median and the source/target caps are
159    /// spatially distinct — a real cross-domain connector.
160    Genuine,
161    /// The two categories overlap heavily on S² (low territorial factor).
162    /// This bridge is more shared-territory noise than a genuine connector.
163    OverlapArtifact,
164    /// Strength below median in a territorially clean pair — a real
165    /// connection but not a strong one.
166    Weak,
167}
168
169/// An item that semantically spans two categories.
170///
171/// Bridge items are closer to a foreign category's centroid than to the
172/// median distance within their own category. They are the conceptual
173/// connectors that make cross-domain paths meaningful.
174#[derive(Debug, Clone)]
175pub struct BridgeItem {
176    /// Index of this item in the pipeline's item list.
177    pub item_index: usize,
178    /// The item's own category index.
179    pub source_category: usize,
180    /// The foreign category this item bridges toward.
181    pub target_category: usize,
182    /// Cosine similarity to own category centroid (in high-D space).
183    pub affinity_to_source: f64,
184    /// Cosine similarity to foreign category centroid (in high-D space).
185    pub affinity_to_target: f64,
186    /// Bridge strength: harmonic mean of the two affinities.
187    /// Higher = equally strong connection to both domains.
188    pub bridge_strength: f64,
189    /// Quality label assigned after the full bridge set is observed.
190    pub classification: BridgeClassification,
191    /// Semantic relation type, inferred heuristically. `None` when untyped.
192    pub relation: Option<RelationType>,
193}
194
195// ── Category graph ─────────────────────────────────────────────────────
196
197/// Edge in the category adjacency graph.
198#[derive(Debug, Clone)]
199pub struct CategoryEdge {
200    /// Index of the neighbor category.
201    pub target: usize,
202    /// Angular distance between category centroids on the sphere.
203    pub centroid_distance: f64,
204    /// Number of bridge items connecting these two categories.
205    pub bridge_count: usize,
206    /// Strongest bridge connecting these two categories (0.0 if no bridges).
207    pub max_bridge_strength: f64,
208    /// Mean bridge strength across all bridges (0.0 if no bridges).
209    pub mean_bridge_strength: f64,
210    /// Combined edge weight (lower = more connected).
211    /// Computed as `centroid_distance / (1 + bridge_count * mean_bridge_strength)`.
212    /// This prefers fewer strong bridges over many weak ones.
213    pub weight: f64,
214}
215
216/// The full category adjacency graph.
217#[derive(Debug, Clone)]
218pub struct CategoryGraph {
219    /// Adjacency list: `adjacency[i]` contains edges from category i.
220    pub adjacency: Vec<Vec<CategoryEdge>>,
221    /// Bridge items keyed by (source_category, target_category).
222    /// Sorted by descending bridge_strength within each pair.
223    pub bridges: HashMap<(usize, usize), Vec<BridgeItem>>,
224}
225
226// ── Category-level concept path ────────────────────────────────────────
227
228/// A step in a category-level concept path.
229#[derive(Debug, Clone)]
230pub struct CategoryPathStep {
231    /// Category index.
232    pub category_index: usize,
233    /// Category name.
234    pub category_name: String,
235    /// Cumulative distance from the start.
236    pub cumulative_distance: f64,
237    /// Bridge items connecting this step to the next (empty for the last step).
238    pub bridges_to_next: Vec<BridgeItem>,
239    /// Spatial confidence of this hop: bridge_strength × territorial_factor.
240    /// Higher = more trustworthy transition. 0.0 for the last step.
241    pub hop_confidence: f64,
242}
243
244/// Result of a category-level concept path query.
245#[derive(Debug, Clone)]
246pub struct CategoryPath {
247    /// Ordered steps from source to target category.
248    pub steps: Vec<CategoryPathStep>,
249    /// Total path distance.
250    pub total_distance: f64,
251    /// Product of all hop confidences along the path.
252    /// Low values indicate the path routes through shaky connections.
253    pub path_confidence: f64,
254}
255
256// ── Inner sphere (Phase 2) ─────────────────────────────────────────────
257
258/// The projection type used for a category's inner sphere.
259///
260/// Wraps either a linear PCA or kernel PCA projection, chosen
261/// automatically based on the category's size and measured EVR
262/// improvement over the global projection.
263#[derive(Clone)]
264pub enum InnerProjection {
265    /// Standard linear PCA — chosen for categories meeting
266    /// [`InnerSphereConfig::min_size`](crate::config::InnerSphereConfig::min_size)
267    /// but below
268    /// [`kernel_pca_min_size`](crate::config::InnerSphereConfig::kernel_pca_min_size),
269    /// or when kernel PCA fails to improve over linear by
270    /// [`min_kernel_improvement`](crate::config::InnerSphereConfig::min_kernel_improvement).
271    LinearPca(PcaProjection),
272    /// Gaussian kernel PCA — chosen for categories meeting
273    /// [`kernel_pca_min_size`](crate::config::InnerSphereConfig::kernel_pca_min_size)
274    /// where it measurably outperforms linear PCA.
275    KernelPca(KernelPcaProjection),
276}
277
278impl std::fmt::Debug for InnerProjection {
279    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
280        match self {
281            Self::LinearPca(_) => write!(f, "LinearPca"),
282            Self::KernelPca(_) => write!(f, "KernelPca"),
283        }
284    }
285}
286
287impl Projection for InnerProjection {
288    fn project(&self, embedding: &Embedding) -> SphericalPoint {
289        match self {
290            Self::LinearPca(p) => p.project(embedding),
291            Self::KernelPca(p) => p.project(embedding),
292        }
293    }
294    fn project_rich(&self, embedding: &Embedding) -> crate::types::ProjectedPoint {
295        match self {
296            Self::LinearPca(p) => p.project_rich(embedding),
297            Self::KernelPca(p) => p.project_rich(embedding),
298        }
299    }
300    fn dimensionality(&self) -> usize {
301        match self {
302            Self::LinearPca(p) => p.dimensionality(),
303            Self::KernelPca(p) => p.dimensionality(),
304        }
305    }
306}
307
308/// A category-specific inner sphere with its own optimized projection.
309///
310/// Only created for categories that meet all of:
311/// 1. At least [`InnerSphereConfig::min_size`](crate::config::InnerSphereConfig::min_size) members
312/// 2. Inner EVR improves over global subset EVR by ≥
313///    [`InnerSphereConfig::min_evr_improvement`](crate::config::InnerSphereConfig::min_evr_improvement)
314///
315/// The inner sphere gives higher-resolution angular discrimination
316/// within the category than the global outer projection can provide.
317#[derive(Debug, Clone)]
318pub struct InnerSphere {
319    /// The category-specific projection (linear PCA or kernel PCA).
320    pub projection: InnerProjection,
321    /// Positions of member items in the inner sphere's coordinate system.
322    /// `inner_positions[i]` corresponds to `member_indices[i]`.
323    pub inner_positions: Vec<SphericalPoint>,
324    /// Global item indices of the members (same order as inner_positions).
325    pub member_indices: Vec<usize>,
326    /// Explained variance ratio of the inner projection.
327    pub explained_variance_ratio: f64,
328    /// Mean certainty of these items under the global (outer) projection.
329    /// Baseline for measuring improvement.
330    pub global_subset_evr: f64,
331    /// `explained_variance_ratio - global_subset_evr`.
332    pub evr_improvement: f64,
333}
334
335/// A single item from a [`CategoryLayer::drill_down_with_projection`] query.
336#[derive(Debug, Clone)]
337pub struct DrillDownResult {
338    /// Index of the item in the pipeline's global item list.
339    pub item_index: usize,
340    /// Angular distance to the query in the relevant coordinate system.
341    pub distance: f64,
342    /// Whether the inner sphere's projection was used (true) or the
343    /// outer sphere was used as fallback (false).
344    pub used_inner_sphere: bool,
345}
346
347/// Stats for a single inner sphere, returned by
348/// [`inner_sphere_stats`](CategoryLayer::inner_sphere_stats).
349#[derive(Debug, Clone)]
350pub struct InnerSphereReport {
351    /// Category name.
352    pub category_name: String,
353    /// Category index.
354    pub category_index: usize,
355    /// Number of members in the inner sphere.
356    pub member_count: usize,
357    /// `"LinearPca"` or `"KernelPca"`.
358    pub projection_type: &'static str,
359    /// Explained variance ratio of the inner projection.
360    pub inner_evr: f64,
361    /// Mean certainty of members under the global projection.
362    pub global_subset_evr: f64,
363    /// EVR improvement over global.
364    pub evr_improvement: f64,
365}
366
367// ── The enrichment layer ───────────────────────────────────────────────
368
369/// Category Enrichment Layer: aggregate statistics, inter-category graph,
370/// bridge item detection, and automatic inner spheres over a projected
371/// SphereQL corpus.
372///
373/// This is a read-only structure computed from an existing pipeline's
374/// data. It adds category-level reasoning without modifying the
375/// underlying projection or spatial index.
376#[derive(Debug, Clone)]
377pub struct CategoryLayer {
378    /// One summary per unique category, in insertion order.
379    pub summaries: Vec<CategorySummary>,
380    /// Map from category name to index in `summaries`.
381    pub name_to_index: HashMap<String, usize>,
382    /// The inter-category adjacency graph.
383    pub graph: CategoryGraph,
384    /// Outer-sphere positions for all items (same indexing as embeddings).
385    outer_positions: Vec<SphericalPoint>,
386    /// Inner spheres keyed by category index. Only present for categories
387    /// that meet the size and EVR-improvement thresholds.
388    pub inner_spheres: HashMap<usize, InnerSphere>,
389    /// Domain groups discovered from category geometry. Default
390    /// `nearest()` routing chooses the closest group via these centroids.
391    pub domain_groups: Vec<crate::domain_groups::DomainGroup>,
392    /// Inner spheres keyed by domain-group index. Built only when the
393    /// group's union of members beats the EVR threshold versus the
394    /// outer projection — the realistic-grouping unit per the v2 routing
395    /// design (replaces the previous one-per-category default).
396    pub group_inner_spheres: HashMap<usize, InnerSphere>,
397    /// Pre-computed spatial properties of the category layout on S².
398    /// Used for bridge quality scoring, confidence signals, and routing.
399    pub spatial_quality: SpatialQuality,
400}
401
402impl CategoryLayer {
403    /// Build the category enrichment layer from pipeline data.
404    ///
405    /// - `categories[i]` is the category name for item i.
406    /// - `embeddings[i]` is the raw embedding for item i.
407    /// - `projected_positions[i]` is the spherical position on the outer sphere.
408    /// - `projection` is used to project category centroids and measure
409    ///   per-point certainty for inner sphere threshold decisions.
410    ///
411    /// Inner spheres are automatically constructed for categories that:
412    /// 1. Have ≥ 20 members
413    /// 2. Show ≥ 0.10 EVR improvement over the global projection
414    ///
415    /// Categories with ≥ 80 members additionally try kernel PCA and
416    /// select it if it improves EVR by ≥ 0.05 over linear PCA.
417    ///
418    /// O(N·C + C²) for the base layer, plus O(n_c²·d) per inner sphere.
419    ///
420    /// Uses [`PipelineConfig::default`] for all tunables. Call
421    /// [`Self::build_with_config`] to override.
422    pub fn build<P: Projection>(
423        categories: &[String],
424        embeddings: &[Embedding],
425        projected_positions: &[SphericalPoint],
426        projection: &P,
427        evr: f64,
428    ) -> Self {
429        Self::build_with_config(
430            categories,
431            embeddings,
432            projected_positions,
433            projection,
434            evr,
435            &PipelineConfig::default(),
436        )
437    }
438
439    /// Configurable variant of [`Self::build`]. Threads a [`PipelineConfig`]
440    /// through spatial quality, bridge classification, and inner-sphere
441    /// gating.
442    pub fn build_with_config<P: Projection>(
443        categories: &[String],
444        embeddings: &[Embedding],
445        projected_positions: &[SphericalPoint],
446        projection: &P,
447        evr: f64,
448        config: &PipelineConfig,
449    ) -> Self {
450        let n = categories.len();
451        // Invariant: the pipeline builder validates length parity before calling
452        // build_with_config. A mismatch here is a programmer error, not a
453        // recoverable input error.
454        assert_eq!(n, embeddings.len());
455        assert_eq!(n, projected_positions.len());
456
457        // 1. Discover unique categories in first-seen order, then drop any
458        //    whose member count falls below `min_category_size`. Items in
459        //    excluded categories remain projected and indexed on the sphere
460        //    — they just don't get a CategorySummary, don't produce bridges,
461        //    and don't participate in domain groups or spatial quality.
462        let min_size = config.min_category_size;
463
464        let mut name_to_index: HashMap<String, usize> = HashMap::new();
465        let mut cat_names: Vec<String> = Vec::new();
466        let mut cat_members: Vec<Vec<usize>> = Vec::new();
467
468        // Single pass: insert in first-seen order, but defer the size filter.
469        let mut raw_index: HashMap<&str, usize> = HashMap::new();
470        let mut raw_names: Vec<&str> = Vec::new();
471        let mut raw_members: Vec<Vec<usize>> = Vec::new();
472        for (i, cat) in categories.iter().enumerate() {
473            let idx = if let Some(&idx) = raw_index.get(cat.as_str()) {
474                idx
475            } else {
476                let idx = raw_names.len();
477                raw_index.insert(cat.as_str(), idx);
478                raw_names.push(cat.as_str());
479                raw_members.push(Vec::new());
480                idx
481            };
482            raw_members[idx].push(i);
483        }
484
485        for (raw_name, members) in raw_names.iter().zip(raw_members) {
486            if members.len() < min_size {
487                continue;
488            }
489            let idx = cat_names.len();
490            name_to_index.insert((*raw_name).to_string(), idx);
491            cat_names.push((*raw_name).to_string());
492            cat_members.push(members);
493        }
494
495        let num_cats = cat_names.len();
496        let dim = if n > 0 { embeddings[0].dimension() } else { 0 };
497
498        // 2. Compute category summaries
499        let mut summaries: Vec<CategorySummary> = Vec::with_capacity(num_cats);
500
501        for (ci, name) in cat_names.iter().enumerate() {
502            let members = &cat_members[ci];
503            let count = members.len();
504
505            // Centroid in high-D space
506            let mut centroid_emb = vec![0.0; dim];
507            for &mi in members {
508                for (j, &v) in embeddings[mi].values.iter().enumerate() {
509                    centroid_emb[j] += v;
510                }
511            }
512            if count > 0 {
513                for v in &mut centroid_emb {
514                    *v /= count as f64;
515                }
516            }
517
518            // Project the centroid
519            let centroid_embedding_obj = Embedding::new(centroid_emb.clone());
520            let centroid_position = projection.project(&centroid_embedding_obj);
521
522            // Angular spread: mean angular distance of members from centroid
523            let angular_spread = if count > 1 {
524                let total: f64 = members
525                    .iter()
526                    .map(|&mi| angular_distance(&projected_positions[mi], &centroid_position))
527                    .sum();
528                total / count as f64
529            } else {
530                0.0
531            };
532
533            let cohesion = 1.0 / (1.0 + angular_spread);
534
535            summaries.push(CategorySummary {
536                name: name.clone(),
537                member_indices: members.clone(),
538                centroid_embedding: centroid_emb,
539                centroid_position,
540                angular_spread,
541                cohesion,
542                member_count: count,
543                // Spatial fields filled after SpatialQuality is computed
544                cap_area: 0.0,
545                exclusivity: 0.0,
546                voronoi_area: 0.0,
547                territorial_efficiency: 0.0,
548                // Filled after build_graph() classifies bridges
549                bridge_quality: 0.0,
550            });
551        }
552
553        // 3. Compute spatial quality from category geometry
554        let centroids: Vec<SphericalPoint> =
555            summaries.iter().map(|s| s.centroid_position).collect();
556        let half_angles: Vec<f64> = summaries.iter().map(|s| s.angular_spread).collect();
557        let mut spatial_quality =
558            SpatialQuality::compute_with_config(&centroids, &half_angles, evr, config);
559
560        // Backfill spatial fields on summaries
561        for (i, summary) in summaries.iter_mut().enumerate() {
562            summary.cap_area = spatial_quality.cap_areas[i];
563            summary.exclusivity = spatial_quality.exclusivities[i];
564            summary.voronoi_area = spatial_quality.voronoi_area(i);
565            summary.territorial_efficiency =
566                spatial_quality.territorial_efficiency(i, summary.member_count);
567        }
568
569        // 4. Build category graph + detect bridges (spatially informed)
570        let graph = Self::build_graph(&summaries, embeddings, num_cats, &spatial_quality, config);
571
572        // Backfill bridge_quality on summaries using the just-built graph.
573        // bridge_quality = mean of (edge.mean_bridge_strength × territorial_factor)
574        // over all outgoing edges; 0 for isolated categories.
575        for (i, summary) in summaries.iter_mut().enumerate() {
576            let edges = &graph.adjacency[i];
577            if edges.is_empty() {
578                summary.bridge_quality = 0.0;
579            } else {
580                let total: f64 = edges
581                    .iter()
582                    .map(|e| {
583                        e.mean_bridge_strength * spatial_quality.territorial_factor(i, e.target)
584                    })
585                    .sum();
586                summary.bridge_quality = total / edges.len() as f64;
587            }
588        }
589
590        // Populate C×C spatial bridge quality matrix on SpatialQuality.
591        spatial_quality.set_bridge_quality_matrix(&graph);
592
593        // 5. Build inner spheres for qualifying categories
594        let inner_spheres = Self::build_inner_spheres(&summaries, embeddings, projection, config);
595
596        // 6. Detect domain groups + build group-level inner spheres
597        // (the v2 default-routing unit). Both are no-ops when fewer than
598        // two groups exist.
599        let pre_layer = CategoryLayer {
600            summaries: summaries.clone(),
601            name_to_index: name_to_index.clone(),
602            graph: graph.clone(),
603            outer_positions: projected_positions.to_vec(),
604            inner_spheres: inner_spheres.clone(),
605            domain_groups: Vec::new(),
606            group_inner_spheres: HashMap::new(),
607            spatial_quality: spatial_quality.clone(),
608        };
609        let domain_groups = crate::domain_groups::detect_domain_groups(
610            &pre_layer,
611            config.routing.num_domain_groups,
612        );
613        let group_inner_spheres = Self::build_group_inner_spheres(
614            &domain_groups,
615            &summaries,
616            embeddings,
617            projection,
618            config,
619        );
620
621        CategoryLayer {
622            summaries,
623            name_to_index,
624            graph,
625            outer_positions: projected_positions.to_vec(),
626            inner_spheres,
627            domain_groups,
628            group_inner_spheres,
629            spatial_quality,
630        }
631    }
632
633    /// Annotate every bridge with a heuristically inferred [`RelationType`].
634    ///
635    /// Looks at each bridge item's concept label (indexed into `labels`
636    /// by `BridgeItem::item_index`) along with its source and target
637    /// category names. Bridges whose label doesn't match any cue remain
638    /// `relation: None`.
639    pub fn annotate_bridge_relations(&mut self, labels: &[String]) {
640        for ((src_cat, tgt_cat), bridges) in self.graph.bridges.iter_mut() {
641            let src_name = &self.summaries[*src_cat].name;
642            let tgt_name = &self.summaries[*tgt_cat].name;
643            for bridge in bridges.iter_mut() {
644                if bridge.item_index < labels.len() {
645                    bridge.relation =
646                        infer_relation(&labels[bridge.item_index], src_name, tgt_name);
647                }
648            }
649        }
650    }
651
652    /// Histogram of relation types across every bridge in the graph.
653    ///
654    /// `None` is included as its own bucket so callers can see how many
655    /// bridges remain unlabeled after annotation.
656    pub fn relation_census(&self) -> std::collections::HashMap<Option<RelationType>, usize> {
657        let mut counts = std::collections::HashMap::new();
658        for bridges in self.graph.bridges.values() {
659            for b in bridges {
660                *counts.entry(b.relation).or_default() += 1;
661            }
662        }
663        counts
664    }
665
666    /// Build the inter-category adjacency graph and detect bridge items.
667    ///
668    /// Bridge detection uses the spatial quality's EVR-adaptive threshold
669    /// and exclusivity-weighted strength to prevent bridge inflation at
670    /// low EVR and discount bridges between overlapping categories.
671    fn build_graph(
672        summaries: &[CategorySummary],
673        embeddings: &[Embedding],
674        num_cats: usize,
675        spatial: &SpatialQuality,
676        config: &PipelineConfig,
677    ) -> CategoryGraph {
678        let mut centroid_dists = vec![vec![0.0; num_cats]; num_cats];
679        for i in 0..num_cats {
680            for j in (i + 1)..num_cats {
681                let d = angular_distance(
682                    &summaries[i].centroid_position,
683                    &summaries[j].centroid_position,
684                );
685                centroid_dists[i][j] = d;
686                centroid_dists[j][i] = d;
687            }
688        }
689
690        let bridge_threshold = spatial.bridge_threshold;
691
692        let mut bridges: HashMap<(usize, usize), Vec<BridgeItem>> = HashMap::new();
693
694        for (ci, summary) in summaries.iter().enumerate() {
695            let centroid_a = &summary.centroid_embedding;
696
697            for &mi in &summary.member_indices {
698                let item_emb = &embeddings[mi];
699                // Invariant: all embeddings are validated to share the same dimension
700                // at pipeline build time, so cosine_similarity cannot fail here.
701                let sim_to_own = cosine_similarity(&item_emb.values, centroid_a)
702                    .expect("centroid and member share fixed embedding dimensionality");
703
704                for (cj, other_summary) in summaries.iter().enumerate() {
705                    if ci == cj {
706                        continue;
707                    }
708
709                    // Same invariant as sim_to_own above.
710                    let sim_to_other =
711                        cosine_similarity(&item_emb.values, &other_summary.centroid_embedding)
712                            .expect("centroid and member share fixed embedding dimensionality");
713
714                    // EVR-adaptive threshold: stricter when projection is lossy.
715                    // Require positive own-affinity too; otherwise the harmonic-mean
716                    // strength below collapses or goes negative for a category that
717                    // doesn't even hold the item.
718                    if sim_to_own > 0.0
719                        && sim_to_other > 0.0
720                        && sim_to_other > sim_to_own * bridge_threshold
721                    {
722                        let raw_strength = if sim_to_own + sim_to_other > f64::EPSILON {
723                            2.0 * sim_to_own * sim_to_other / (sim_to_own + sim_to_other)
724                        } else {
725                            0.0
726                        };
727
728                        // Discount bridges between heavily overlapping categories.
729                        // Two categories that can't be distinguished on S² produce
730                        // "bridges" that are really just shared territory.
731                        let territorial = spatial.territorial_factor(ci, cj);
732                        let bridge_strength = raw_strength * territorial;
733
734                        bridges.entry((ci, cj)).or_default().push(BridgeItem {
735                            item_index: mi,
736                            source_category: ci,
737                            target_category: cj,
738                            affinity_to_source: sim_to_own,
739                            affinity_to_target: sim_to_other,
740                            bridge_strength,
741                            // Populated in the classification pass below.
742                            classification: BridgeClassification::Weak,
743                            relation: None,
744                        });
745                    }
746                }
747            }
748        }
749
750        // Classification pass: compare each bridge against the corpus-wide
751        // median strength and the pair's territorial separation on S².
752        //
753        // When EVR is below `min_evr_for_classification`, the projection
754        // is too lossy for territorial factors to distinguish genuine
755        // bridges from overlap artifacts — every factor collapses to
756        // near-zero, every bridge gets labeled OverlapArtifact, and the
757        // tuner landscape flattens. Skip the territorial check entirely
758        // in that regime and leave the default `Weak` label on each
759        // bridge (honest uncertainty).
760        if spatial.evr >= config.bridges.min_evr_for_classification {
761            let mut all_strengths: Vec<f64> = bridges
762                .values()
763                .flat_map(|list| list.iter().map(|b| b.bridge_strength))
764                .collect();
765            let median_strength = if all_strengths.is_empty() {
766                0.0
767            } else {
768                all_strengths.sort_by(|a, b| a.total_cmp(b));
769                all_strengths[all_strengths.len() / 2]
770            };
771
772            let overlap_threshold = config.bridges.overlap_artifact_territorial;
773            for list in bridges.values_mut() {
774                for b in list.iter_mut() {
775                    let tf = spatial.territorial_factor(b.source_category, b.target_category);
776                    b.classification = if tf < overlap_threshold {
777                        BridgeClassification::OverlapArtifact
778                    } else if b.bridge_strength >= median_strength {
779                        BridgeClassification::Genuine
780                    } else {
781                        BridgeClassification::Weak
782                    };
783                }
784            }
785        }
786
787        for list in bridges.values_mut() {
788            list.sort_by(|a, b| {
789                b.bridge_strength
790                    .partial_cmp(&a.bridge_strength)
791                    .unwrap_or(std::cmp::Ordering::Equal)
792            });
793        }
794
795        let mut adjacency: Vec<Vec<CategoryEdge>> = vec![Vec::new(); num_cats];
796        for i in 0..num_cats {
797            for (j, &cd) in centroid_dists[i].iter().enumerate() {
798                if i == j {
799                    continue;
800                }
801                let bridge_list = bridges.get(&(i, j));
802                let bridge_count = bridge_list.map_or(0, |b| b.len());
803                let max_bridge_strength = bridge_list
804                    .and_then(|b| b.first().map(|item| item.bridge_strength))
805                    .unwrap_or(0.0);
806                let mean_bridge_strength = bridge_list
807                    .map(|b| {
808                        let sum: f64 = b.iter().map(|item| item.bridge_strength).sum();
809                        sum / b.len() as f64
810                    })
811                    .unwrap_or(0.0);
812
813                // Voronoi neighbors get a routing bonus — they're geometrically
814                // adjacent on S², so traversal between them is natural even
815                // without strong bridges.
816                let voronoi_factor = if spatial.are_voronoi_neighbors(i, j) {
817                    0.8
818                } else {
819                    1.0
820                };
821
822                let weight =
823                    cd * voronoi_factor / (1.0 + bridge_count as f64 * mean_bridge_strength);
824
825                adjacency[i].push(CategoryEdge {
826                    target: j,
827                    centroid_distance: cd,
828                    bridge_count,
829                    max_bridge_strength,
830                    mean_bridge_strength,
831                    weight,
832                });
833            }
834            adjacency[i].sort_by(|a, b| {
835                a.weight
836                    .partial_cmp(&b.weight)
837                    .unwrap_or(std::cmp::Ordering::Equal)
838            });
839        }
840
841        CategoryGraph { adjacency, bridges }
842    }
843
844    /// Evaluate each category and build inner spheres where they help.
845    fn build_inner_spheres<P: Projection>(
846        summaries: &[CategorySummary],
847        embeddings: &[Embedding],
848        projection: &P,
849        config: &PipelineConfig,
850    ) -> HashMap<usize, InnerSphere> {
851        let mut result = HashMap::new();
852        let cfg = &config.inner_sphere;
853
854        for (ci, summary) in summaries.iter().enumerate() {
855            if summary.member_count < cfg.min_size {
856                continue;
857            }
858
859            let member_embs: Vec<Embedding> = summary
860                .member_indices
861                .iter()
862                .map(|&i| embeddings[i].clone())
863                .collect();
864
865            // Global subset EVR: mean certainty under global projection
866            let global_subset_evr: f64 = member_embs
867                .iter()
868                .map(|e| projection.project_rich(e).certainty)
869                .sum::<f64>()
870                / member_embs.len() as f64;
871
872            // Fit inner linear PCA. On failure (too few members, dim
873            // too low, etc.) skip this category's inner sphere
874            // silently — the outer sphere still covers queries.
875            let Ok(inner_pca) = PcaProjection::fit(&member_embs, RadialStrategy::Fixed(1.0)) else {
876                continue;
877            };
878            let inner_linear_evr = inner_pca.explained_variance_ratio();
879
880            if inner_linear_evr - global_subset_evr < cfg.min_evr_improvement {
881                continue;
882            }
883
884            let (inner_proj, inner_evr) = if summary.member_count >= cfg.kernel_pca_min_size {
885                // Kernel PCA can fail on degenerate subsets too; fall
886                // back to the already-successful linear fit.
887                match KernelPcaProjection::fit(&member_embs, RadialStrategy::Fixed(1.0)) {
888                    Ok(inner_kpca) => {
889                        let kernel_evr = inner_kpca.explained_variance_ratio();
890                        if kernel_evr > inner_linear_evr + cfg.min_kernel_improvement {
891                            (InnerProjection::KernelPca(inner_kpca), kernel_evr)
892                        } else {
893                            (InnerProjection::LinearPca(inner_pca), inner_linear_evr)
894                        }
895                    }
896                    Err(_) => (InnerProjection::LinearPca(inner_pca), inner_linear_evr),
897                }
898            } else {
899                (InnerProjection::LinearPca(inner_pca), inner_linear_evr)
900            };
901
902            let inner_positions: Vec<SphericalPoint> =
903                member_embs.iter().map(|e| inner_proj.project(e)).collect();
904
905            result.insert(
906                ci,
907                InnerSphere {
908                    projection: inner_proj,
909                    inner_positions,
910                    member_indices: summary.member_indices.clone(),
911                    explained_variance_ratio: inner_evr,
912                    global_subset_evr,
913                    evr_improvement: inner_evr - global_subset_evr,
914                },
915            );
916        }
917
918        result
919    }
920
921    /// Fit one inner sphere per domain group. Each group's inner
922    /// projection sees the union of its member categories' embeddings.
923    ///
924    /// Built only when (a) ≥ 2 groups exist, (b) the union has at
925    /// least [`InnerSphereConfig::min_size`] members, and (c) the
926    /// inner-vs-outer EVR improvement clears the same threshold the
927    /// per-category builder uses. Kernel PCA is attempted at the same
928    /// `kernel_pca_min_size` cutoff as the per-category path.
929    fn build_group_inner_spheres<P: Projection>(
930        groups: &[crate::domain_groups::DomainGroup],
931        summaries: &[CategorySummary],
932        embeddings: &[Embedding],
933        projection: &P,
934        config: &PipelineConfig,
935    ) -> HashMap<usize, InnerSphere> {
936        let mut result = HashMap::new();
937        if groups.len() < 2 {
938            return result;
939        }
940        let cfg = &config.inner_sphere;
941
942        for (gi, group) in groups.iter().enumerate() {
943            let mut member_indices: Vec<usize> = Vec::new();
944            for &ci in &group.member_categories {
945                member_indices.extend(summaries[ci].member_indices.iter().copied());
946            }
947            if member_indices.len() < cfg.min_size {
948                continue;
949            }
950
951            let member_embs: Vec<Embedding> = member_indices
952                .iter()
953                .map(|&i| embeddings[i].clone())
954                .collect();
955
956            let global_subset_evr: f64 = member_embs
957                .iter()
958                .map(|e| projection.project_rich(e).certainty)
959                .sum::<f64>()
960                / member_embs.len() as f64;
961
962            let Ok(inner_pca) = PcaProjection::fit(&member_embs, RadialStrategy::Fixed(1.0)) else {
963                continue;
964            };
965            let inner_linear_evr = inner_pca.explained_variance_ratio();
966            if inner_linear_evr - global_subset_evr < cfg.min_evr_improvement {
967                continue;
968            }
969
970            let (inner_proj, inner_evr) = if member_indices.len() >= cfg.kernel_pca_min_size {
971                match KernelPcaProjection::fit(&member_embs, RadialStrategy::Fixed(1.0)) {
972                    Ok(inner_kpca) => {
973                        let kernel_evr = inner_kpca.explained_variance_ratio();
974                        if kernel_evr > inner_linear_evr + cfg.min_kernel_improvement {
975                            (InnerProjection::KernelPca(inner_kpca), kernel_evr)
976                        } else {
977                            (InnerProjection::LinearPca(inner_pca), inner_linear_evr)
978                        }
979                    }
980                    Err(_) => (InnerProjection::LinearPca(inner_pca), inner_linear_evr),
981                }
982            } else {
983                (InnerProjection::LinearPca(inner_pca), inner_linear_evr)
984            };
985
986            let inner_positions: Vec<SphericalPoint> =
987                member_embs.iter().map(|e| inner_proj.project(e)).collect();
988
989            result.insert(
990                gi,
991                InnerSphere {
992                    projection: inner_proj,
993                    inner_positions,
994                    member_indices,
995                    explained_variance_ratio: inner_evr,
996                    global_subset_evr,
997                    evr_improvement: inner_evr - global_subset_evr,
998                },
999            );
1000        }
1001
1002        result
1003    }
1004
1005    // ── Phase 1 query methods (unchanged) ──────────────────────────────
1006
1007    /// Number of categories.
1008    pub fn num_categories(&self) -> usize {
1009        self.summaries.len()
1010    }
1011
1012    /// Look up a category by name.
1013    pub fn get_category(&self, name: &str) -> Option<&CategorySummary> {
1014        self.name_to_index
1015            .get(name)
1016            .map(|&idx| &self.summaries[idx])
1017    }
1018
1019    /// Get the k nearest neighbor categories to the given category.
1020    pub fn category_neighbors(&self, category_name: &str, k: usize) -> Vec<&CategorySummary> {
1021        let Some(&ci) = self.name_to_index.get(category_name) else {
1022            return Vec::new();
1023        };
1024        self.graph.adjacency[ci]
1025            .iter()
1026            .take(k)
1027            .map(|edge| &self.summaries[edge.target])
1028            .collect()
1029    }
1030
1031    /// Get bridge items between two categories.
1032    pub fn bridge_items(
1033        &self,
1034        source_category: &str,
1035        target_category: &str,
1036        max_bridges: usize,
1037    ) -> Vec<&BridgeItem> {
1038        let Some(&si) = self.name_to_index.get(source_category) else {
1039            return Vec::new();
1040        };
1041        let Some(&ti) = self.name_to_index.get(target_category) else {
1042            return Vec::new();
1043        };
1044        self.graph
1045            .bridges
1046            .get(&(si, ti))
1047            .map(|list| list.iter().take(max_bridges).collect())
1048            .unwrap_or_default()
1049    }
1050
1051    /// Find the shortest path between two categories through the category graph.
1052    pub fn category_path(
1053        &self,
1054        source_category: &str,
1055        target_category: &str,
1056    ) -> Option<CategoryPath> {
1057        let &si = self.name_to_index.get(source_category)?;
1058        let &ti = self.name_to_index.get(target_category)?;
1059        if si == ti {
1060            return Some(CategoryPath {
1061                steps: vec![CategoryPathStep {
1062                    category_index: si,
1063                    category_name: self.summaries[si].name.clone(),
1064                    cumulative_distance: 0.0,
1065                    bridges_to_next: Vec::new(),
1066                    hop_confidence: 0.0,
1067                }],
1068                total_distance: 0.0,
1069                path_confidence: 1.0,
1070            });
1071        }
1072
1073        // Dijkstra via binary-heap. Previously a linear scan over
1074        // `dist` per pop gave O(C²) — fine for tens of categories,
1075        // sloppy under the larger corpora the tuner now exercises.
1076        // Match the pattern already used in query.rs::concept_path.
1077        let n = self.summaries.len();
1078        let mut dist = vec![f64::INFINITY; n];
1079        let mut prev: Vec<Option<usize>> = vec![None; n];
1080        let mut heap = std::collections::BinaryHeap::new();
1081
1082        dist[si] = 0.0;
1083        heap.push(HeapEntry {
1084            dist: 0.0,
1085            node: si,
1086        });
1087
1088        while let Some(HeapEntry { dist: d, node: u }) = heap.pop() {
1089            if u == ti {
1090                break;
1091            }
1092            // Stale entry — we already relaxed this node to something
1093            // shorter. Skip without touching neighbors.
1094            if d > dist[u] {
1095                continue;
1096            }
1097            for edge in &self.graph.adjacency[u] {
1098                let nd = d + edge.weight;
1099                if nd < dist[edge.target] {
1100                    dist[edge.target] = nd;
1101                    prev[edge.target] = Some(u);
1102                    heap.push(HeapEntry {
1103                        dist: nd,
1104                        node: edge.target,
1105                    });
1106                }
1107            }
1108        }
1109
1110        if dist[ti].is_infinite() {
1111            return None;
1112        }
1113
1114        let mut path_indices = Vec::new();
1115        let mut cur = ti;
1116        loop {
1117            path_indices.push(cur);
1118            match prev[cur] {
1119                Some(p) => cur = p,
1120                None => break,
1121            }
1122        }
1123        path_indices.reverse();
1124
1125        let mut steps = Vec::with_capacity(path_indices.len());
1126        for (step_idx, &ci) in path_indices.iter().enumerate() {
1127            let bridges_to_next = if step_idx + 1 < path_indices.len() {
1128                let next_ci = path_indices[step_idx + 1];
1129                self.graph
1130                    .bridges
1131                    .get(&(ci, next_ci))
1132                    .map(|list| list.iter().take(3).cloned().collect())
1133                    .unwrap_or_default()
1134            } else {
1135                Vec::new()
1136            };
1137
1138            let hop_confidence = if step_idx + 1 < path_indices.len() {
1139                let next_ci = path_indices[step_idx + 1];
1140                let edge_strength = self.graph.adjacency[ci]
1141                    .iter()
1142                    .find(|e| e.target == next_ci)
1143                    .map_or(0.0, |e| e.max_bridge_strength);
1144                let territorial = self.spatial_quality.territorial_factor(ci, next_ci);
1145                let voronoi_bonus = if self.spatial_quality.are_voronoi_neighbors(ci, next_ci) {
1146                    1.2
1147                } else {
1148                    1.0
1149                };
1150                (edge_strength * territorial * voronoi_bonus).min(1.0)
1151            } else {
1152                0.0
1153            };
1154
1155            steps.push(CategoryPathStep {
1156                category_index: ci,
1157                category_name: self.summaries[ci].name.clone(),
1158                cumulative_distance: dist[ci],
1159                bridges_to_next,
1160                hop_confidence,
1161            });
1162        }
1163
1164        let path_confidence = steps
1165            .iter()
1166            .take(steps.len().saturating_sub(1))
1167            .map(|s| s.hop_confidence)
1168            .fold(1.0, |acc, c| acc * c.max(0.01));
1169
1170        Some(CategoryPath {
1171            total_distance: dist[ti],
1172            steps,
1173            path_confidence,
1174        })
1175    }
1176
1177    /// Find all categories whose centroid is within `max_angle` radians
1178    /// of the given embedding's projected position.
1179    pub fn categories_near_embedding<P: Projection>(
1180        &self,
1181        embedding: &Embedding,
1182        projection: &P,
1183        max_angle: f64,
1184    ) -> Vec<(usize, f64)> {
1185        let pos = projection.project(embedding);
1186        let mut results: Vec<(usize, f64)> = self
1187            .summaries
1188            .iter()
1189            .enumerate()
1190            .map(|(i, s)| (i, angular_distance(&pos, &s.centroid_position)))
1191            .filter(|&(_, d)| d <= max_angle)
1192            .collect();
1193        results.sort_by(|a, b| a.1.total_cmp(&b.1));
1194        results
1195    }
1196
1197    /// Certainty-weighted category routing.
1198    ///
1199    /// Like [`Self::categories_near_embedding`] but penalizes routes through
1200    /// low-certainty projection regions. The effective distance is scaled
1201    /// by `1 / sqrt(certainty)`, so poorly-projected queries don't get
1202    /// routed to whatever random centroid happens to be angularly close
1203    /// in the distorted projection.
1204    ///
1205    /// Returns `(category_index, raw_angular_distance, effective_distance, certainty)`.
1206    pub fn categories_near_embedding_weighted<P: Projection>(
1207        &self,
1208        embedding: &Embedding,
1209        projection: &P,
1210        max_angle: f64,
1211    ) -> Vec<(usize, f64, f64, f64)> {
1212        let rich = projection.project_rich(embedding);
1213        let pos = rich.position;
1214        let certainty = rich.certainty.max(0.001);
1215
1216        let mut results: Vec<(usize, f64, f64, f64)> = self
1217            .summaries
1218            .iter()
1219            .enumerate()
1220            .map(|(i, s)| {
1221                let raw_dist = angular_distance(&pos, &s.centroid_position);
1222                // Low certainty inflates distance → avoids noisy routing
1223                let effective = raw_dist / certainty.sqrt();
1224                (i, raw_dist, effective, certainty)
1225            })
1226            .filter(|&(_, raw, _, _)| raw <= max_angle)
1227            .collect();
1228        results.sort_by(|a, b| a.2.total_cmp(&b.2));
1229        results
1230    }
1231
1232    // ── Phase 2 query methods (inner spheres) ──────────────────────────
1233
1234    /// Whether a given category has an inner sphere.
1235    pub fn has_inner_sphere(&self, category_name: &str) -> bool {
1236        self.name_to_index
1237            .get(category_name)
1238            .is_some_and(|&ci| self.inner_spheres.contains_key(&ci))
1239    }
1240
1241    /// Get the inner sphere for a category, if one exists.
1242    pub fn get_inner_sphere(&self, category_name: &str) -> Option<&InnerSphere> {
1243        self.name_to_index
1244            .get(category_name)
1245            .and_then(|&ci| self.inner_spheres.get(&ci))
1246    }
1247
1248    /// Number of categories that have inner spheres.
1249    pub fn num_inner_spheres(&self) -> usize {
1250        self.inner_spheres.len()
1251    }
1252
1253    /// Drill down with an explicit outer projection for the fallback case.
1254    ///
1255    /// When no inner sphere exists, the query is projected using the
1256    /// provided projection and compared against stored outer positions.
1257    pub fn drill_down_with_projection<P: Projection>(
1258        &self,
1259        category_name: &str,
1260        embedding: &Embedding,
1261        projection: &P,
1262        k: usize,
1263    ) -> Vec<DrillDownResult> {
1264        let Some(&ci) = self.name_to_index.get(category_name) else {
1265            return Vec::new();
1266        };
1267        let summary = &self.summaries[ci];
1268
1269        if let Some(inner) = self.inner_spheres.get(&ci) {
1270            let query_pos = inner.projection.project(embedding);
1271            let mut results: Vec<DrillDownResult> = inner
1272                .inner_positions
1273                .iter()
1274                .enumerate()
1275                .map(|(local_idx, pos)| DrillDownResult {
1276                    item_index: inner.member_indices[local_idx],
1277                    distance: angular_distance(&query_pos, pos),
1278                    used_inner_sphere: true,
1279                })
1280                .collect();
1281            results.sort_by(|a, b| {
1282                a.distance
1283                    .partial_cmp(&b.distance)
1284                    .unwrap_or(std::cmp::Ordering::Equal)
1285            });
1286            results.truncate(k);
1287            results
1288        } else {
1289            let query_pos = projection.project(embedding);
1290            let mut results: Vec<DrillDownResult> = summary
1291                .member_indices
1292                .iter()
1293                .map(|&mi| DrillDownResult {
1294                    item_index: mi,
1295                    distance: angular_distance(&self.outer_positions[mi], &query_pos),
1296                    used_inner_sphere: false,
1297                })
1298                .collect();
1299            results.sort_by(|a, b| {
1300                a.distance
1301                    .partial_cmp(&b.distance)
1302                    .unwrap_or(std::cmp::Ordering::Equal)
1303            });
1304            results.truncate(k);
1305            results
1306        }
1307    }
1308
1309    /// Drill down into a domain group's inner sphere. Returns an empty
1310    /// vec if the group index is unknown or has no inner sphere.
1311    ///
1312    /// Used by the v2 default `nearest()` path: route to the closest
1313    /// group, then run k-NN against the group-level inner positions.
1314    pub fn drill_down_group(
1315        &self,
1316        group_index: usize,
1317        embedding: &Embedding,
1318        k: usize,
1319    ) -> Vec<DrillDownResult> {
1320        let Some(inner) = self.group_inner_spheres.get(&group_index) else {
1321            return Vec::new();
1322        };
1323        let query_pos = inner.projection.project(embedding);
1324        let mut results: Vec<DrillDownResult> = inner
1325            .inner_positions
1326            .iter()
1327            .enumerate()
1328            .map(|(local_idx, pos)| DrillDownResult {
1329                item_index: inner.member_indices[local_idx],
1330                distance: angular_distance(&query_pos, pos),
1331                used_inner_sphere: true,
1332            })
1333            .collect();
1334        results.sort_by(|a, b| {
1335            a.distance
1336                .partial_cmp(&b.distance)
1337                .unwrap_or(std::cmp::Ordering::Equal)
1338        });
1339        results.truncate(k);
1340        results
1341    }
1342
1343    /// Index of the domain group whose centroid is nearest to `pos`,
1344    /// along with the angular distance to the *second*-nearest group's
1345    /// centroid (or `f64::INFINITY` if there is only one group).
1346    ///
1347    /// The default `nearest()` routing in
1348    /// [`SphereQLPipeline`](crate::pipeline::SphereQLPipeline) drills
1349    /// into the nearest group's inner sphere only when
1350    /// `nearest_dist / second_dist < group_routing_alpha`; the second
1351    /// distance is returned alongside so the caller can apply that
1352    /// gate without recomputing.
1353    pub fn nearest_group(&self, pos: &SphericalPoint) -> Option<(usize, f64, f64)> {
1354        if self.domain_groups.is_empty() {
1355            return None;
1356        }
1357        let mut best = (0usize, f64::INFINITY);
1358        let mut second = f64::INFINITY;
1359        for (gi, g) in self.domain_groups.iter().enumerate() {
1360            let d = angular_distance(pos, &g.centroid);
1361            if d < best.1 {
1362                second = best.1;
1363                best = (gi, d);
1364            } else if d < second {
1365                second = d;
1366            }
1367        }
1368        Some((best.0, best.1, second))
1369    }
1370
1371    /// Report which categories have inner spheres, their projection type,
1372    /// and EVR metrics.
1373    pub fn inner_sphere_stats(&self) -> Vec<InnerSphereReport> {
1374        let mut reports: Vec<InnerSphereReport> = self
1375            .inner_spheres
1376            .iter()
1377            .map(|(&ci, inner)| {
1378                let proj_type = match &inner.projection {
1379                    InnerProjection::LinearPca(_) => "LinearPca",
1380                    InnerProjection::KernelPca(_) => "KernelPca",
1381                };
1382                InnerSphereReport {
1383                    category_name: self.summaries[ci].name.clone(),
1384                    category_index: ci,
1385                    member_count: inner.member_indices.len(),
1386                    projection_type: proj_type,
1387                    inner_evr: inner.explained_variance_ratio,
1388                    global_subset_evr: inner.global_subset_evr,
1389                    evr_improvement: inner.evr_improvement,
1390                }
1391            })
1392            .collect();
1393        reports.sort_by_key(|r| r.category_index);
1394        reports
1395    }
1396}
1397
1398// ── Helpers ────────────────────────────────────────────────────────────
1399
1400/// Min-heap entry keyed on `dist` for [`CategoryLayer::category_path`]'s
1401/// Dijkstra. `BinaryHeap` is a max-heap, so `Ord` is reversed.
1402/// `total_cmp` is NaN-safe (NaN sorts to the end).
1403#[derive(PartialEq)]
1404struct HeapEntry {
1405    dist: f64,
1406    node: usize,
1407}
1408
1409impl Eq for HeapEntry {}
1410
1411impl PartialOrd for HeapEntry {
1412    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
1413        Some(self.cmp(other))
1414    }
1415}
1416
1417impl Ord for HeapEntry {
1418    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
1419        other.dist.total_cmp(&self.dist)
1420    }
1421}
1422
1423// ── Tests ──────────────────────────────────────────────────────────────
1424
1425#[cfg(test)]
1426mod tests {
1427    use super::*;
1428
1429    fn emb(vals: &[f64]) -> Embedding {
1430        Embedding::new(vals.to_vec())
1431    }
1432
1433    // --- Phase 1 test helpers ---
1434
1435    fn test_corpus() -> (Vec<String>, Vec<Embedding>) {
1436        let categories = vec![
1437            "science".into(),
1438            "science".into(),
1439            "science".into(),
1440            "science".into(),
1441            "cooking".into(),
1442            "cooking".into(),
1443            "cooking".into(),
1444            "cooking".into(),
1445            "music".into(),
1446            "music".into(),
1447            "music".into(),
1448            "music".into(),
1449        ];
1450        let embeddings = vec![
1451            emb(&[1.0, 0.1, 0.0, 0.05, 0.02]),
1452            emb(&[0.9, 0.15, 0.05, 0.03, 0.01]),
1453            emb(&[0.95, 0.05, 0.1, 0.04, 0.03]),
1454            emb(&[0.85, 0.2, 0.0, 0.06, 0.01]),
1455            emb(&[0.1, 1.0, 0.0, 0.02, 0.05]),
1456            emb(&[0.15, 0.9, 0.05, 0.03, 0.04]),
1457            emb(&[0.05, 0.95, 0.1, 0.01, 0.06]),
1458            emb(&[0.2, 0.85, 0.0, 0.04, 0.03]),
1459            emb(&[0.0, 0.1, 1.0, 0.05, 0.02]),
1460            emb(&[0.05, 0.15, 0.9, 0.03, 0.01]),
1461            emb(&[0.1, 0.05, 0.95, 0.04, 0.03]),
1462            emb(&[0.0, 0.2, 0.85, 0.06, 0.01]),
1463        ];
1464        (categories, embeddings)
1465    }
1466
1467    fn build_test_layer() -> (CategoryLayer, Vec<Embedding>, PcaProjection) {
1468        let (categories, embeddings) = test_corpus();
1469        let pca = PcaProjection::fit(&embeddings, RadialStrategy::Fixed(1.0)).unwrap();
1470        let projected: Vec<SphericalPoint> = embeddings.iter().map(|e| pca.project(e)).collect();
1471        let evr = pca.explained_variance_ratio();
1472        let layer = CategoryLayer::build(&categories, &embeddings, &projected, &pca, evr);
1473        (layer, embeddings, pca)
1474    }
1475
1476    // --- Phase 2 test helpers ---
1477
1478    fn large_category_corpus() -> (Vec<String>, Vec<Embedding>) {
1479        let mut categories = Vec::new();
1480        let mut embeddings = Vec::new();
1481
1482        for i in 0..25 {
1483            categories.push("big".into());
1484            let t = i as f64 / 25.0;
1485            let mut v = vec![0.0; 10];
1486            v[0] = 1.0 + 0.3 * (t * std::f64::consts::TAU).sin();
1487            v[1] = 0.5 + 0.3 * (t * std::f64::consts::TAU).cos();
1488            v[2] = 0.2 * t;
1489            for (d, slot) in v.iter_mut().enumerate().take(10).skip(3) {
1490                *slot = 0.01 * ((i * 7 + d) as f64 % 1.0);
1491            }
1492            embeddings.push(emb(&v));
1493        }
1494
1495        for i in 0..4 {
1496            categories.push("small_a".into());
1497            let mut v = vec![0.0; 10];
1498            v[5] = 1.0 + 0.1 * i as f64;
1499            v[6] = 0.05;
1500            embeddings.push(emb(&v));
1501        }
1502
1503        for i in 0..4 {
1504            categories.push("small_b".into());
1505            let mut v = vec![0.0; 10];
1506            v[8] = 1.0 + 0.1 * i as f64;
1507            v[9] = 0.05;
1508            embeddings.push(emb(&v));
1509        }
1510
1511        (categories, embeddings)
1512    }
1513
1514    fn build_large_test_layer() -> (CategoryLayer, Vec<Embedding>, PcaProjection) {
1515        let (categories, embeddings) = large_category_corpus();
1516        let pca = PcaProjection::fit(&embeddings, RadialStrategy::Fixed(1.0)).unwrap();
1517        let projected: Vec<SphericalPoint> = embeddings.iter().map(|e| pca.project(e)).collect();
1518        let evr = pca.explained_variance_ratio();
1519        let layer = CategoryLayer::build(&categories, &embeddings, &projected, &pca, evr);
1520        (layer, embeddings, pca)
1521    }
1522
1523    // ======== Phase 1 tests (unchanged) ========
1524
1525    #[test]
1526    fn builds_correct_number_of_categories() {
1527        let (layer, _, _) = build_test_layer();
1528        assert_eq!(layer.num_categories(), 3);
1529    }
1530
1531    #[test]
1532    fn category_names_correct() {
1533        let (layer, _, _) = build_test_layer();
1534        let names: Vec<&str> = layer.summaries.iter().map(|s| s.name.as_str()).collect();
1535        assert!(names.contains(&"science"));
1536        assert!(names.contains(&"cooking"));
1537        assert!(names.contains(&"music"));
1538    }
1539
1540    #[test]
1541    fn member_counts_correct() {
1542        let (layer, _, _) = build_test_layer();
1543        for summary in &layer.summaries {
1544            assert_eq!(summary.member_count, 4);
1545            assert_eq!(summary.member_indices.len(), 4);
1546        }
1547    }
1548
1549    #[test]
1550    fn centroid_embedding_is_mean() {
1551        let (layer, embeddings, _) = build_test_layer();
1552        let science = layer.get_category("science").unwrap();
1553        let mut expected = vec![0.0; 5];
1554        for emb in embeddings.iter().take(4) {
1555            for (j, &v) in emb.values.iter().enumerate() {
1556                expected[j] += v;
1557            }
1558        }
1559        for v in &mut expected {
1560            *v /= 4.0;
1561        }
1562        for (j, (&actual, &exp)) in science
1563            .centroid_embedding
1564            .iter()
1565            .zip(expected.iter())
1566            .enumerate()
1567        {
1568            assert!(
1569                (actual - exp).abs() < 1e-10,
1570                "centroid dim {j}: {actual} != {exp}"
1571            );
1572        }
1573    }
1574
1575    #[test]
1576    fn angular_spread_is_nonnegative() {
1577        let (layer, _, _) = build_test_layer();
1578        for s in &layer.summaries {
1579            assert!(s.angular_spread >= 0.0);
1580        }
1581    }
1582
1583    #[test]
1584    fn cohesion_in_range() {
1585        let (layer, _, _) = build_test_layer();
1586        for s in &layer.summaries {
1587            assert!(s.cohesion > 0.0 && s.cohesion <= 1.0);
1588        }
1589    }
1590
1591    #[test]
1592    fn graph_has_edges_for_all_pairs() {
1593        let (layer, _, _) = build_test_layer();
1594        for (i, edges) in layer.graph.adjacency.iter().enumerate() {
1595            assert_eq!(edges.len(), layer.num_categories() - 1, "cat {i}");
1596        }
1597    }
1598
1599    #[test]
1600    fn edge_weights_positive() {
1601        let (layer, _, _) = build_test_layer();
1602        for edges in &layer.graph.adjacency {
1603            for e in edges {
1604                assert!(e.weight > 0.0);
1605                assert!(e.centroid_distance > 0.0);
1606            }
1607        }
1608    }
1609
1610    #[test]
1611    fn edges_sorted_by_weight() {
1612        let (layer, _, _) = build_test_layer();
1613        for edges in &layer.graph.adjacency {
1614            for w in edges.windows(2) {
1615                assert!(w[0].weight <= w[1].weight);
1616            }
1617        }
1618    }
1619
1620    #[test]
1621    fn edge_bridge_strength_fields_populated() {
1622        let (layer, _, _) = build_test_layer();
1623        for edges in &layer.graph.adjacency {
1624            for e in edges {
1625                assert!(e.max_bridge_strength >= 0.0 && e.max_bridge_strength <= 1.0);
1626                assert!(e.mean_bridge_strength >= 0.0 && e.mean_bridge_strength <= 1.0);
1627                assert!(e.mean_bridge_strength <= e.max_bridge_strength + 1e-10);
1628                if e.bridge_count > 0 {
1629                    assert!(e.max_bridge_strength > 0.0);
1630                    assert!(e.mean_bridge_strength > 0.0);
1631                } else {
1632                    assert!(e.max_bridge_strength == 0.0);
1633                    assert!(e.mean_bridge_strength == 0.0);
1634                }
1635            }
1636        }
1637    }
1638
1639    #[test]
1640    fn edge_weight_incorporates_bridge_strength() {
1641        let (layer, _, _) = build_test_layer();
1642        for edges in &layer.graph.adjacency {
1643            for e in edges {
1644                // The Voronoi bonus can only reduce weight, so weight ≤ base formula.
1645                // The territorial factor can also reduce bridge strength.
1646                let base_no_bonus =
1647                    e.centroid_distance / (1.0 + e.bridge_count as f64 * e.mean_bridge_strength);
1648                assert!(
1649                    e.weight <= base_no_bonus + 1e-10,
1650                    "weight {:.6} should be ≤ base {:.6} (Voronoi bonus reduces it)",
1651                    e.weight,
1652                    base_no_bonus,
1653                );
1654                assert!(e.weight > 0.0, "weight must be positive");
1655            }
1656        }
1657    }
1658
1659    #[test]
1660    fn get_category_by_name() {
1661        let (layer, _, _) = build_test_layer();
1662        assert!(layer.get_category("science").is_some());
1663        assert!(layer.get_category("astrology").is_none());
1664    }
1665
1666    #[test]
1667    fn category_neighbors_returns_sorted() {
1668        let (layer, _, _) = build_test_layer();
1669        assert_eq!(layer.category_neighbors("science", 2).len(), 2);
1670    }
1671
1672    #[test]
1673    fn category_neighbors_k_larger_than_available() {
1674        let (layer, _, _) = build_test_layer();
1675        assert_eq!(layer.category_neighbors("science", 100).len(), 2);
1676    }
1677
1678    #[test]
1679    fn category_neighbors_unknown_returns_empty() {
1680        let (layer, _, _) = build_test_layer();
1681        assert!(layer.category_neighbors("nonexistent", 5).is_empty());
1682    }
1683
1684    #[test]
1685    fn bridge_items_detected() {
1686        let (layer, _, _) = build_test_layer();
1687        let _ = layer.bridge_items("science", "cooking", 10);
1688    }
1689
1690    #[test]
1691    fn bridge_items_unknown_category_returns_empty() {
1692        let (layer, _, _) = build_test_layer();
1693        assert!(layer.bridge_items("science", "nonexistent", 10).is_empty());
1694    }
1695
1696    #[test]
1697    fn bridge_classification_populated() {
1698        let (layer, _, _) = build_test_layer();
1699        for bridges in layer.graph.bridges.values() {
1700            for b in bridges {
1701                // Every bridge should have one of the three classifications.
1702                assert!(
1703                    b.classification == BridgeClassification::Genuine
1704                        || b.classification == BridgeClassification::OverlapArtifact
1705                        || b.classification == BridgeClassification::Weak
1706                );
1707            }
1708        }
1709    }
1710
1711    #[test]
1712    fn low_evr_skips_territorial_classification() {
1713        // When `spatial.evr` falls below `min_evr_for_classification`,
1714        // every bridge should fall back to `Weak`. We force the gate
1715        // by raising the threshold above the measured EVR rather than
1716        // synthesizing a low-EVR projection.
1717        let (categories, embeddings) = test_corpus();
1718        let pca = PcaProjection::fit(&embeddings, RadialStrategy::Fixed(1.0)).unwrap();
1719        let projected: Vec<SphericalPoint> = embeddings.iter().map(|e| pca.project(e)).collect();
1720
1721        let mut config = PipelineConfig::default();
1722        // Set the gate above the natural ceiling so the early-skip
1723        // branch is exercised regardless of corpus EVR.
1724        config.bridges.min_evr_for_classification = 1.5;
1725
1726        let layer = CategoryLayer::build_with_config(
1727            &categories,
1728            &embeddings,
1729            &projected,
1730            &pca,
1731            0.10,
1732            &config,
1733        );
1734
1735        for bridges in layer.graph.bridges.values() {
1736            for b in bridges {
1737                assert_eq!(
1738                    b.classification,
1739                    BridgeClassification::Weak,
1740                    "low EVR should label every bridge Weak, got {:?}",
1741                    b.classification
1742                );
1743            }
1744        }
1745    }
1746
1747    #[test]
1748    fn bridge_quality_nonnegative() {
1749        let (layer, _, _) = build_test_layer();
1750        for s in &layer.summaries {
1751            assert!(
1752                s.bridge_quality >= 0.0,
1753                "{} has negative bridge_quality",
1754                s.name
1755            );
1756        }
1757    }
1758
1759    #[test]
1760    fn bridge_quality_matrix_symmetric_ish() {
1761        let (layer, _, _) = build_test_layer();
1762        let m = &layer.spatial_quality.bridge_quality_matrix;
1763        let n = m.len();
1764        assert_eq!(n, layer.num_categories());
1765        for (i, row) in m.iter().enumerate() {
1766            assert_eq!(row.len(), n);
1767            assert_eq!(row[i], 0.0, "diagonal should be zero");
1768        }
1769    }
1770
1771    #[test]
1772    fn bridge_strength_in_valid_range() {
1773        let (layer, _, _) = build_test_layer();
1774        for list in layer.graph.bridges.values() {
1775            for b in list {
1776                assert!(b.bridge_strength >= 0.0 && b.bridge_strength <= 1.0);
1777            }
1778        }
1779    }
1780
1781    #[test]
1782    fn bridges_sorted_by_strength() {
1783        let (layer, _, _) = build_test_layer();
1784        for list in layer.graph.bridges.values() {
1785            for w in list.windows(2) {
1786                assert!(w[0].bridge_strength >= w[1].bridge_strength);
1787            }
1788        }
1789    }
1790
1791    #[test]
1792    fn category_path_same_category() {
1793        let (layer, _, _) = build_test_layer();
1794        let path = layer.category_path("science", "science").unwrap();
1795        assert_eq!(path.steps.len(), 1);
1796        assert!(path.total_distance.abs() < 1e-12);
1797    }
1798
1799    #[test]
1800    fn category_path_adjacent() {
1801        let (layer, _, _) = build_test_layer();
1802        let path = layer.category_path("science", "cooking").unwrap();
1803        assert!(path.steps.len() >= 2);
1804        assert_eq!(path.steps.first().unwrap().category_name, "science");
1805        assert_eq!(path.steps.last().unwrap().category_name, "cooking");
1806        assert!(path.total_distance > 0.0);
1807    }
1808
1809    #[test]
1810    fn category_path_unknown_returns_none() {
1811        let (layer, _, _) = build_test_layer();
1812        assert!(layer.category_path("science", "nonexistent").is_none());
1813    }
1814
1815    #[test]
1816    fn category_path_distances_monotonic() {
1817        let (layer, _, _) = build_test_layer();
1818        let path = layer.category_path("science", "music").unwrap();
1819        for w in path.steps.windows(2) {
1820            assert!(w[1].cumulative_distance >= w[0].cumulative_distance);
1821        }
1822    }
1823
1824    #[test]
1825    fn categories_near_embedding_finds_correct() {
1826        let (layer, _, pca) = build_test_layer();
1827        let near = layer.categories_near_embedding(
1828            &emb(&[1.0, 0.0, 0.0, 0.0, 0.0]),
1829            &pca,
1830            std::f64::consts::PI,
1831        );
1832        assert!(!near.is_empty());
1833        assert_eq!(layer.summaries[near[0].0].name, "science");
1834    }
1835
1836    #[test]
1837    fn categories_near_embedding_sorted_by_distance() {
1838        let (layer, _, pca) = build_test_layer();
1839        let near = layer.categories_near_embedding(
1840            &emb(&[0.5, 0.5, 0.5, 0.0, 0.0]),
1841            &pca,
1842            std::f64::consts::PI,
1843        );
1844        for w in near.windows(2) {
1845            assert!(w[0].1 <= w[1].1);
1846        }
1847    }
1848
1849    #[test]
1850    fn categories_near_embedding_respects_threshold() {
1851        let (layer, _, pca) = build_test_layer();
1852        let near = layer.categories_near_embedding(&emb(&[1.0, 0.0, 0.0, 0.0, 0.0]), &pca, 0.01);
1853        for &(_, d) in &near {
1854            assert!(d <= 0.01);
1855        }
1856    }
1857
1858    #[test]
1859    fn cosine_similarity_identical() {
1860        assert!(
1861            (cosine_similarity(&[1.0, 0.0, 0.0], &[1.0, 0.0, 0.0]).unwrap() - 1.0).abs() < 1e-12
1862        );
1863    }
1864
1865    #[test]
1866    fn cosine_similarity_orthogonal() {
1867        assert!(
1868            cosine_similarity(&[1.0, 0.0, 0.0], &[0.0, 1.0, 0.0])
1869                .unwrap()
1870                .abs()
1871                < 1e-12
1872        );
1873    }
1874
1875    #[test]
1876    fn cosine_similarity_opposite() {
1877        assert!(
1878            (cosine_similarity(&[1.0, 0.0, 0.0], &[-1.0, 0.0, 0.0]).unwrap() + 1.0).abs() < 1e-12
1879        );
1880    }
1881
1882    #[test]
1883    fn cosine_similarity_zero_vector() {
1884        assert!(
1885            cosine_similarity(&[0.0, 0.0, 0.0], &[1.0, 0.0, 0.0])
1886                .unwrap()
1887                .abs()
1888                < 1e-12
1889        );
1890    }
1891
1892    // ======== Phase 2 tests (inner spheres) ========
1893
1894    #[test]
1895    fn small_categories_get_no_inner_sphere() {
1896        let (layer, _, _) = build_test_layer();
1897        assert_eq!(layer.num_inner_spheres(), 0);
1898        assert!(!layer.has_inner_sphere("science"));
1899    }
1900
1901    #[test]
1902    fn large_category_may_get_inner_sphere() {
1903        let (layer, _, _) = build_large_test_layer();
1904        assert!(!layer.has_inner_sphere("small_a"));
1905        assert!(!layer.has_inner_sphere("small_b"));
1906        let _ = layer.has_inner_sphere("big");
1907    }
1908
1909    #[test]
1910    fn inner_sphere_stats_count_matches() {
1911        let (layer, _, _) = build_large_test_layer();
1912        assert_eq!(layer.inner_sphere_stats().len(), layer.num_inner_spheres());
1913    }
1914
1915    #[test]
1916    fn inner_sphere_stats_sorted_by_index() {
1917        let (layer, _, _) = build_large_test_layer();
1918        let stats = layer.inner_sphere_stats();
1919        for w in stats.windows(2) {
1920            assert!(w[0].category_index <= w[1].category_index);
1921        }
1922    }
1923
1924    #[test]
1925    fn inner_sphere_evr_improvement_positive() {
1926        let (layer, _, _) = build_large_test_layer();
1927        let min_improvement = PipelineConfig::default().inner_sphere.min_evr_improvement;
1928        for inner in layer.inner_spheres.values() {
1929            assert!(inner.evr_improvement >= min_improvement);
1930        }
1931    }
1932
1933    #[test]
1934    fn inner_sphere_positions_match_member_count() {
1935        let (layer, _, _) = build_large_test_layer();
1936        for (&ci, inner) in &layer.inner_spheres {
1937            assert_eq!(inner.inner_positions.len(), inner.member_indices.len());
1938            assert_eq!(inner.member_indices.len(), layer.summaries[ci].member_count);
1939        }
1940    }
1941
1942    #[test]
1943    fn inner_sphere_member_indices_valid() {
1944        let (layer, _, _) = build_large_test_layer();
1945        let total = layer.outer_positions.len();
1946        for inner in layer.inner_spheres.values() {
1947            for &mi in &inner.member_indices {
1948                assert!(mi < total);
1949            }
1950        }
1951    }
1952
1953    #[test]
1954    fn inner_sphere_report_projection_type_valid() {
1955        let (layer, _, _) = build_large_test_layer();
1956        for r in layer.inner_sphere_stats() {
1957            assert!(r.projection_type == "LinearPca" || r.projection_type == "KernelPca");
1958        }
1959    }
1960
1961    #[test]
1962    fn inner_sphere_evr_in_range() {
1963        let (layer, _, _) = build_large_test_layer();
1964        for inner in layer.inner_spheres.values() {
1965            assert!(inner.explained_variance_ratio >= 0.0 && inner.explained_variance_ratio <= 1.0);
1966            assert!(inner.global_subset_evr >= 0.0 && inner.global_subset_evr <= 1.0);
1967        }
1968    }
1969
1970    #[test]
1971    fn has_inner_sphere_unknown_category() {
1972        let (layer, _, _) = build_test_layer();
1973        assert!(!layer.has_inner_sphere("nonexistent"));
1974    }
1975
1976    #[test]
1977    fn get_inner_sphere_returns_none_for_small() {
1978        let (layer, _, _) = build_test_layer();
1979        assert!(layer.get_inner_sphere("science").is_none());
1980    }
1981
1982    #[test]
1983    fn drill_down_returns_results() {
1984        let (layer, _, pca) = build_large_test_layer();
1985        let q = emb(&[1.0, 0.5, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
1986        let results = layer.drill_down_with_projection("big", &q, &pca, 5);
1987        assert!(!results.is_empty());
1988        assert!(results.len() <= 5);
1989    }
1990
1991    #[test]
1992    fn drill_down_sorted_by_distance() {
1993        let (layer, _, pca) = build_large_test_layer();
1994        let q = emb(&[1.0, 0.5, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
1995        let results = layer.drill_down_with_projection("big", &q, &pca, 10);
1996        for w in results.windows(2) {
1997            assert!(w[0].distance <= w[1].distance);
1998        }
1999    }
2000
2001    #[test]
2002    fn drill_down_unknown_category_empty() {
2003        let (layer, _, pca) = build_large_test_layer();
2004        assert!(
2005            layer
2006                .drill_down_with_projection("nonexistent", &emb(&[1.0; 10]), &pca, 5)
2007                .is_empty()
2008        );
2009    }
2010
2011    #[test]
2012    fn drill_down_item_indices_valid() {
2013        let (layer, _, pca) = build_large_test_layer();
2014        let q = emb(&[1.0, 0.5, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
2015        let total = layer.outer_positions.len();
2016        for r in layer.drill_down_with_projection("big", &q, &pca, 25) {
2017            assert!(r.item_index < total);
2018        }
2019    }
2020
2021    #[test]
2022    fn drill_down_small_category_uses_outer() {
2023        let (layer, _, pca) = build_large_test_layer();
2024        let q = emb(&[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]);
2025        for r in layer.drill_down_with_projection("small_a", &q, &pca, 4) {
2026            assert!(!r.used_inner_sphere);
2027        }
2028    }
2029
2030    #[test]
2031    fn drill_down_distances_nonnegative() {
2032        let (layer, _, pca) = build_large_test_layer();
2033        for r in layer.drill_down_with_projection("big", &emb(&[1.0; 10]), &pca, 10) {
2034            assert!(r.distance >= 0.0);
2035        }
2036    }
2037
2038    #[test]
2039    fn inner_projection_enum_debug() {
2040        let corpus: Vec<Embedding> = (0..5)
2041            .map(|i| emb(&[i as f64, 0.0, 0.0, 0.0, 0.0]))
2042            .collect();
2043        let pca = PcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
2044        assert_eq!(
2045            format!("{:?}", InnerProjection::LinearPca(pca)),
2046            "LinearPca"
2047        );
2048    }
2049
2050    #[test]
2051    fn inner_projection_projects_correctly() {
2052        let corpus: Vec<Embedding> = (0..5)
2053            .map(|i| emb(&[i as f64, 0.0, 0.0, 0.0, 0.0]))
2054            .collect();
2055        let pca = PcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
2056        let proj = InnerProjection::LinearPca(pca.clone());
2057        let e = emb(&[1.0, 0.0, 0.0, 0.0, 0.0]);
2058        let sp_enum = proj.project(&e);
2059        let sp_direct = pca.project(&e);
2060        assert!((sp_enum.theta - sp_direct.theta).abs() < 1e-12);
2061        assert!((sp_enum.phi - sp_direct.phi).abs() < 1e-12);
2062    }
2063
2064    #[test]
2065    fn inner_projection_dimensionality() {
2066        let corpus: Vec<Embedding> = (0..5)
2067            .map(|i| emb(&[i as f64, 0.0, 0.0, 0.0, 0.0]))
2068            .collect();
2069        let pca = PcaProjection::fit(&corpus, RadialStrategy::Fixed(1.0)).unwrap();
2070        assert_eq!(InnerProjection::LinearPca(pca).dimensionality(), 5);
2071    }
2072
2073    // ======== min_category_size filtering ========
2074
2075    #[test]
2076    fn min_category_size_excludes_small_categories() {
2077        let (categories, embeddings) = test_corpus(); // 3 categories × 4 items each
2078        let pca = PcaProjection::fit(&embeddings, RadialStrategy::Fixed(1.0)).unwrap();
2079        let projected: Vec<SphericalPoint> = embeddings.iter().map(|e| pca.project(e)).collect();
2080        let evr = pca.explained_variance_ratio();
2081
2082        let config = PipelineConfig {
2083            min_category_size: 5,
2084            ..Default::default()
2085        };
2086        let layer = CategoryLayer::build_with_config(
2087            &categories,
2088            &embeddings,
2089            &projected,
2090            &pca,
2091            evr,
2092            &config,
2093        );
2094
2095        assert_eq!(
2096            layer.num_categories(),
2097            0,
2098            "all categories below threshold should be excluded"
2099        );
2100        assert!(layer.graph.adjacency.is_empty());
2101        assert!(layer.graph.bridges.is_empty());
2102    }
2103
2104    #[test]
2105    fn min_category_size_one_includes_everything() {
2106        let (categories, embeddings) = test_corpus();
2107        let pca = PcaProjection::fit(&embeddings, RadialStrategy::Fixed(1.0)).unwrap();
2108        let projected: Vec<SphericalPoint> = embeddings.iter().map(|e| pca.project(e)).collect();
2109        let evr = pca.explained_variance_ratio();
2110
2111        let config = PipelineConfig {
2112            min_category_size: 1,
2113            ..Default::default()
2114        };
2115        let layer = CategoryLayer::build_with_config(
2116            &categories,
2117            &embeddings,
2118            &projected,
2119            &pca,
2120            evr,
2121            &config,
2122        );
2123
2124        assert_eq!(
2125            layer.num_categories(),
2126            3,
2127            "all categories should be included at min_size=1"
2128        );
2129    }
2130
2131    #[test]
2132    fn min_category_size_partial_filter() {
2133        // One large category (10 items) and two small ones (2 each).
2134        let mut categories = Vec::new();
2135        let mut embeddings = Vec::new();
2136        for i in 0..10 {
2137            categories.push("big".to_string());
2138            let mut v = vec![0.0; 5];
2139            v[0] = 1.0 + i as f64 * 0.01;
2140            embeddings.push(emb(&v));
2141        }
2142        for i in 0..2 {
2143            categories.push("small_a".to_string());
2144            let mut v = vec![0.0; 5];
2145            v[1] = 1.0 + i as f64 * 0.01;
2146            embeddings.push(emb(&v));
2147        }
2148        for i in 0..2 {
2149            categories.push("small_b".to_string());
2150            let mut v = vec![0.0; 5];
2151            v[2] = 1.0 + i as f64 * 0.01;
2152            embeddings.push(emb(&v));
2153        }
2154
2155        let pca = PcaProjection::fit(&embeddings, RadialStrategy::Fixed(1.0)).unwrap();
2156        let projected: Vec<SphericalPoint> = embeddings.iter().map(|e| pca.project(e)).collect();
2157        let evr = pca.explained_variance_ratio();
2158
2159        let config = PipelineConfig {
2160            min_category_size: 5,
2161            ..Default::default()
2162        };
2163        let layer = CategoryLayer::build_with_config(
2164            &categories,
2165            &embeddings,
2166            &projected,
2167            &pca,
2168            evr,
2169            &config,
2170        );
2171
2172        assert_eq!(layer.num_categories(), 1);
2173        assert_eq!(layer.summaries[0].name, "big");
2174        assert_eq!(layer.summaries[0].member_count, 10);
2175    }
2176
2177    #[test]
2178    fn infer_relation_historical() {
2179        assert_eq!(
2180            infer_relation("History of technology", "history", "engineering"),
2181            Some(RelationType::HistoricalLink)
2182        );
2183    }
2184
2185    #[test]
2186    fn infer_relation_method() {
2187        assert_eq!(
2188            infer_relation("Spectroscopy", "chemistry", "physics"),
2189            Some(RelationType::SharedMethod)
2190        );
2191    }
2192
2193    #[test]
2194    fn infer_relation_none() {
2195        assert_eq!(infer_relation("Quantum computing", "physics", "cs"), None);
2196    }
2197
2198    #[test]
2199    fn relation_type_name_stable() {
2200        assert_eq!(RelationType::Studies.name(), "studies");
2201        assert_eq!(RelationType::HistoricalLink.name(), "historical_link");
2202    }
2203}