Skip to main content

anno_core/core/
grounded.rs

1//! Grounded entity representation with unified Signal → Track → Identity hierarchy.
2//!
3//! # Research Motivation
4//!
5//! Note: `anno` is text-first. The broad `Location` substrate here is intentionally future-facing.
6//! See `docs/LOCATION.md` in the repo for the philosophy and practical guidance.
7//!
8//! Traditional NER systems conflate three distinct levels of entity processing:
9//!
10//! 1. **Signal Detection** (Level 1): "There's something here" - localization + classification
11//! 2. **Track Formation** (Level 2): "These mentions are the same entity within this document"
12//! 3. **Identity Resolution** (Level 3): "This entity is Q7186 in Wikidata"
13//!
14//! This conflation causes issues:
15//! - Embedding models struggle when a single `Entity` type represents both mentions and KB entries
16//! - Cross-document coreference requires different similarity metrics than within-document
17//! - The "modal gap" between text spans and KB entities creates representation mismatches
18//!
19//! # The Isomorphism: Vision Detection ↔ NER
20//!
21//! ```text
22//! ┌─────────────────────────────────────────────────────────────────────────┐
23//! │                    VISION                    TEXT (NER)                 │
24//! ├─────────────────────────────────────────────────────────────────────────┤
25//! │ Localization Unit  │ BoundingBox (x,y,w,h)  │ TextSpan (start,end)     │
26//! │ Signal             │ Detection              │ Mention                  │
27//! │ Track (Level 2)    │ Tracklet (MOT)         │ CorefChain              │
28//! │ Identity (Level 3) │ Face Recognition       │ Entity Linking          │
29//! │ Region Proposal    │ RPN / DETR queries     │ Span enumeration        │
30//! │ Modality           │ Iconic (physics)       │ Symbolic (convention)   │
31//! └─────────────────────────────────────────────────────────────────────────┘
32//! ```
33//!
34//! The key insight: **detection is modality-agnostic**. Whether detecting "Steve Jobs"
35//! in text or a face in an image, the fundamental operation is:
36//!
37//! ```text
38//! Detection = Localization (where?) × Classification (what?)
39//! ```
40//!
41//! # Semiotic Gap: Icon vs Symbol
42//!
43//! A crucial nuance distinguishes text from vision:
44//!
45//! - **Iconic signs** (vision): The signifier physically resembles the signified.
46//!   A photo of a cat looks like a cat. Detection is about physics/geometry.
47//!
48//! - **Symbolic signs** (text): The signifier is arbitrary convention.
49//!   "cat" doesn't look like a cat. Detection requires learning cultural codes.
50//!
51//! This explains why text NER requires more sophisticated linguistic features
52//! (negation, quantification, recursion) that have no visual analogue.
53//!
54//! # Architecture: Entity-Centric Representation
55//!
56//! ```text
57//! ┌─────────────────────────────────────────────────────────────────────────┐
58//! │                      GroundedDocument                                   │
59//! ├─────────────────────────────────────────────────────────────────────────┤
60//! │                                                                         │
61//! │  identities: HashMap<IdentityId, Identity>                              │
62//! │       │                                                                 │
63//! │       └──► Identity { kb_id, canonical_name, embedding, ... }           │
64//! │                 │                                                       │
65//! │  tracks: HashMap<TrackId, Track<S>>                                     │
66//! │       │                                                                 │
67//! │       └──► Track { identity_id, signals: Vec<SignalRef>, ... }          │
68//! │                 │                                                       │
69//! │  signals: Vec<Signal<S>>                                                │
70//! │       │                                                                 │
71//! │       └──► Signal { location: S, label, confidence, ... }               │
72//! │                                                                         │
73//! └─────────────────────────────────────────────────────────────────────────┘
74//! ```
75//!
76//! This entity-centric design enables:
77//! - Efficient streaming/incremental coreference (signals → tracks incrementally)
78//! - Clear separation of detection, clustering, and linking
79//! - Unified treatment of text and visual signals
80//!
81//! # References
82//!
83//! - GLiNER: Bi-encoder span-label matching for zero-shot NER
84//! - DETR: End-to-end object detection with transformers
85//! - Pix2Seq: "Everything is a token" - bounding boxes as spatial tokens
86//! - CDLKT: Cross-document Language-Knowledge Transfer
87//! - Groma: Grounded multimodal assistant
88
89use super::confidence::Confidence;
90use super::entity::{
91    DiscontinuousSpan, Entity, EntityType, HierarchicalConfidence, Provenance, Span,
92};
93use serde::{Deserialize, Serialize};
94use std::collections::HashMap;
95
96// =============================================================================
97// Modality: The Semiotic Distinction
98// =============================================================================
99
/// The semiotic modality of a signal source.
///
/// This captures a fundamental distinction in how meaning is encoded:
///
/// - **Iconic**: Physical resemblance (photos, audio waveforms)
/// - **Symbolic**: Arbitrary convention (text, notation)
/// - **Hybrid**: Both at once (OCR text in images, captions)
///
/// Indexical signs (causal connection, e.g. smoke → fire) are rare in our
/// domain and are not modeled by this enum.
///
/// # Why This Matters
///
/// The modality affects what linguistic features are relevant:
///
/// | Feature | Iconic (Vision) | Symbolic (Text) |
/// |---------|-----------------|-----------------|
/// | Negation | No analogue | "not a doctor" |
/// | Quantification | Approximate | "every/some/no" |
/// | Recursion | Rare | Nested NPs |
/// | Compositionality | Limited | Full |
///
/// Detection in iconic modalities is more about geometry and physics.
/// Detection in symbolic modalities requires cultural/linguistic knowledge.
///
/// # Default
///
/// `Modality::default()` is [`Modality::Symbolic`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub enum Modality {
    /// Iconic sign: signifier resembles signified (images, audio).
    /// Detection is primarily geometric/physical.
    Iconic,
    /// Symbolic sign: arbitrary convention (text, notation).
    /// Detection requires linguistic/cultural knowledge.
    ///
    /// This is the default variant.
    #[default]
    Symbolic,
    /// Hybrid: OCR text in images, captions, etc.
    /// Has both iconic (visual layout) and symbolic (text content) aspects.
    Hybrid,
}
134
135impl Modality {
136    /// Check if linguistic features (negation, quantification) are relevant.
137    #[must_use]
138    pub const fn supports_linguistic_features(&self) -> bool {
139        matches!(self, Self::Symbolic | Self::Hybrid)
140    }
141
142    /// Check if geometric features (bbox, IoU) are relevant.
143    #[must_use]
144    pub const fn supports_geometric_features(&self) -> bool {
145        matches!(self, Self::Iconic | Self::Hybrid)
146    }
147}
148
149// =============================================================================
150// Location: The Universal Localization Unit
151// =============================================================================
152
/// A location in some source medium.
///
/// This is the universal "localization unit" that enables the isomorphism
/// between vision detection and NER. Both tasks answer "where is it?"
/// just in different coordinate systems.
///
/// # Relationship to `Span`
///
/// [`entity::Span`] is a simplified subset of `Location` for the detection layer:
///
/// | `Location` variant | `Span` equivalent |
/// |--------------------|-------------------|
/// | `Text` | `Span::Text` |
/// | `BoundingBox` | `Span::BoundingBox` |
/// | `TextWithBbox` | `Span::Hybrid` |
/// | `Temporal` | *none* |
/// | `Cuboid` | *none* |
/// | `Genomic` | *none* |
/// | `Discontinuous` | *none* (use `DiscontinuousSpan`) |
///
/// Use [`to_span()`](Self::to_span) to convert where possible.
///
/// # Offsets
///
/// Text offsets are counted in `char`s (Unicode scalar values), not bytes:
/// `Signal::validate_against` and `Signal::from_text_nth` both index the
/// source via `str::chars`.
///
/// # Design Note
///
/// We use an enum rather than a trait to enable:
/// - Efficient storage in contiguous arrays
/// - Easy serialization
/// - Exhaustive matching for safety
///
/// [`entity::Span`]: super::entity::Span
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum Location {
    /// Text span: 1D interval [start, end) in character offsets.
    Text {
        /// Start character offset (inclusive)
        start: usize,
        /// End character offset (exclusive)
        end: usize,
    },
    /// Visual bounding box: 2D rectangle in normalized `[0,1]` coordinates.
    ///
    /// The normalization is a convention only; nothing in this type enforces it.
    BoundingBox {
        /// X coordinate of top-left corner
        x: f32,
        /// Y coordinate of top-left corner
        y: f32,
        /// Width
        width: f32,
        /// Height
        height: f32,
        /// Page number for multi-page documents
        page: Option<u32>,
    },
    /// Temporal interval: for audio/video signals.
    Temporal {
        /// Start time in seconds
        start_sec: f64,
        /// End time in seconds
        end_sec: f64,
        /// Optional frame number for video
        frame: Option<u64>,
    },
    /// 3D cuboid: for LiDAR/point cloud signals.
    Cuboid {
        /// Center position (x, y, z)
        center: [f32; 3],
        /// Dimensions (width, height, depth)
        dimensions: [f32; 3],
        /// Rotation (quaternion: w, x, y, z)
        rotation: [f32; 4],
    },
    /// Genomic interval: 1D interval in sequence coordinates.
    Genomic {
        /// Chromosome/contig identifier
        contig: String,
        /// Start position (0-based, inclusive)
        start: u64,
        /// End position (0-based, exclusive)
        end: u64,
        /// Strand (+/-)
        strand: Option<char>,
    },
    /// Discontinuous text span: non-contiguous regions.
    Discontinuous {
        /// Multiple text intervals, each a `(start, end)` pair.
        ///
        /// `text_offsets()` reports the smallest start and largest end across
        /// all segments, and `None` when this list is empty.
        segments: Vec<(usize, usize)>,
    },
    /// Hybrid: text with visual location (OCR).
    TextWithBbox {
        /// Text start offset
        start: usize,
        /// Text end offset
        end: usize,
        /// Visual bounding box.
        ///
        /// The type permits any `Location` here; `to_span()` returns `None`
        /// when the nested value has no `Span` equivalent.
        bbox: Box<Location>,
    },
}
249
250impl Location {
251    /// Create a text location.
252    #[must_use]
253    pub const fn text(start: usize, end: usize) -> Self {
254        Self::Text { start, end }
255    }
256
257    /// Create a bounding box location.
258    #[must_use]
259    pub fn bbox(x: f32, y: f32, width: f32, height: f32) -> Self {
260        Self::BoundingBox {
261            x,
262            y,
263            width,
264            height,
265            page: None,
266        }
267    }
268
269    /// Get the modality of this location.
270    #[must_use]
271    pub const fn modality(&self) -> Modality {
272        match self {
273            Self::Text { .. } | Self::Genomic { .. } | Self::Discontinuous { .. } => {
274                Modality::Symbolic
275            }
276            Self::BoundingBox { .. } | Self::Cuboid { .. } => Modality::Iconic,
277            Self::Temporal { .. } => Modality::Iconic, // Audio/video is iconic
278            Self::TextWithBbox { .. } => Modality::Hybrid,
279        }
280    }
281
282    /// Get text offsets if this is a text location.
283    #[must_use]
284    pub fn text_offsets(&self) -> Option<(usize, usize)> {
285        match self {
286            Self::Text { start, end } => Some((*start, *end)),
287            Self::TextWithBbox { start, end, .. } => Some((*start, *end)),
288            Self::Discontinuous { segments } => {
289                let start = segments.iter().map(|(s, _)| *s).min()?;
290                let end = segments.iter().map(|(_, e)| *e).max()?;
291                Some((start, end))
292            }
293            _ => None,
294        }
295    }
296
297    /// Check if two locations overlap.
298    #[must_use]
299    pub fn overlaps(&self, other: &Self) -> bool {
300        match (self, other) {
301            (Self::Text { start: s1, end: e1 }, Self::Text { start: s2, end: e2 }) => {
302                s1 < e2 && s2 < e1
303            }
304            (
305                Self::BoundingBox {
306                    x: x1,
307                    y: y1,
308                    width: w1,
309                    height: h1,
310                    page: p1,
311                },
312                Self::BoundingBox {
313                    x: x2,
314                    y: y2,
315                    width: w2,
316                    height: h2,
317                    page: p2,
318                },
319            ) => {
320                // Pages must match (or both None)
321                if p1 != p2 {
322                    return false;
323                }
324                // Standard 2D rectangle overlap
325                x1 < &(x2 + w2) && &(x1 + w1) > x2 && y1 < &(y2 + h2) && &(y1 + h1) > y2
326            }
327            _ => false, // Different types don't overlap
328        }
329    }
330
331    /// Calculate IoU (Intersection over Union) for compatible location types.
332    ///
333    /// Returns None if the locations are incompatible (e.g., text vs bbox).
334    #[must_use]
335    pub fn iou(&self, other: &Self) -> Option<f64> {
336        match (self, other) {
337            (Self::Text { start: s1, end: e1 }, Self::Text { start: s2, end: e2 }) => {
338                let intersection_start = (*s1).max(*s2);
339                let intersection_end = (*e1).min(*e2);
340                if intersection_start >= intersection_end {
341                    return Some(0.0);
342                }
343                let intersection = (intersection_end - intersection_start) as f64;
344                let union = ((*e1).max(*e2) - (*s1).min(*s2)) as f64;
345                if union == 0.0 {
346                    Some(0.0)
347                } else {
348                    Some(intersection / union)
349                }
350            }
351            (
352                Self::BoundingBox {
353                    x: x1,
354                    y: y1,
355                    width: w1,
356                    height: h1,
357                    page: p1,
358                },
359                Self::BoundingBox {
360                    x: x2,
361                    y: y2,
362                    width: w2,
363                    height: h2,
364                    page: p2,
365                },
366            ) => {
367                if p1 != p2 {
368                    return Some(0.0);
369                }
370                let x_overlap = (x1 + w1).min(x2 + w2) - x1.max(*x2);
371                let y_overlap = (y1 + h1).min(y2 + h2) - y1.max(*y2);
372                if x_overlap <= 0.0 || y_overlap <= 0.0 {
373                    return Some(0.0);
374                }
375                let intersection = (x_overlap * y_overlap) as f64;
376                let area1 = (*w1 * *h1) as f64;
377                let area2 = (*w2 * *h2) as f64;
378                let union = area1 + area2 - intersection;
379                if union == 0.0 {
380                    Some(0.0)
381                } else {
382                    Some(intersection / union)
383                }
384            }
385            _ => None,
386        }
387    }
388}
389
390impl Default for Location {
391    fn default() -> Self {
392        Self::Text { start: 0, end: 0 }
393    }
394}
395
396impl From<&Span> for Location {
397    fn from(span: &Span) -> Self {
398        match span {
399            Span::Text { start, end } => Self::Text {
400                start: *start,
401                end: *end,
402            },
403            Span::BoundingBox {
404                x,
405                y,
406                width,
407                height,
408                page,
409            } => Self::BoundingBox {
410                x: *x,
411                y: *y,
412                width: *width,
413                height: *height,
414                page: *page,
415            },
416            Span::Hybrid { start, end, bbox } => Self::TextWithBbox {
417                start: *start,
418                end: *end,
419                bbox: Box::new(Location::from(bbox.as_ref())),
420            },
421        }
422    }
423}
424
425impl From<Span> for Location {
426    fn from(span: Span) -> Self {
427        Self::from(&span)
428    }
429}
430
431/// Convert `Location` to `Span` where possible.
432///
433/// Not all `Location` variants have a corresponding `Span`:
434/// - `Location::Text` → `Span::Text`
435/// - `Location::BoundingBox` → `Span::BoundingBox`
436/// - `Location::TextWithBbox` → `Span::Hybrid`
437/// - `Location::Discontinuous` → `None` (use `DiscontinuousSpan` instead)
438/// - `Location::Temporal`, `Location::Cuboid`, `Location::Genomic` → `None`
439impl Location {
440    /// Try to convert this Location to a Span.
441    ///
442    /// Returns `None` for `Location` variants that don't map to `Span`
443    /// (Temporal, Cuboid, Genomic, Discontinuous).
444    #[must_use]
445    pub fn to_span(&self) -> Option<Span> {
446        match self {
447            Self::Text { start, end } => Some(Span::Text {
448                start: *start,
449                end: *end,
450            }),
451            Self::BoundingBox {
452                x,
453                y,
454                width,
455                height,
456                page,
457            } => Some(Span::BoundingBox {
458                x: *x,
459                y: *y,
460                width: *width,
461                height: *height,
462                page: *page,
463            }),
464            Self::TextWithBbox { start, end, bbox } => {
465                let inner_span = bbox.to_span()?;
466                Some(Span::Hybrid {
467                    start: *start,
468                    end: *end,
469                    bbox: Box::new(inner_span),
470                })
471            }
472            // These Location variants don't have Span equivalents
473            Self::Temporal { .. }
474            | Self::Cuboid { .. }
475            | Self::Genomic { .. }
476            | Self::Discontinuous { .. } => None,
477        }
478    }
479}
480
481// =============================================================================
482// Signal (Level 1): Raw Detection
483// =============================================================================
484
485// SignalId is now a newtype in super::types::ids for type safety
486pub use super::types::SignalId;
487
/// A raw detection signal: the atomic unit of entity extraction.
///
/// # The Detection Equation
///
/// Every signal is the product of two factors:
///
/// ```text
/// Signal = Localization × Classification
///        = "where is it?" × "what is it?"
/// ```
///
/// This is true whether detecting faces in images, named entities in text,
/// or objects in LiDAR point clouds.
///
/// # Design Philosophy
///
/// Signals are intentionally minimal. They capture:
/// 1. **Where**: Location in the source medium
/// 2. **What**: Classification label + confidence
/// 3. **Provenance**: How it was detected
///
/// What they explicitly do NOT capture:
/// - Coreference relationships (→ Track)
/// - Knowledge base links (→ Identity)
/// - Semantic embeddings (computed lazily if needed)
///
/// This separation enables efficient streaming pipelines where signals
/// are produced incrementally and consumed by downstream track/identity
/// formation without blocking.
///
/// # Type Parameter
///
/// `L` is the location type and defaults to [`Location`]. Text-specific
/// helpers (offset lookup, validation, `from_text`) exist only on
/// `Signal<Location>`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Signal<L = Location> {
    /// Unique identifier within the document
    pub id: SignalId,
    /// Location in the source medium
    pub location: L,
    /// Surface form (the actual text or image patch)
    pub surface: String,
    /// Classification label (e.g., "Person", "Organization", "PER").
    ///
    /// Stored as a `TypeLabel` to support both core taxonomy types and domain-specific labels.
    pub label: super::types::TypeLabel,
    /// Detection confidence in [0, 1].
    ///
    /// `Signal::new` clamps its argument into this range; writing the field
    /// directly bypasses that clamp.
    pub confidence: f32,
    /// Hierarchical confidence if available (linkage/type/boundary)
    pub hierarchical: Option<HierarchicalConfidence>,
    /// Provenance: which detector produced this signal
    pub provenance: Option<Provenance>,
    /// Semiotic modality.
    ///
    /// `Signal::new` always initializes this to `Modality::default()`; it is
    /// not derived from `location`. Use `with_modality` to set it explicitly.
    pub modality: Modality,
    /// Normalized form (e.g., "Jan 15" → "2024-01-15")
    pub normalized: Option<String>,
    /// Whether this signal is negated (e.g., "not a doctor")
    pub negated: bool,
    /// Quantification if applicable (e.g., "every employee")
    pub quantifier: Option<Quantifier>,
}
544
/// Quantification type for symbolic signals.
///
/// Only meaningful for text/symbolic modality where linguistic
/// quantification is possible. Has no visual analogue.
///
/// Signals store this as `Option<Quantifier>`. Note that
/// [`Quantifier::None`] (the word "no"/"none" appeared) is a different thing
/// from `Option::None` (no quantifier information recorded); always qualify
/// the variant path to avoid confusing the two.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum Quantifier {
    /// Universal: "every", "all", "each"
    Universal,
    /// Existential: "some", "a", "certain"
    Existential,
    /// Negation: "no", "none"
    None,
    /// Specific: definite reference ("the")
    Definite,
    /// Approximate: "approximately", "about", "at least", "roughly"
    Approximate,
    /// Bare: no explicit quantifier
    Bare,
}
565
566impl<L> Signal<L> {
567    /// Create a new signal.
568    ///
569    /// # Arguments
570    ///
571    /// * `id` - Unique identifier (will be overwritten when added to a document)
572    /// * `location` - Where this signal was detected
573    /// * `surface` - The actual text/content of the detection
574    /// * `label` - Classification label (e.g., "Person", "Organization")
575    /// * `confidence` - Detection confidence in `[0, 1]`
576    #[must_use]
577    pub fn new(
578        id: impl Into<SignalId>,
579        location: L,
580        surface: impl Into<String>,
581        label: impl Into<super::types::TypeLabel>,
582        confidence: f32,
583    ) -> Self {
584        Self {
585            id: id.into(),
586            location,
587            surface: surface.into(),
588            label: label.into(),
589            confidence: confidence.clamp(0.0, 1.0),
590            hierarchical: None,
591            provenance: None,
592            modality: Modality::default(),
593            normalized: None,
594            negated: false,
595            quantifier: None,
596        }
597    }
598
599    /// Get the classification label as a string.
600    #[must_use]
601    pub fn label(&self) -> &str {
602        self.label.as_str()
603    }
604
605    /// Get the classification label as a type-safe `TypeLabel`.
606    #[must_use]
607    pub fn type_label(&self) -> super::types::TypeLabel {
608        self.label.clone()
609    }
610
611    /// Get the surface form.
612    #[must_use]
613    pub fn surface(&self) -> &str {
614        &self.surface
615    }
616
617    /// Check if this signal is above a confidence threshold.
618    #[must_use]
619    pub fn is_confident(&self, threshold: f32) -> bool {
620        self.confidence >= threshold
621    }
622
623    /// Set the modality.
624    #[must_use]
625    pub fn with_modality(mut self, modality: Modality) -> Self {
626        self.modality = modality;
627        self
628    }
629
630    /// Mark as negated.
631    #[must_use]
632    pub fn negated(mut self) -> Self {
633        self.negated = true;
634        self
635    }
636
637    /// Set quantifier.
638    #[must_use]
639    pub fn with_quantifier(mut self, q: Quantifier) -> Self {
640        self.quantifier = Some(q);
641        self
642    }
643
644    /// Set provenance.
645    #[must_use]
646    pub fn with_provenance(mut self, p: Provenance) -> Self {
647        self.provenance = Some(p);
648        self
649    }
650}
651
652impl Signal<Location> {
653    /// Get text offsets if this is a text signal.
654    #[must_use]
655    pub fn text_offsets(&self) -> Option<(usize, usize)> {
656        self.location.text_offsets()
657    }
658
659    /// Validate that this signal's location matches its surface text.
660    ///
661    /// Returns `None` if valid, or a description of the mismatch.
662    ///
663    /// # Example
664    ///
665    /// ```rust
666    /// use anno_core::{Signal, Location};
667    ///
668    /// let text = "Lynn Conway worked at IBM.";
669    /// let good = Signal::new(0, Location::text(0, 11), "Lynn Conway", "PER", 0.9);
670    /// assert!(good.validate_against(text).is_none());
671    ///
672    /// let bad = Signal::new(0, Location::text(0, 5), "Lynn Conway", "PER", 0.9);
673    /// assert!(bad.validate_against(text).is_some());
674    /// ```
675    #[must_use]
676    pub fn validate_against(&self, source_text: &str) -> Option<SignalValidationError> {
677        let (start, end) = self.location.text_offsets()?;
678
679        let char_count = source_text.chars().count();
680
681        // Check bounds
682        if end > char_count {
683            return Some(SignalValidationError::OutOfBounds {
684                signal_id: self.id,
685                end,
686                text_len: char_count,
687            });
688        }
689
690        if start >= end {
691            return Some(SignalValidationError::InvalidSpan {
692                signal_id: self.id,
693                start,
694                end,
695            });
696        }
697
698        // Extract actual text at offsets
699        let actual: String = source_text.chars().skip(start).take(end - start).collect();
700
701        if actual != self.surface {
702            return Some(SignalValidationError::TextMismatch {
703                signal_id: self.id,
704                expected: self.surface.clone(),
705                actual,
706                start,
707                end,
708            });
709        }
710
711        None
712    }
713
714    /// Check if this signal is valid against the given source text.
715    #[must_use]
716    pub fn is_valid(&self, source_text: &str) -> bool {
717        self.validate_against(source_text).is_none()
718    }
719
720    /// Create a signal by finding text in source (safe construction).
721    ///
722    /// Returns `None` if the surface text is not found in source.
723    ///
724    /// # Example
725    ///
726    /// ```rust
727    /// use anno_core::{Signal, Location};
728    ///
729    /// let text = "Lynn Conway worked at IBM.";
730    /// let signal = Signal::<Location>::from_text(text, "Lynn Conway", "PER", 0.95);
731    /// assert!(signal.is_some());
732    /// assert_eq!(signal.expect("signal should exist").text_offsets(), Some((0, 11)));
733    /// ```
734    #[must_use]
735    pub fn from_text(
736        source: &str,
737        surface: &str,
738        label: impl Into<super::types::TypeLabel>,
739        confidence: f32,
740    ) -> Option<Self> {
741        Self::from_text_nth(source, surface, label, confidence, 0)
742    }
743
744    /// Create a signal by finding the nth occurrence of text in source.
745    #[must_use]
746    pub fn from_text_nth(
747        source: &str,
748        surface: &str,
749        label: impl Into<super::types::TypeLabel>,
750        confidence: f32,
751        occurrence: usize,
752    ) -> Option<Self> {
753        // Find nth occurrence using char offsets
754        for (count, (byte_idx, _)) in source.match_indices(surface).enumerate() {
755            if count == occurrence {
756                // Convert byte offset to char offset
757                let start = source[..byte_idx].chars().count();
758                let end = start + surface.chars().count();
759
760                return Some(Self::new(
761                    SignalId::ZERO,
762                    Location::text(start, end),
763                    surface,
764                    label,
765                    confidence,
766                ));
767            }
768        }
769
770        None
771    }
772}
773
/// Validation error for a signal.
///
/// Produced by `Signal::validate_against` when a signal's text offsets or
/// surface form disagree with the source text. All offsets and lengths are
/// counted in `char`s.
#[derive(Debug, Clone, PartialEq)]
pub enum SignalValidationError {
    /// Signal's end offset exceeds text length.
    OutOfBounds {
        /// Signal ID
        signal_id: SignalId,
        /// End offset that exceeds text
        end: usize,
        /// Actual text length in chars
        text_len: usize,
    },
    /// Signal has invalid span (start >= end).
    InvalidSpan {
        /// Signal ID
        signal_id: SignalId,
        /// Start offset
        start: usize,
        /// End offset
        end: usize,
    },
    /// Signal's surface text doesn't match text at offsets.
    TextMismatch {
        /// Signal ID
        signal_id: SignalId,
        /// Surface text stored in signal
        expected: String,
        /// Actual text found at offsets
        actual: String,
        /// Start offset
        start: usize,
        /// End offset
        end: usize,
    },
}
809
810impl std::fmt::Display for SignalValidationError {
811    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
812        match self {
813            Self::OutOfBounds {
814                signal_id,
815                end,
816                text_len,
817            } => {
818                write!(
819                    f,
820                    "S{}: end offset {} exceeds text length {}",
821                    signal_id, end, text_len
822                )
823            }
824            Self::InvalidSpan {
825                signal_id,
826                start,
827                end,
828            } => {
829                write!(f, "S{}: invalid span [{}, {})", signal_id, start, end)
830            }
831            Self::TextMismatch {
832                signal_id,
833                expected,
834                actual,
835                start,
836                end,
837            } => {
838                write!(
839                    f,
840                    "S{}: text mismatch at [{}, {}): expected '{}', found '{}'",
841                    signal_id, start, end, expected, actual
842                )
843            }
844        }
845    }
846}
847
// Marker impl so the error works with `?` and `Box<dyn Error>`; the trait's
// default methods are used (there is no underlying source error).
impl std::error::Error for SignalValidationError {}
849
850/// Convert an [`Entity`] to a [`Signal<Location>`], mapping Entity's `f64` confidence
851/// to Signal's `f32` (clamped to `[0,1]`).
852///
853/// Uses `Location::Text` for the span and preserves `normalized`, `provenance`,
854/// and `hierarchical_confidence` fields. Discontinuous and visual spans are not
855/// handled; use [`GroundedDocument::from_entities`] for full fidelity.
856impl From<&Entity> for Signal<Location> {
857    fn from(e: &Entity) -> Self {
858        let mut signal = Signal::new(
859            SignalId::ZERO,
860            Location::text(e.start, e.end),
861            &e.text,
862            e.entity_type.as_label(),
863            f32::from(e.confidence),
864        );
865        signal.normalized = e.normalized.clone();
866        signal.provenance = e.provenance.clone();
867        signal.hierarchical = e.hierarchical_confidence;
868        signal
869    }
870}
871
872// =============================================================================
873// Track (Level 2): Within-Document Coreference
874// =============================================================================
875
876// TrackId is now a newtype in super::types::ids for type safety
877pub use super::types::TrackId;
878
/// A reference to a signal within a track.
///
/// Stores only the signal's ID plus an ordering key; the signal itself lives in the
/// owning document's `signals` list.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct SignalRef {
    /// Signal ID
    pub signal_id: SignalId,
    /// Position in document order (for antecedent relationships).
    ///
    /// The value is whatever the caller passed to `Track::add_signal`; nothing in
    /// this type checks it against the signal's actual offsets.
    pub position: u32,
}
887
/// A reference to a track in a specific document.
///
/// Used for cross-document operations where we need to reference
/// tracks without copying them. This enables efficient inter-document
/// coreference resolution.
///
/// The pair (`doc_id`, `track_id`) is the full key: track IDs are only unique
/// within one document.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TrackRef {
    /// Document ID containing the track
    pub doc_id: String,
    /// Track ID within that document
    pub track_id: TrackId,
}
900
/// A track: a cluster of signals referring to the same entity within a document.
///
/// # Terminology Mapping
///
/// | Vision | NLP |
/// |--------|-----|
/// | Tracklet | CorefChain |
/// | Object track | Entity cluster |
/// | Re-identification | Coreference resolution |
///
/// # Design Philosophy
///
/// Tracks are the bridge between raw signals and global identities.
/// They answer: "which signals in THIS document refer to the same entity?"
///
/// Key properties:
/// - **Document-scoped**: A track only exists within one document
/// - **Homogeneous type**: All signals in a track should have compatible types
/// - **Representative**: The track has a "canonical" signal (usually the first proper mention)
///
/// None of these properties is enforced by the struct itself; all fields are public.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Track {
    /// Unique identifier within the document.
    ///
    /// `GroundedDocument::add_track` overwrites this with the document's next track ID.
    pub id: TrackId,
    /// Signal references in this track (document order).
    ///
    /// `Track::add_signal` appends in call order; it does not sort.
    pub signals: Vec<SignalRef>,
    /// Entity type (consensus from signals).
    ///
    /// This is a `TypeLabel` to support both core taxonomy types and domain-specific labels.
    pub entity_type: Option<super::types::TypeLabel>,
    /// Canonical surface form (the "best" name for this entity)
    pub canonical_surface: String,
    /// Link to global identity (Level 3), if resolved
    pub identity_id: Option<IdentityId>,
    /// Confidence that signals are correctly clustered.
    ///
    /// `Track::new` starts this at `1.0`; `Track::set_cluster_confidence` clamps to `[0, 1]`.
    pub cluster_confidence: f32,
    /// Optional embedding for track-level representation
    /// (aggregated from signal embeddings)
    pub embedding: Option<Vec<f32>>,
}
940
941impl Track {
942    /// Create a new track.
943    #[must_use]
944    pub fn new(id: impl Into<TrackId>, canonical_surface: impl Into<String>) -> Self {
945        Self {
946            id: id.into(),
947            signals: Vec::new(),
948            entity_type: None,
949            canonical_surface: canonical_surface.into(),
950            identity_id: None,
951            cluster_confidence: 1.0,
952            embedding: None,
953        }
954    }
955
956    /// Add a signal to this track.
957    pub fn add_signal(&mut self, signal_id: impl Into<SignalId>, position: u32) {
958        let signal_id = signal_id.into();
959        self.signals.push(SignalRef {
960            signal_id,
961            position,
962        });
963    }
964
965    /// Get the number of mentions in this track.
966    #[must_use]
967    pub fn len(&self) -> usize {
968        self.signals.len()
969    }
970
971    /// Check if this track is empty.
972    #[must_use]
973    pub fn is_empty(&self) -> bool {
974        self.signals.is_empty()
975    }
976
977    /// Check if this is a singleton (single mention).
978    #[must_use]
979    pub fn is_singleton(&self) -> bool {
980        self.signals.len() == 1
981    }
982
983    /// Get the track's unique identifier.
984    #[must_use]
985    pub const fn id(&self) -> TrackId {
986        self.id
987    }
988
989    /// Get the signal references in this track.
990    #[must_use]
991    pub fn signals(&self) -> &[SignalRef] {
992        &self.signals
993    }
994
995    /// Get the canonical surface form.
996    #[must_use]
997    pub fn canonical_surface(&self) -> &str {
998        &self.canonical_surface
999    }
1000
1001    /// Get the linked identity ID, if any.
1002    #[must_use]
1003    pub const fn identity_id(&self) -> Option<IdentityId> {
1004        self.identity_id
1005    }
1006
1007    /// Get the cluster confidence score.
1008    #[must_use]
1009    pub const fn cluster_confidence(&self) -> f32 {
1010        self.cluster_confidence
1011    }
1012
1013    /// Set the cluster confidence score.
1014    pub fn set_cluster_confidence(&mut self, confidence: f32) {
1015        self.cluster_confidence = confidence.clamp(0.0, 1.0);
1016    }
1017
1018    /// Link this track to a global identity (mutable setter).
1019    pub fn set_identity_id(&mut self, identity_id: IdentityId) {
1020        self.identity_id = Some(identity_id);
1021    }
1022
1023    /// Unlink this track from its identity.
1024    pub fn clear_identity_id(&mut self) {
1025        self.identity_id = None;
1026    }
1027
1028    /// Link this track to a global identity.
1029    #[must_use]
1030    pub fn with_identity(mut self, identity_id: IdentityId) -> Self {
1031        self.identity_id = Some(identity_id);
1032        self
1033    }
1034
1035    /// Set the entity type from a string.
1036    ///
1037    /// For new code, prefer [`Self::with_type_label`] which provides type safety.
1038    #[must_use]
1039    pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
1040        let s = entity_type.into();
1041        self.entity_type = Some(super::types::TypeLabel::from(s.as_str()));
1042        self
1043    }
1044
1045    /// Set the entity type using a type-safe label.
1046    ///
1047    /// This is the preferred method for new code as it provides type safety
1048    /// and integrates with the core `EntityType` taxonomy.
1049    ///
1050    /// # Example
1051    ///
1052    /// ```rust
1053    /// use anno_core::{Track, TypeLabel, EntityType};
1054    ///
1055    /// let track = Track::new(0, "Marie Curie")
1056    ///     .with_type_label(TypeLabel::Core(EntityType::Person));
1057    /// ```
1058    #[must_use]
1059    pub fn with_type_label(mut self, label: super::types::TypeLabel) -> Self {
1060        self.entity_type = Some(label);
1061        self
1062    }
1063
1064    /// Get the entity type as a type-safe label.
1065    ///
1066    /// This converts the internal string representation to a `TypeLabel`,
1067    /// attempting to parse it as a core `EntityType` first.
1068    #[must_use]
1069    pub fn type_label(&self) -> Option<super::types::TypeLabel> {
1070        self.entity_type.clone()
1071    }
1072
1073    /// Set the embedding for this track.
1074    #[must_use]
1075    pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
1076        self.embedding = Some(embedding);
1077        self
1078    }
1079
1080    /// Get the spread (distance from first to last mention).
1081    ///
1082    /// Requires document to resolve signal positions.
1083    pub fn compute_spread(&self, doc: &GroundedDocument) -> Option<usize> {
1084        if self.signals.is_empty() {
1085            return Some(0);
1086        }
1087
1088        let positions: Vec<usize> = self
1089            .signals
1090            .iter()
1091            .filter_map(|sr| {
1092                doc.signals
1093                    .iter()
1094                    .find(|s| s.id == sr.signal_id)
1095                    .and_then(|s| s.location.text_offsets())
1096                    .map(|(start, _)| start)
1097            })
1098            .collect();
1099
1100        if positions.is_empty() {
1101            return None;
1102        }
1103
1104        let min_pos = *positions.iter().min().expect("positions non-empty");
1105        let max_pos = *positions.iter().max().expect("positions non-empty");
1106        Some(max_pos.saturating_sub(min_pos))
1107    }
1108
1109    /// Collect all surface form variations from signals.
1110    ///
1111    /// Requires document to resolve signal surfaces.
1112    pub fn collect_variations(&self, doc: &GroundedDocument) -> Vec<String> {
1113        let mut variations: std::collections::HashSet<String> = std::collections::HashSet::new();
1114
1115        for sr in &self.signals {
1116            if let Some(signal) = doc.signals.iter().find(|s| s.id == sr.signal_id) {
1117                variations.insert(signal.surface.clone());
1118            }
1119        }
1120
1121        variations.into_iter().collect()
1122    }
1123
1124    /// Get confidence statistics across all signals.
1125    ///
1126    /// Returns (min, max, mean) confidence values.
1127    pub fn confidence_stats(&self, doc: &GroundedDocument) -> Option<(f32, f32, f32)> {
1128        let confidences: Vec<f32> = self
1129            .signals
1130            .iter()
1131            .filter_map(|sr| {
1132                doc.signals
1133                    .iter()
1134                    .find(|s| s.id == sr.signal_id)
1135                    .map(|s| s.confidence)
1136            })
1137            .collect();
1138
1139        if confidences.is_empty() {
1140            return None;
1141        }
1142
1143        let min = confidences.iter().cloned().fold(f32::INFINITY, f32::min);
1144        let max = confidences
1145            .iter()
1146            .cloned()
1147            .fold(f32::NEG_INFINITY, f32::max);
1148        let mean = confidences.iter().sum::<f32>() / confidences.len() as f32;
1149
1150        Some((min, max, mean))
1151    }
1152
1153    /// Compute aggregate statistics for this track.
1154    ///
1155    /// Returns a `TrackStats` struct with comprehensive aggregate features.
1156    pub fn compute_stats(&self, doc: &GroundedDocument, text_len: usize) -> TrackStats {
1157        let chain_length = self.signals.len();
1158        let spread = self.compute_spread(doc).unwrap_or(0);
1159        let variations = self.collect_variations(doc);
1160        let (min_conf, max_conf, mean_conf) = self.confidence_stats(doc).unwrap_or((0.0, 0.0, 0.0));
1161
1162        // Compute first/last positions
1163        let positions: Vec<usize> = self
1164            .signals
1165            .iter()
1166            .filter_map(|sr| {
1167                doc.signals
1168                    .iter()
1169                    .find(|s| s.id == sr.signal_id)
1170                    .and_then(|s| s.location.text_offsets())
1171                    .map(|(start, _)| start)
1172            })
1173            .collect();
1174
1175        let first_position = positions.iter().min().copied().unwrap_or(0);
1176        let last_position = positions.iter().max().copied().unwrap_or(0);
1177        let relative_spread = if text_len > 0 {
1178            spread as f64 / text_len as f64
1179        } else {
1180            0.0
1181        };
1182
1183        TrackStats {
1184            chain_length,
1185            variation_count: variations.len(),
1186            variations,
1187            spread,
1188            relative_spread,
1189            first_position,
1190            last_position,
1191            min_confidence: min_conf,
1192            max_confidence: max_conf,
1193            mean_confidence: mean_conf,
1194            has_embedding: self.embedding.is_some(),
1195        }
1196    }
1197}
1198
/// Aggregate statistics for a track (coreference chain).
///
/// Produced by `Track::compute_stats`. Position, spread, and confidence fields are
/// reported as zero when none of the track's signals could be resolved in the document.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct TrackStats {
    /// Number of mentions in the track.
    pub chain_length: usize,
    /// Number of unique surface form variations.
    pub variation_count: usize,
    /// All surface form variations.
    pub variations: Vec<String>,
    /// Spread in characters (first to last mention).
    ///
    /// Computed from signal start offsets. NOTE(review): whether those offsets are
    /// characters or bytes depends on `Location::text_offsets` — confirm.
    pub spread: usize,
    /// Spread as fraction of document length.
    ///
    /// `0.0` when the document length passed to `compute_stats` is zero.
    pub relative_spread: f64,
    /// Position of first mention.
    pub first_position: usize,
    /// Position of last mention.
    pub last_position: usize,
    /// Minimum confidence across mentions.
    pub min_confidence: f32,
    /// Maximum confidence across mentions.
    pub max_confidence: f32,
    /// Mean confidence across mentions.
    pub mean_confidence: f32,
    /// Whether this track has an embedding.
    pub has_embedding: bool,
}
1225
1226// =============================================================================
1227// Identity (Level 3): Cross-Document Entity Linking
1228// =============================================================================
1229
1230// IdentityId is now a newtype in super::types::ids for type safety
1231pub use super::types::IdentityId;
1232
/// Source of identity formation.
///
/// Tracks how an identity was created, which affects how it should be
/// used and what operations are valid on it.
///
/// `Identity::from_kb` records the `KnowledgeBase` variant; `Identity::new`
/// leaves the source unset.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum IdentitySource {
    /// Created from cross-document track clustering (inter-doc coref).
    /// No KB link yet - this is pure clustering.
    CrossDocCoref {
        /// Tracks that were clustered to form this identity
        track_refs: Vec<TrackRef>,
    },
    /// Linked from knowledge base (entity linking/NED).
    /// Single track or identity linked to KB.
    KnowledgeBase {
        /// Knowledge base name (e.g., "wikidata")
        kb_name: String,
        /// Knowledge base ID (e.g., "Q7186")
        kb_id: String,
    },
    /// Both: clustered from tracks AND linked to KB.
    /// This is the most complete identity.
    Hybrid {
        /// Tracks that were clustered
        track_refs: Vec<TrackRef>,
        /// Knowledge base name
        kb_name: String,
        /// Knowledge base ID
        kb_id: String,
    },
}
1264
/// A global identity: a real-world entity linked to a knowledge base.
///
/// # The Modal Gap
///
/// There's a fundamental representational gap between:
/// - **Text mentions**: Contextual, variable surface forms ("Marie Curie", "she", "the scientist")
/// - **KB entities**: Canonical, static representations (Q7186 in Wikidata)
///
/// Bridging this gap requires:
/// 1. Learning aligned embeddings (text encoder ↔ KB encoder)
/// 2. Type consistency constraints
/// 3. Cross-encoder re-ranking for hard cases
///
/// # Design Philosophy
///
/// Identities are the "global truth" that tracks point to. They represent:
/// - A canonical name and description
/// - A knowledge base reference (if available)
/// - An embedding in the entity space (for similarity/clustering)
///
/// Identities can exist without KB links (for novel entities not in the KB).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Identity {
    /// Unique identifier.
    ///
    /// `GroundedDocument::add_identity` overwrites this with the document's next identity ID.
    pub id: IdentityId,
    /// Canonical name (the "official" name)
    pub canonical_name: String,
    /// Entity type/category.
    ///
    /// Stored as a `TypeLabel` to support both core and custom (domain) labels.
    pub entity_type: Option<super::types::TypeLabel>,
    /// Knowledge base reference (e.g., "Q7186" for Wikidata)
    pub kb_id: Option<String>,
    /// Knowledge base name (e.g., "wikidata", "umls")
    pub kb_name: Option<String>,
    /// Description from knowledge base
    pub description: Option<String>,
    /// Entity embedding in the KB/entity space
    /// This is aligned with the text encoder space for similarity computation
    pub embedding: Option<Vec<f32>>,
    /// Alias names (other known surface forms).
    ///
    /// `Identity::add_alias` appends without de-duplicating.
    pub aliases: Vec<String>,
    /// Confidence that this identity is correctly resolved.
    ///
    /// The constructors start this at `1.0`; `Identity::set_confidence` clamps to `[0, 1]`.
    pub confidence: f32,
    /// Source of identity formation (how it was created).
    ///
    /// Omitted from serialized output when `None` and defaulted to `None` when absent
    /// on input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub source: Option<IdentitySource>,
}
1313
1314impl Identity {
1315    /// Create a new identity.
1316    #[must_use]
1317    pub fn new(id: impl Into<IdentityId>, canonical_name: impl Into<String>) -> Self {
1318        Self {
1319            id: id.into(),
1320            canonical_name: canonical_name.into(),
1321            entity_type: None,
1322            kb_id: None,
1323            kb_name: None,
1324            description: None,
1325            embedding: None,
1326            aliases: Vec::new(),
1327            confidence: 1.0,
1328            source: None,
1329        }
1330    }
1331
1332    /// Create an identity from a knowledge base entry.
1333    #[must_use]
1334    pub fn from_kb(
1335        id: impl Into<IdentityId>,
1336        canonical_name: impl Into<String>,
1337        kb_name: impl Into<String>,
1338        kb_id: impl Into<String>,
1339    ) -> Self {
1340        let kb_name_str = kb_name.into();
1341        let kb_id_str = kb_id.into();
1342        Self {
1343            id: id.into(),
1344            canonical_name: canonical_name.into(),
1345            entity_type: None,
1346            kb_id: Some(kb_id_str.clone()),
1347            kb_name: Some(kb_name_str.clone()),
1348            description: None,
1349            embedding: None,
1350            aliases: Vec::new(),
1351            confidence: 1.0,
1352            source: Some(IdentitySource::KnowledgeBase {
1353                kb_name: kb_name_str,
1354                kb_id: kb_id_str,
1355            }),
1356        }
1357    }
1358
1359    /// Add an alias.
1360    pub fn add_alias(&mut self, alias: impl Into<String>) {
1361        self.aliases.push(alias.into());
1362    }
1363
1364    /// Get the identity's unique identifier.
1365    #[must_use]
1366    pub const fn id(&self) -> IdentityId {
1367        self.id
1368    }
1369
1370    /// Get the canonical name.
1371    #[must_use]
1372    pub fn canonical_name(&self) -> &str {
1373        &self.canonical_name
1374    }
1375
1376    /// Get the KB ID, if linked.
1377    #[must_use]
1378    pub fn kb_id(&self) -> Option<&str> {
1379        self.kb_id.as_deref()
1380    }
1381
1382    /// Get the KB name, if linked.
1383    #[must_use]
1384    pub fn kb_name(&self) -> Option<&str> {
1385        self.kb_name.as_deref()
1386    }
1387
1388    /// Get the aliases.
1389    #[must_use]
1390    pub fn aliases(&self) -> &[String] {
1391        &self.aliases
1392    }
1393
1394    /// Get the confidence score.
1395    #[must_use]
1396    pub const fn confidence(&self) -> f32 {
1397        self.confidence
1398    }
1399
1400    /// Set the confidence score.
1401    pub fn set_confidence(&mut self, confidence: f32) {
1402        self.confidence = confidence.clamp(0.0, 1.0);
1403    }
1404
1405    /// Get the identity source.
1406    #[must_use]
1407    pub fn source(&self) -> Option<&IdentitySource> {
1408        self.source.as_ref()
1409    }
1410
1411    /// Set the embedding.
1412    #[must_use]
1413    pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
1414        self.embedding = Some(embedding);
1415        self
1416    }
1417
1418    /// Set the entity type from a string.
1419    ///
1420    /// For new code, prefer [`Self::with_type_label`] which provides type safety.
1421    #[must_use]
1422    pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
1423        let s = entity_type.into();
1424        self.entity_type = Some(super::types::TypeLabel::from(s.as_str()));
1425        self
1426    }
1427
1428    /// Set the entity type using a type-safe label.
1429    ///
1430    /// This is the preferred method for new code as it provides type safety
1431    /// and integrates with the core `EntityType` taxonomy.
1432    #[must_use]
1433    pub fn with_type_label(mut self, label: super::types::TypeLabel) -> Self {
1434        self.entity_type = Some(label);
1435        self
1436    }
1437
1438    /// Get the entity type as a type-safe label.
1439    ///
1440    /// This converts the internal string representation to a `TypeLabel`,
1441    /// attempting to parse it as a core `EntityType` first.
1442    #[must_use]
1443    pub fn type_label(&self) -> Option<super::types::TypeLabel> {
1444        self.entity_type.clone()
1445    }
1446
1447    /// Set description.
1448    #[must_use]
1449    pub fn with_description(mut self, description: impl Into<String>) -> Self {
1450        self.description = Some(description.into());
1451        self
1452    }
1453
1454    // Note: from_cross_doc_cluster moved to anno crate (see anno/src/eval/cdcr.rs)
1455}
1456
1457// =============================================================================
1458// GroundedDocument: The Container
1459// =============================================================================
1460
/// Wire format for [`GroundedDocument`] — contains only the persisted fields.
/// Internal indexes are rebuilt automatically via [`GroundedDocument::rebuild_indexes`]
/// during deserialization.
///
/// This type is deserialize-only and private: `GroundedDocument` names it in its
/// `#[serde(from = "GroundedDocumentWire")]` attribute, and the `From` conversion
/// performs the rebuild.
#[derive(Deserialize)]
struct GroundedDocumentWire {
    // Document identifier.
    id: String,
    // Raw text content.
    text: String,
    // Level 1 signals.
    signals: Vec<Signal<Location>>,
    // Level 2 tracks, keyed by track ID.
    tracks: HashMap<TrackId, Track>,
    // Level 3 identities, keyed by identity ID.
    identities: HashMap<IdentityId, Identity>,
}
1472
1473impl From<GroundedDocumentWire> for GroundedDocument {
1474    fn from(wire: GroundedDocumentWire) -> Self {
1475        let mut doc = Self {
1476            id: wire.id,
1477            text: wire.text,
1478            signals: wire.signals,
1479            tracks: wire.tracks,
1480            identities: wire.identities,
1481            signal_to_track: HashMap::new(),
1482            track_to_identity: HashMap::new(),
1483            next_signal_id: SignalId::ZERO,
1484            next_track_id: TrackId::ZERO,
1485            next_identity_id: IdentityId::ZERO,
1486        };
1487        doc.rebuild_indexes();
1488        doc
1489    }
1490}
1491
/// A document with grounded entity annotations using the three-level hierarchy.
///
/// # Entity-Centric Design
///
/// Traditional document representations store entities as a flat list.
/// This design uses an entity-centric representation where:
///
/// 1. **Signals** are the atomic detections (Level 1)
/// 2. **Tracks** cluster signals into within-document entities (Level 2)
/// 3. **Identities** link tracks to global KB entities (Level 3)
///
/// This enables efficient:
/// - Streaming signal processing (add signals incrementally)
/// - Incremental coreference (cluster signals as they arrive)
/// - Lazy entity linking (resolve identities only when needed)
///
/// # Usage
///
/// ```rust
/// use anno_core::{GroundedDocument, Signal, Track, Identity, Location};
///
/// let mut doc = GroundedDocument::new("doc1", "Marie Curie won the Nobel Prize. She was a physicist.");
///
/// // Add signals (Level 1)
/// doc.add_signal(Signal::new(0, Location::text(0, 11), "Marie Curie", "Person", 0.95));
/// doc.add_signal(Signal::new(1, Location::text(33, 36), "She", "Person", 0.88));
///
/// // Form track (Level 2)
/// let mut track = Track::new(0, "Marie Curie");
/// track.add_signal(0, 0);
/// track.add_signal(1, 1);
/// doc.add_track(track);
///
/// // Link identity (Level 3)
/// let identity = Identity::from_kb(0, "Marie Curie", "wikidata", "Q7186");
/// doc.add_identity(identity);
/// doc.link_track_to_identity(0, 0);
/// ```
///
/// Note that `add_signal`, `add_track`, and `add_identity` assign IDs themselves:
/// the ID carried by the value passed in is overwritten with the document's next
/// counter value, and the assigned ID is returned.
///
/// # Invariants
///
/// `GroundedDocument` maintains internal indices (`signal_to_track`, `track_to_identity`)
/// that must be consistent with the public collections. The following invariants hold:
///
/// 1. **Signal ID uniqueness**: All signals in `signals` have distinct `id` values.
/// 2. **Track signal references**: Every `SignalRef` in a `Track.signals` points to
///    a valid signal ID in `signals`.
/// 3. **Signal-to-track consistency**: If `signal_to_track[s] == t`, then the track `t`
///    contains a `SignalRef` pointing to `s`.
/// 4. **Track-to-identity consistency**: If `track_to_identity[t] == i`, then
///    `tracks[t].identity_id == Some(i)` and `identities` contains `i`.
/// 5. **Signal offsets validity**: Signal text locations should match `self.text`.
///
/// **Prefer mutation via provided methods** (`add_signal`, `add_track`, `add_signal_to_track`,
/// `link_track_to_identity`) rather than direct field manipulation to preserve invariants.
///
/// Use [`validate_invariants()`](Self::validate_invariants) to check structural consistency
/// after external modifications.
///
/// ## Serialization
///
/// Internal indexes (`signal_to_track`, `track_to_identity`, counter fields) are **not**
/// serialized. They are rebuilt automatically on deserialization via [`rebuild_indexes`](Self::rebuild_indexes).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(from = "GroundedDocumentWire")]
pub struct GroundedDocument {
    /// Document identifier
    pub id: String,
    /// Raw text content
    pub text: String,
    /// Level 1: Raw signals (detections)
    pub signals: Vec<Signal<Location>>,
    /// Level 2: Tracks (within-document coreference chains)
    pub tracks: HashMap<TrackId, Track>,
    /// Level 3: Global identities (KB-linked entities)
    pub identities: HashMap<IdentityId, Identity>,
    /// Index: signal_id → track_id (for efficient lookup).
    /// Not serialized; rebuilt on deserialization.
    #[serde(skip)]
    signal_to_track: HashMap<SignalId, TrackId>,
    /// Index: track_id → identity_id (for efficient lookup).
    /// Not serialized; rebuilt on deserialization.
    #[serde(skip)]
    track_to_identity: HashMap<TrackId, IdentityId>,
    /// Next signal ID (for auto-incrementing).
    /// Not serialized; rebuilt on deserialization as one past the largest signal ID.
    #[serde(skip)]
    next_signal_id: SignalId,
    /// Next track ID.
    /// Not serialized; rebuilt on deserialization as one past the largest track ID.
    #[serde(skip)]
    next_track_id: TrackId,
    /// Next identity ID.
    /// Not serialized; rebuilt on deserialization as one past the largest identity ID.
    #[serde(skip)]
    next_identity_id: IdentityId,
}
1589
1590impl GroundedDocument {
1591    /// Create a new grounded document.
1592    #[must_use]
1593    pub fn new(id: impl Into<String>, text: impl Into<String>) -> Self {
1594        Self {
1595            id: id.into(),
1596            text: text.into(),
1597            signals: Vec::new(),
1598            tracks: HashMap::new(),
1599            identities: HashMap::new(),
1600            signal_to_track: HashMap::new(),
1601            track_to_identity: HashMap::new(),
1602            next_signal_id: SignalId::ZERO,
1603            next_track_id: TrackId::ZERO,
1604            next_identity_id: IdentityId::ZERO,
1605        }
1606    }
1607
1608    /// Rebuild all internal indexes from the public data fields.
1609    ///
1610    /// Call this after deserializing a `GroundedDocument` or after directly mutating the
1611    /// `signals`, `tracks`, or `identities` fields. The method recomputes:
1612    /// - `signal_to_track` from each track's signal list
1613    /// - `track_to_identity` from each track's `identity_id`
1614    /// - `next_signal_id`, `next_track_id`, `next_identity_id` counters
1615    pub fn rebuild_indexes(&mut self) {
1616        self.signal_to_track.clear();
1617        self.track_to_identity.clear();
1618
1619        for (&track_id, track) in &self.tracks {
1620            for sig_ref in &track.signals {
1621                self.signal_to_track.insert(sig_ref.signal_id, track_id);
1622            }
1623            if let Some(identity_id) = track.identity_id {
1624                self.track_to_identity.insert(track_id, identity_id);
1625            }
1626        }
1627
1628        self.next_signal_id = self
1629            .signals
1630            .iter()
1631            .map(|s| s.id)
1632            .max()
1633            .map_or(SignalId::ZERO, |id| id + 1);
1634        self.next_track_id = self
1635            .tracks
1636            .keys()
1637            .copied()
1638            .max()
1639            .map_or(TrackId::ZERO, |id| id + 1);
1640        self.next_identity_id = self
1641            .identities
1642            .keys()
1643            .copied()
1644            .max()
1645            .map_or(IdentityId::ZERO, |id| id + 1);
1646    }
1647
1648    // -------------------------------------------------------------------------
1649    // Signal operations (Level 1)
1650    // -------------------------------------------------------------------------
1651
1652    /// Add a signal and return its ID.
1653    pub fn add_signal(&mut self, mut signal: Signal<Location>) -> SignalId {
1654        let id = self.next_signal_id;
1655        signal.id = id;
1656        self.signals.push(signal);
1657        self.next_signal_id += 1;
1658        id
1659    }
1660
1661    /// Get a signal by ID.
1662    #[must_use]
1663    pub fn get_signal(&self, id: impl Into<SignalId>) -> Option<&Signal<Location>> {
1664        let id = id.into();
1665        self.signals.iter().find(|s| s.id == id)
1666    }
1667
1668    /// Get all signals.
1669    pub fn signals(&self) -> &[Signal<Location>] {
1670        &self.signals
1671    }
1672
1673    // -------------------------------------------------------------------------
1674    // Track operations (Level 2)
1675    // -------------------------------------------------------------------------
1676
1677    /// Add a track and return its ID.
1678    pub fn add_track(&mut self, mut track: Track) -> TrackId {
1679        let id = self.next_track_id;
1680        track.id = id;
1681
1682        // Update signal → track index
1683        for signal_ref in &track.signals {
1684            self.signal_to_track.insert(signal_ref.signal_id, id);
1685        }
1686
1687        self.tracks.insert(id, track);
1688        self.next_track_id += 1;
1689        id
1690    }
1691
1692    /// Get a track by ID.
1693    #[must_use]
1694    pub fn get_track(&self, id: impl Into<TrackId>) -> Option<&Track> {
1695        self.tracks.get(&id.into())
1696    }
1697
1698    /// Get a mutable reference to a track by ID.
1699    #[must_use]
1700    pub fn get_track_mut(&mut self, id: impl Into<TrackId>) -> Option<&mut Track> {
1701        self.tracks.get_mut(&id.into())
1702    }
1703
1704    /// Add a signal to an existing track.
1705    ///
1706    /// This properly updates the signal_to_track index.
1707    /// Returns true if the signal was added, false if track doesn't exist.
1708    pub fn add_signal_to_track(
1709        &mut self,
1710        signal_id: impl Into<SignalId>,
1711        track_id: impl Into<TrackId>,
1712        position: u32,
1713    ) -> bool {
1714        let signal_id = signal_id.into();
1715        let track_id = track_id.into();
1716        if let Some(track) = self.tracks.get_mut(&track_id) {
1717            track.add_signal(signal_id, position);
1718            self.signal_to_track.insert(signal_id, track_id);
1719            true
1720        } else {
1721            false
1722        }
1723    }
1724
1725    /// Get the track containing a signal.
1726    #[must_use]
1727    pub fn track_for_signal(&self, signal_id: SignalId) -> Option<&Track> {
1728        let track_id = self.signal_to_track.get(&signal_id)?;
1729        self.tracks.get(track_id)
1730    }
1731
1732    /// Get all tracks.
1733    pub fn tracks(&self) -> impl Iterator<Item = &Track> {
1734        self.tracks.values()
1735    }
1736
1737    // -------------------------------------------------------------------------
1738    // Identity operations (Level 3)
1739    // -------------------------------------------------------------------------
1740
1741    /// Add an identity and return its ID.
1742    pub fn add_identity(&mut self, mut identity: Identity) -> IdentityId {
1743        let id = self.next_identity_id;
1744        identity.id = id;
1745        self.identities.insert(id, identity);
1746        self.next_identity_id += 1;
1747        id
1748    }
1749
1750    /// Link a track to an identity.
1751    pub fn link_track_to_identity(
1752        &mut self,
1753        track_id: impl Into<TrackId>,
1754        identity_id: impl Into<IdentityId>,
1755    ) {
1756        let track_id = track_id.into();
1757        let identity_id = identity_id.into();
1758        if let Some(track) = self.tracks.get_mut(&track_id) {
1759            track.identity_id = Some(identity_id);
1760            self.track_to_identity.insert(track_id, identity_id);
1761        }
1762    }
1763
1764    /// Get an identity by ID.
1765    #[must_use]
1766    pub fn get_identity(&self, id: IdentityId) -> Option<&Identity> {
1767        self.identities.get(&id)
1768    }
1769
1770    /// Get the identity for a track.
1771    #[must_use]
1772    pub fn identity_for_track(&self, track_id: TrackId) -> Option<&Identity> {
1773        let identity_id = self.track_to_identity.get(&track_id)?;
1774        self.identities.get(identity_id)
1775    }
1776
1777    /// Get the identity for a signal (transitively through track).
1778    #[must_use]
1779    pub fn identity_for_signal(&self, signal_id: SignalId) -> Option<&Identity> {
1780        let track_id = self.signal_to_track.get(&signal_id)?;
1781        self.identity_for_track(*track_id)
1782    }
1783
1784    /// Get all identities.
1785    pub fn identities(&self) -> impl Iterator<Item = &Identity> {
1786        self.identities.values()
1787    }
1788
1789    /// Get a TrackRef for a track in this document.
1790    ///
1791    /// Returns `None` if the track doesn't exist in this document.
1792    /// This validates that the track is still present (tracks can be removed).
1793    #[must_use]
1794    pub fn track_ref(&self, track_id: TrackId) -> Option<TrackRef> {
1795        // Validate that the track actually exists
1796        if self.tracks.contains_key(&track_id) {
1797            Some(TrackRef {
1798                doc_id: self.id.clone(),
1799                track_id,
1800            })
1801        } else {
1802            None
1803        }
1804    }
1805
1806    // -------------------------------------------------------------------------
1807    // Conversion utilities
1808    // -------------------------------------------------------------------------
1809
1810    /// Convert to legacy Entity format for backwards compatibility.
1811    #[must_use]
1812    pub fn to_entities(&self) -> Vec<Entity> {
1813        self.signals
1814            .iter()
1815            .map(|signal| {
1816                let (start, end) = signal.location.text_offsets().unwrap_or((0, 0));
1817                let track = self.track_for_signal(signal.id);
1818                let identity = track.and_then(|t| self.identity_for_track(t.id));
1819
1820                Entity {
1821                    text: signal.surface.clone(),
1822                    entity_type: EntityType::from_label(signal.label.as_str()),
1823                    start,
1824                    end,
1825                    confidence: Confidence::from(signal.confidence),
1826                    normalized: signal.normalized.clone(),
1827                    provenance: signal.provenance.clone(),
1828                    kb_id: identity.and_then(|i| i.kb_id.clone()),
1829                    canonical_id: track.map(|t| super::types::CanonicalId::new(t.id.get())),
1830                    hierarchical_confidence: signal.hierarchical,
1831                    visual_span: match &signal.location {
1832                        Location::BoundingBox {
1833                            x,
1834                            y,
1835                            width,
1836                            height,
1837                            page,
1838                        } => Some(Span::BoundingBox {
1839                            x: *x,
1840                            y: *y,
1841                            width: *width,
1842                            height: *height,
1843                            page: *page,
1844                        }),
1845                        Location::TextWithBbox { bbox, .. } => {
1846                            if let Location::BoundingBox {
1847                                x,
1848                                y,
1849                                width,
1850                                height,
1851                                page,
1852                            } = bbox.as_ref()
1853                            {
1854                                Some(Span::BoundingBox {
1855                                    x: *x,
1856                                    y: *y,
1857                                    width: *width,
1858                                    height: *height,
1859                                    page: *page,
1860                                })
1861                            } else {
1862                                None
1863                            }
1864                        }
1865                        _ => None,
1866                    },
1867                    discontinuous_span: match &signal.location {
1868                        Location::Discontinuous { segments } => Some(DiscontinuousSpan::new(
1869                            segments.iter().map(|(s, e)| (*s)..(*e)).collect(),
1870                        )),
1871                        _ => None,
1872                    },
1873                    valid_from: None,
1874                    valid_until: None,
1875                    viewport: None,
1876                    phi_features: None,
1877                    mention_type: None,
1878                }
1879            })
1880            .collect()
1881    }
1882
1883    /// Create from legacy Entity slice.
1884    #[must_use]
1885    pub fn from_entities(
1886        id: impl Into<String>,
1887        text: impl Into<String>,
1888        entities: &[Entity],
1889    ) -> Self {
1890        let mut doc = Self::new(id, text);
1891
1892        // Group entities by canonical_id to form tracks.
1893        //
1894        // IMPORTANT: Entities without a `canonical_id` are *not* coreferent by default.
1895        // They must each form their own singleton track (otherwise all NER mentions would
1896        // collapse into one giant track).
1897        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1898        enum TrackKey {
1899            Canonical(super::types::CanonicalId),
1900            Singleton(usize),
1901        }
1902
1903        let mut tracks_map: HashMap<TrackKey, Vec<SignalId>> = HashMap::new();
1904        let mut signal_to_entity_idx: HashMap<SignalId, usize> = HashMap::new();
1905
1906        for (idx, entity) in entities.iter().enumerate() {
1907            let location = if let Some(disc) = &entity.discontinuous_span {
1908                Location::Discontinuous {
1909                    segments: disc.segments().iter().map(|r| (r.start, r.end)).collect(),
1910                }
1911            } else if let Some(visual) = &entity.visual_span {
1912                Location::from(visual)
1913            } else {
1914                Location::text(entity.start, entity.end)
1915            };
1916
1917            let mut signal = Signal::new(
1918                SignalId::new(idx as u64),
1919                location,
1920                &entity.text,
1921                entity.entity_type.as_label(),
1922                f32::from(entity.confidence),
1923            );
1924            signal.normalized = entity.normalized.clone();
1925            signal.provenance = entity.provenance.clone();
1926            signal.hierarchical = entity.hierarchical_confidence;
1927
1928            let signal_id = doc.add_signal(signal);
1929            signal_to_entity_idx.insert(signal_id, idx);
1930
1931            let key = match entity.canonical_id {
1932                Some(cid) => TrackKey::Canonical(cid),
1933                None => TrackKey::Singleton(idx),
1934            };
1935            tracks_map.entry(key).or_default().push(signal_id);
1936        }
1937
1938        // Create tracks from grouped signals
1939        for (_key, signal_ids) in tracks_map {
1940            if let Some(first_signal) = signal_ids.first().and_then(|id| doc.get_signal(*id)) {
1941                let mut track = Track::new(doc.next_track_id, &first_signal.surface);
1942                track.entity_type =
1943                    Some(super::types::TypeLabel::from(first_signal.label.as_str()));
1944
1945                for (pos, &signal_id) in signal_ids.iter().enumerate() {
1946                    track.add_signal(signal_id, pos as u32);
1947                }
1948
1949                // If any member entity is linked to a KB entry, create an identity and link it.
1950                // (We intentionally do this even for singleton tracks without canonical_id.)
1951                let kb_id = signal_ids.iter().find_map(|sid| {
1952                    let ent_idx = signal_to_entity_idx.get(sid).copied()?;
1953                    entities.get(ent_idx)?.kb_id.clone()
1954                });
1955                if let Some(kb_id) = kb_id {
1956                    let identity = Identity::from_kb(
1957                        doc.next_identity_id,
1958                        &track.canonical_surface,
1959                        "unknown",
1960                        kb_id,
1961                    );
1962                    let identity_id = doc.add_identity(identity);
1963                    track = track.with_identity(identity_id);
1964                }
1965
1966                doc.add_track(track);
1967            }
1968        }
1969
1970        doc
1971    }
1972
1973    /// Get signals filtered by label.
1974    #[must_use]
1975    pub fn signals_with_label(&self, label: &str) -> Vec<&Signal<Location>> {
1976        let want = super::types::TypeLabel::from(label);
1977        self.signals.iter().filter(|s| s.label == want).collect()
1978    }
1979
1980    /// Get signals above a confidence threshold.
1981    #[must_use]
1982    pub fn confident_signals(&self, threshold: f32) -> Vec<&Signal<Location>> {
1983        self.signals
1984            .iter()
1985            .filter(|s| s.confidence >= threshold)
1986            .collect()
1987    }
1988
1989    /// Get tracks that are linked to an identity.
1990    pub fn linked_tracks(&self) -> impl Iterator<Item = &Track> {
1991        self.tracks.values().filter(|t| t.identity_id.is_some())
1992    }
1993
1994    /// Get tracks that are NOT linked to any identity (need resolution).
1995    pub fn unlinked_tracks(&self) -> impl Iterator<Item = &Track> {
1996        self.tracks.values().filter(|t| t.identity_id.is_none())
1997    }
1998
1999    /// Count of signals that are not yet assigned to any track.
2000    #[must_use]
2001    pub fn untracked_signal_count(&self) -> usize {
2002        self.signals
2003            .iter()
2004            .filter(|s| !self.signal_to_track.contains_key(&s.id))
2005            .count()
2006    }
2007
2008    /// Get untracked signals (need coreference resolution).
2009    #[must_use]
2010    pub fn untracked_signals(&self) -> Vec<&Signal<Location>> {
2011        self.signals
2012            .iter()
2013            .filter(|s| !self.signal_to_track.contains_key(&s.id))
2014            .collect()
2015    }
2016
2017    // -------------------------------------------------------------------------
2018    // Advanced Query Methods
2019    // -------------------------------------------------------------------------
2020
2021    /// Get signals filtered by modality.
2022    #[must_use]
2023    pub fn signals_by_modality(&self, modality: Modality) -> Vec<&Signal<Location>> {
2024        self.signals
2025            .iter()
2026            .filter(|s| s.modality == modality)
2027            .collect()
2028    }
2029
2030    /// Get all text-based signals (symbolic modality).
2031    #[must_use]
2032    pub fn text_signals(&self) -> Vec<&Signal<Location>> {
2033        self.signals_by_modality(Modality::Symbolic)
2034    }
2035
2036    /// Get all visual signals (iconic modality).
2037    #[must_use]
2038    pub fn visual_signals(&self) -> Vec<&Signal<Location>> {
2039        self.signals_by_modality(Modality::Iconic)
2040    }
2041
2042    /// Find signals that overlap with a given location.
2043    #[must_use]
2044    pub fn overlapping_signals(&self, location: &Location) -> Vec<&Signal<Location>> {
2045        self.signals
2046            .iter()
2047            .filter(|s| s.location.overlaps(location))
2048            .collect()
2049    }
2050
2051    /// Find signals within a text range.
2052    #[must_use]
2053    pub fn signals_in_range(&self, start: usize, end: usize) -> Vec<&Signal<Location>> {
2054        self.signals
2055            .iter()
2056            .filter(|s| {
2057                if let Some((s_start, s_end)) = s.location.text_offsets() {
2058                    s_start >= start && s_end <= end
2059                } else {
2060                    false
2061                }
2062            })
2063            .collect()
2064    }
2065
2066    /// Get signals that are negated.
2067    #[must_use]
2068    pub fn negated_signals(&self) -> Vec<&Signal<Location>> {
2069        self.signals.iter().filter(|s| s.negated).collect()
2070    }
2071
2072    /// Get signals with a specific quantifier.
2073    #[must_use]
2074    pub fn quantified_signals(&self, quantifier: Quantifier) -> Vec<&Signal<Location>> {
2075        self.signals
2076            .iter()
2077            .filter(|s| s.quantifier == Some(quantifier))
2078            .collect()
2079    }
2080
2081    // -------------------------------------------------------------------------
2082    // Validation
2083    // -------------------------------------------------------------------------
2084
2085    /// Validate all signals against the document text.
2086    ///
2087    /// Returns a list of validation errors. Empty means all valid.
2088    ///
2089    /// # Example
2090    ///
2091    /// ```rust
2092    /// use anno_core::{GroundedDocument, Signal, Location};
2093    ///
2094    /// let mut doc = GroundedDocument::new("test", "Marie Curie was a physicist.");
2095    /// doc.add_signal(Signal::new(0, Location::text(0, 11), "Marie Curie", "PER", 0.9));
2096    /// assert!(doc.validate().is_empty());
2097    ///
2098    /// // Bad signal: wrong text at offset
2099    /// doc.add_signal(Signal::new(0, Location::text(0, 5), "WRONG", "PER", 0.9));
2100    /// assert!(!doc.validate().is_empty());
2101    /// ```
2102    #[must_use]
2103    pub fn validate(&self) -> Vec<SignalValidationError> {
2104        self.signals
2105            .iter()
2106            .filter_map(|s| s.validate_against(&self.text))
2107            .collect()
2108    }
2109
2110    /// Validate structural invariants of the document.
2111    ///
2112    /// Returns a list of invariant violations. An empty list means the document
2113    /// is structurally consistent.
2114    ///
2115    /// This checks:
2116    /// 1. Signal ID uniqueness
2117    /// 2. Track signal references point to existing signals
2118    /// 3. `signal_to_track` index consistency
2119    /// 4. `track_to_identity` index consistency
2120    /// 5. Track identity references point to existing identities
2121    ///
2122    /// Use this after any direct field manipulation to ensure consistency.
2123    ///
2124    /// # Example
2125    ///
2126    /// ```rust
2127    /// use anno_core::{GroundedDocument, Signal, Location};
2128    ///
2129    /// let mut doc = GroundedDocument::new("test", "Marie Curie was a physicist.");
2130    /// doc.add_signal(Signal::new(0, Location::text(0, 11), "Marie Curie", "PER", 0.9));
2131    /// assert!(doc.validate_invariants().is_empty());
2132    /// ```
2133    #[must_use]
2134    pub fn validate_invariants(&self) -> Vec<String> {
2135        let mut errors = Vec::new();
2136
2137        // 1. Signal ID uniqueness
2138        let mut seen_ids = std::collections::HashSet::new();
2139        for signal in &self.signals {
2140            if !seen_ids.insert(signal.id) {
2141                errors.push(format!("Duplicate signal ID: {}", signal.id));
2142            }
2143        }
2144
2145        // Build signal ID set for reference checks
2146        let signal_ids: std::collections::HashSet<_> = self.signals.iter().map(|s| s.id).collect();
2147
2148        // 2. Track signal references point to existing signals
2149        for (track_id, track) in &self.tracks {
2150            for signal_ref in &track.signals {
2151                if !signal_ids.contains(&signal_ref.signal_id) {
2152                    errors.push(format!(
2153                        "Track {} references non-existent signal {}",
2154                        track_id, signal_ref.signal_id
2155                    ));
2156                }
2157            }
2158        }
2159
2160        // 3. signal_to_track consistency
2161        for (signal_id, track_id) in &self.signal_to_track {
2162            // Check track exists
2163            if let Some(track) = self.tracks.get(track_id) {
2164                // Check track contains the signal reference
2165                if !track.signals.iter().any(|r| r.signal_id == *signal_id) {
2166                    errors.push(format!(
2167                        "signal_to_track[{}] = {} but track doesn't contain signal",
2168                        signal_id, track_id
2169                    ));
2170                }
2171            } else {
2172                errors.push(format!(
2173                    "signal_to_track[{}] = {} but track doesn't exist",
2174                    signal_id, track_id
2175                ));
2176            }
2177        }
2178
2179        // 4. track_to_identity consistency
2180        for (track_id, identity_id) in &self.track_to_identity {
2181            // Check track exists and has matching identity_id
2182            if let Some(track) = self.tracks.get(track_id) {
2183                if track.identity_id != Some(*identity_id) {
2184                    errors.push(format!(
2185                        "track_to_identity[{}] = {} but track.identity_id = {:?}",
2186                        track_id, identity_id, track.identity_id
2187                    ));
2188                }
2189            } else {
2190                errors.push(format!(
2191                    "track_to_identity[{}] = {} but track doesn't exist",
2192                    track_id, identity_id
2193                ));
2194            }
2195
2196            // Check identity exists
2197            if !self.identities.contains_key(identity_id) {
2198                errors.push(format!(
2199                    "track_to_identity[{}] = {} but identity doesn't exist",
2200                    track_id, identity_id
2201                ));
2202            }
2203        }
2204
2205        // 5. Track identity references point to existing identities
2206        for (track_id, track) in &self.tracks {
2207            if let Some(identity_id) = track.identity_id {
2208                if !self.identities.contains_key(&identity_id) {
2209                    errors.push(format!(
2210                        "Track {} references non-existent identity {}",
2211                        track_id, identity_id
2212                    ));
2213                }
2214            }
2215        }
2216
2217        errors
2218    }
2219
2220    /// Check if all structural invariants hold.
2221    #[must_use]
2222    pub fn invariants_hold(&self) -> bool {
2223        self.validate_invariants().is_empty()
2224    }
2225
2226    /// Check if all signals are valid against document text.
2227    #[must_use]
2228    pub fn is_valid(&self) -> bool {
2229        self.signals.iter().all(|s| s.is_valid(&self.text))
2230    }
2231
2232    /// Add a signal, validating it first.
2233    ///
2234    /// Returns `Err` if the signal's offsets don't match the document text.
2235    pub fn add_signal_validated(
2236        &mut self,
2237        signal: Signal<Location>,
2238    ) -> Result<SignalId, SignalValidationError> {
2239        if let Some(err) = signal.validate_against(&self.text) {
2240            return Err(err);
2241        }
2242        Ok(self.add_signal(signal))
2243    }
2244
2245    /// Add a signal by finding text in document (safe construction).
2246    ///
2247    /// Returns the signal ID, or `None` if text not found.
2248    ///
2249    /// # Example
2250    ///
2251    /// ```rust
2252    /// use anno_core::GroundedDocument;
2253    ///
2254    /// let mut doc = GroundedDocument::new("test", "Marie Curie was a physicist.");
2255    /// let id = doc.add_signal_from_text("Marie Curie", "PER", 0.95);
2256    /// assert!(id.is_some());
2257    /// ```
2258    pub fn add_signal_from_text(
2259        &mut self,
2260        surface: &str,
2261        label: impl Into<super::types::TypeLabel>,
2262        confidence: f32,
2263    ) -> Option<SignalId> {
2264        let signal = Signal::from_text(&self.text, surface, label, confidence)?;
2265        Some(self.add_signal(signal))
2266    }
2267
2268    /// Add a signal by finding the nth occurrence of text.
2269    pub fn add_signal_from_text_nth(
2270        &mut self,
2271        surface: &str,
2272        label: impl Into<super::types::TypeLabel>,
2273        confidence: f32,
2274        occurrence: usize,
2275    ) -> Option<SignalId> {
2276        let signal = Signal::from_text_nth(&self.text, surface, label, confidence, occurrence)?;
2277        Some(self.add_signal(signal))
2278    }
2279
2280    // -------------------------------------------------------------------------
2281    // Statistics
2282    // -------------------------------------------------------------------------
2283
2284    /// Get statistics about the document.
2285    #[must_use]
2286    pub fn stats(&self) -> DocumentStats {
2287        let signal_count = self.signals.len();
2288        let track_count = self.tracks.len();
2289        let identity_count = self.identities.len();
2290
2291        let linked_track_count = self
2292            .tracks
2293            .values()
2294            .filter(|t| t.identity_id.is_some())
2295            .count();
2296        let untracked_count = self.untracked_signal_count();
2297
2298        let avg_track_size = if track_count > 0 {
2299            self.tracks.values().map(|t| t.len()).sum::<usize>() as f32 / track_count as f32
2300        } else {
2301            0.0
2302        };
2303
2304        let singleton_count = self.tracks.values().filter(|t| t.is_singleton()).count();
2305
2306        let avg_confidence = if signal_count > 0 {
2307            self.signals.iter().map(|s| s.confidence).sum::<f32>() / signal_count as f32
2308        } else {
2309            0.0
2310        };
2311
2312        let negated_count = self.signals.iter().filter(|s| s.negated).count();
2313
2314        // Count by modality
2315        let symbolic_count = self
2316            .signals
2317            .iter()
2318            .filter(|s| s.modality == Modality::Symbolic)
2319            .count();
2320        let iconic_count = self
2321            .signals
2322            .iter()
2323            .filter(|s| s.modality == Modality::Iconic)
2324            .count();
2325        let hybrid_count = self
2326            .signals
2327            .iter()
2328            .filter(|s| s.modality == Modality::Hybrid)
2329            .count();
2330
2331        DocumentStats {
2332            signal_count,
2333            track_count,
2334            identity_count,
2335            linked_track_count,
2336            untracked_count,
2337            avg_track_size,
2338            singleton_count,
2339            avg_confidence,
2340            negated_count,
2341            symbolic_count,
2342            iconic_count,
2343            hybrid_count,
2344        }
2345    }
2346
2347    // -------------------------------------------------------------------------
2348    // Batch Operations
2349    // -------------------------------------------------------------------------
2350
2351    /// Add multiple signals at once.
2352    ///
2353    /// Returns the IDs of all added signals.
2354    pub fn add_signals(
2355        &mut self,
2356        signals: impl IntoIterator<Item = Signal<Location>>,
2357    ) -> Vec<SignalId> {
2358        signals.into_iter().map(|s| self.add_signal(s)).collect()
2359    }
2360
2361    /// Create a track from a list of signal IDs.
2362    ///
2363    /// Automatically sets positions based on order.
2364    pub fn create_track_from_signals(
2365        &mut self,
2366        canonical: impl Into<String>,
2367        signal_ids: &[SignalId],
2368    ) -> Option<TrackId> {
2369        if signal_ids.is_empty() {
2370            return None;
2371        }
2372
2373        let mut track = Track::new(TrackId::ZERO, canonical);
2374        for (pos, &id) in signal_ids.iter().enumerate() {
2375            track.add_signal(id, pos as u32);
2376        }
2377        Some(self.add_track(track))
2378    }
2379
2380    /// Merge multiple tracks into one.
2381    ///
2382    /// The resulting track has all signals from the input tracks.
2383    /// The canonical surface comes from the first track.
2384    pub fn merge_tracks(&mut self, track_ids: &[TrackId]) -> Option<TrackId> {
2385        if track_ids.is_empty() {
2386            return None;
2387        }
2388
2389        // Collect all signals from tracks to merge
2390        let mut all_signals: Vec<SignalRef> = Vec::new();
2391        let mut canonical = String::new();
2392        let mut entity_type = None;
2393
2394        for &track_id in track_ids {
2395            if let Some(track) = self.tracks.get(&track_id) {
2396                if canonical.is_empty() {
2397                    canonical = track.canonical_surface.clone();
2398                    entity_type = track.entity_type.clone();
2399                }
2400                all_signals.extend(track.signals.iter().cloned());
2401            }
2402        }
2403
2404        if all_signals.is_empty() {
2405            return None;
2406        }
2407
2408        // Sort by position
2409        all_signals.sort_by_key(|s| s.position);
2410
2411        // Remove old tracks
2412        for &track_id in track_ids {
2413            self.tracks.remove(&track_id);
2414        }
2415
2416        // Create new merged track
2417        let mut new_track = Track::new(TrackId::ZERO, canonical);
2418        new_track.entity_type = entity_type;
2419        for (pos, signal_ref) in all_signals.iter().enumerate() {
2420            new_track.add_signal(signal_ref.signal_id, pos as u32);
2421        }
2422
2423        Some(self.add_track(new_track))
2424    }
2425
2426    /// Find all pairs of overlapping signals (potential duplicates or nested entities).
2427    #[must_use]
2428    pub fn find_overlapping_signal_pairs(&self) -> Vec<(SignalId, SignalId)> {
2429        let mut pairs = Vec::new();
2430        let signals: Vec<_> = self.signals.iter().collect();
2431
2432        for i in 0..signals.len() {
2433            for j in (i + 1)..signals.len() {
2434                if signals[i].location.overlaps(&signals[j].location) {
2435                    pairs.push((signals[i].id, signals[j].id));
2436                }
2437            }
2438        }
2439
2440        pairs
2441    }
2442}
2443
/// Summary counts and averages describing a grounded document.
#[derive(Debug, Clone, Copy, Default)]
pub struct DocumentStats {
    /// How many signals the document holds.
    pub signal_count: usize,
    /// How many tracks the document holds.
    pub track_count: usize,
    /// How many identities the document holds.
    pub identity_count: usize,
    /// How many tracks are linked to an identity.
    pub linked_track_count: usize,
    /// How many signals belong to no track.
    pub untracked_count: usize,
    /// Mean number of signals per track.
    pub avg_track_size: f32,
    /// How many tracks are singletons (a single mention).
    pub singleton_count: usize,
    /// Mean confidence over all signals.
    pub avg_confidence: f32,
    /// How many signals are negated.
    pub negated_count: usize,
    /// How many signals are symbolic (text).
    pub symbolic_count: usize,
    /// How many signals are iconic (visual).
    pub iconic_count: usize,
    /// How many signals are hybrid.
    pub hybrid_count: usize,
}
2472
2473impl std::fmt::Display for DocumentStats {
2474    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2475        writeln!(f, "Document Statistics:")?;
2476        writeln!(
2477            f,
2478            "  Signals: {} (avg confidence: {:.2})",
2479            self.signal_count, self.avg_confidence
2480        )?;
2481        writeln!(
2482            f,
2483            "  Tracks: {} (avg size: {:.1}, singletons: {})",
2484            self.track_count, self.avg_track_size, self.singleton_count
2485        )?;
2486        writeln!(
2487            f,
2488            "  Identities: {} ({} tracks linked)",
2489            self.identity_count, self.linked_track_count
2490        )?;
2491        writeln!(f, "  Untracked signals: {}", self.untracked_count)?;
2492        writeln!(
2493            f,
2494            "  Modalities: {} symbolic, {} iconic, {} hybrid",
2495            self.symbolic_count, self.iconic_count, self.hybrid_count
2496        )?;
2497        if self.negated_count > 0 {
2498            writeln!(f, "  Negated: {}", self.negated_count)?;
2499        }
2500        Ok(())
2501    }
2502}
2503
2504// =============================================================================
2505// Spatial Index for Efficient Range Queries
2506// =============================================================================
2507
2508/// A simple interval tree node for text span indexing.
2509///
2510/// This provides O(log n + k) lookup for signals within a text range,
2511/// where k is the number of results. Much faster than O(n) linear scan
2512/// for documents with many signals.
2513#[derive(Debug, Clone)]
2514struct IntervalNode {
2515    /// Signal ID
2516    signal_id: SignalId,
2517    /// Start offset (inclusive)
2518    start: usize,
2519    /// End offset (exclusive)
2520    end: usize,
2521    /// Maximum end in this subtree (for efficient pruning)
2522    max_end: usize,
2523    /// Left child
2524    left: Option<Box<IntervalNode>>,
2525    /// Right child
2526    right: Option<Box<IntervalNode>>,
2527}
2528
2529impl IntervalNode {
2530    fn new(signal_id: SignalId, start: usize, end: usize) -> Self {
2531        Self {
2532            signal_id,
2533            start,
2534            end,
2535            max_end: end,
2536            left: None,
2537            right: None,
2538        }
2539    }
2540
2541    fn insert(&mut self, signal_id: SignalId, start: usize, end: usize) {
2542        self.max_end = self.max_end.max(end);
2543
2544        if start < self.start {
2545            if let Some(ref mut left) = self.left {
2546                left.insert(signal_id, start, end);
2547            } else {
2548                self.left = Some(Box::new(IntervalNode::new(signal_id, start, end)));
2549            }
2550        } else if let Some(ref mut right) = self.right {
2551            right.insert(signal_id, start, end);
2552        } else {
2553            self.right = Some(Box::new(IntervalNode::new(signal_id, start, end)));
2554        }
2555    }
2556
2557    fn query_overlap(&self, query_start: usize, query_end: usize, results: &mut Vec<SignalId>) {
2558        // Check if this interval overlaps with query
2559        if self.start < query_end && query_start < self.end {
2560            results.push(self.signal_id);
2561        }
2562
2563        // Check left subtree if it could contain overlapping intervals
2564        if let Some(ref left) = self.left {
2565            if left.max_end > query_start {
2566                left.query_overlap(query_start, query_end, results);
2567            }
2568        }
2569
2570        // Check right subtree if query could overlap
2571        if let Some(ref right) = self.right {
2572            if self.start < query_end {
2573                right.query_overlap(query_start, query_end, results);
2574            }
2575        }
2576    }
2577
2578    fn query_containing(&self, query_start: usize, query_end: usize, results: &mut Vec<SignalId>) {
2579        // Check if this interval fully contains the query
2580        if self.start <= query_start && self.end >= query_end {
2581            results.push(self.signal_id);
2582        }
2583
2584        // Check left subtree if it could contain the range
2585        if let Some(ref left) = self.left {
2586            if left.max_end >= query_end {
2587                left.query_containing(query_start, query_end, results);
2588            }
2589        }
2590
2591        // Check right subtree
2592        if let Some(ref right) = self.right {
2593            if self.start <= query_start {
2594                right.query_containing(query_start, query_end, results);
2595            }
2596        }
2597    }
2598
2599    fn query_contained_in(
2600        &self,
2601        range_start: usize,
2602        range_end: usize,
2603        results: &mut Vec<SignalId>,
2604    ) {
2605        // Check if this interval is fully contained in range
2606        if self.start >= range_start && self.end <= range_end {
2607            results.push(self.signal_id);
2608        }
2609
2610        // Check left subtree
2611        if let Some(ref left) = self.left {
2612            left.query_contained_in(range_start, range_end, results);
2613        }
2614
2615        // Check right subtree if it could have contained intervals
2616        if let Some(ref right) = self.right {
2617            if self.start < range_end {
2618                right.query_contained_in(range_start, range_end, results);
2619            }
2620        }
2621    }
2622}
2623
2624/// Spatial index for text signals using an interval tree.
2625///
2626/// Enables efficient queries:
2627/// - `query_overlap(start, end)`: Find signals that overlap with range
2628/// - `query_containing(start, end)`: Find signals that fully contain range
2629/// - `query_contained_in(start, end)`: Find signals fully within range
2630///
2631/// # Performance
2632///
2633/// - Build: O(n log n)
2634/// - Query: O(log n + k) where k is result count
2635/// - Space: O(n)
2636///
2637/// For documents with >100 signals, this provides significant speedup
2638/// over linear scan for range queries.
2639#[derive(Debug, Clone, Default)]
2640pub struct TextSpatialIndex {
2641    root: Option<IntervalNode>,
2642    size: usize,
2643}
2644
2645impl TextSpatialIndex {
2646    /// Create a new empty index.
2647    #[must_use]
2648    pub fn new() -> Self {
2649        Self::default()
2650    }
2651
2652    /// Build index from signals in a document.
2653    #[must_use]
2654    pub fn from_signals(signals: &[Signal<Location>]) -> Self {
2655        let mut index = Self::new();
2656        for signal in signals {
2657            if let Some((start, end)) = signal.location.text_offsets() {
2658                index.insert(signal.id, start, end);
2659            }
2660        }
2661        index
2662    }
2663
2664    /// Insert a text span into the index.
2665    pub fn insert(&mut self, signal_id: SignalId, start: usize, end: usize) {
2666        if let Some(ref mut root) = self.root {
2667            root.insert(signal_id, start, end);
2668        } else {
2669            self.root = Some(IntervalNode::new(signal_id, start, end));
2670        }
2671        self.size += 1;
2672    }
2673
2674    /// Find signals that overlap with the given range.
2675    #[must_use]
2676    pub fn query_overlap(&self, start: usize, end: usize) -> Vec<SignalId> {
2677        let mut results = Vec::new();
2678        if let Some(ref root) = self.root {
2679            root.query_overlap(start, end, &mut results);
2680        }
2681        results
2682    }
2683
2684    /// Find signals that fully contain the given range.
2685    #[must_use]
2686    pub fn query_containing(&self, start: usize, end: usize) -> Vec<SignalId> {
2687        let mut results = Vec::new();
2688        if let Some(ref root) = self.root {
2689            root.query_containing(start, end, &mut results);
2690        }
2691        results
2692    }
2693
2694    /// Find signals fully contained within the given range.
2695    #[must_use]
2696    pub fn query_contained_in(&self, start: usize, end: usize) -> Vec<SignalId> {
2697        let mut results = Vec::new();
2698        if let Some(ref root) = self.root {
2699            root.query_contained_in(start, end, &mut results);
2700        }
2701        results
2702    }
2703
2704    /// Number of entries in the index.
2705    #[must_use]
2706    pub fn len(&self) -> usize {
2707        self.size
2708    }
2709
2710    /// Check if the index is empty.
2711    #[must_use]
2712    pub fn is_empty(&self) -> bool {
2713        self.size == 0
2714    }
2715}
2716
2717impl GroundedDocument {
2718    /// Build a spatial index for efficient text range queries.
2719    ///
2720    /// This is useful for documents with many signals where you need
2721    /// to frequently query by text position.
2722    ///
2723    /// # Example
2724    ///
2725    /// ```rust
2726    /// use anno_core::{GroundedDocument, Signal, Location};
2727    ///
2728    /// let mut doc = GroundedDocument::new("doc", "Some text with entities.");
2729    /// doc.add_signal(Signal::new(0, Location::text(0, 4), "Some", "T", 0.9));
2730    /// doc.add_signal(Signal::new(0, Location::text(10, 14), "with", "T", 0.9));
2731    ///
2732    /// let index = doc.build_text_index();
2733    /// let in_range = index.query_contained_in(0, 20);
2734    /// assert_eq!(in_range.len(), 2);
2735    /// ```
2736    #[must_use]
2737    pub fn build_text_index(&self) -> TextSpatialIndex {
2738        TextSpatialIndex::from_signals(&self.signals)
2739    }
2740
2741    /// Query signals using the spatial index (builds index if needed).
2742    ///
2743    /// For repeated queries, build the index once with `build_text_index()`
2744    /// and reuse it.
2745    #[must_use]
2746    pub fn query_signals_in_range_indexed(
2747        &self,
2748        start: usize,
2749        end: usize,
2750    ) -> Vec<&Signal<Location>> {
2751        let index = self.build_text_index();
2752        let ids = index.query_contained_in(start, end);
2753        ids.iter().filter_map(|&id| self.get_signal(id)).collect()
2754    }
2755
2756    /// Query overlapping signals using spatial index.
2757    #[must_use]
2758    pub fn query_overlapping_signals_indexed(
2759        &self,
2760        start: usize,
2761        end: usize,
2762    ) -> Vec<&Signal<Location>> {
2763        let index = self.build_text_index();
2764        let ids = index.query_overlap(start, end);
2765        ids.iter().filter_map(|&id| self.get_signal(id)).collect()
2766    }
2767
2768    /// Convert this grounded document into a coreference document for evaluation.
2769    ///
2770    /// This is a lightweight bridge between the production pipeline types
2771    /// (Signal/Track/Identity) and the evaluation-oriented coreference types
2772    /// (`CorefDocument`, `CorefChain`, `Mention`).
2773    ///
2774    /// - Each [`Track`] becomes a [`super::coref::CorefChain`]
2775    /// - Each track mention is derived from the track's signal locations
2776    /// - Non-text signals (iconic-only locations) are skipped
2777    ///
2778    /// Note: Mention typing (proper/nominal/pronominal) is left unset; callers
2779    /// doing mention-type evaluation should compute that separately.
2780    #[must_use]
2781    pub fn to_coref_document(&self) -> super::coref::CorefDocument {
2782        use super::coref::{CorefChain, CorefDocument, Mention};
2783        use std::collections::HashMap;
2784
2785        // Build a fast index for signal lookup.
2786        let signal_by_id: HashMap<SignalId, &Signal<Location>> =
2787            self.signals.iter().map(|s| (s.id, s)).collect();
2788
2789        let mut chains: Vec<CorefChain> = Vec::new();
2790
2791        for track in self.tracks.values() {
2792            let mut mentions: Vec<Mention> = Vec::new();
2793
2794            for sref in &track.signals {
2795                let Some(signal) = signal_by_id.get(&sref.signal_id) else {
2796                    continue;
2797                };
2798
2799                let Some((start, end)) = signal.location.text_offsets() else {
2800                    continue;
2801                };
2802
2803                let mut m = Mention::new(signal.surface.clone(), start, end);
2804                m.entity_type = Some(signal.label.to_string());
2805                mentions.push(m);
2806            }
2807
2808            if mentions.is_empty() {
2809                continue;
2810            }
2811
2812            let mut chain = CorefChain::new(mentions);
2813            chain.entity_type = track.entity_type.as_ref().map(|t| t.to_string());
2814            chains.push(chain);
2815        }
2816
2817        // Deterministic ordering: sort by earliest mention.
2818        chains.sort_by_key(|c| c.mentions.first().map(|m| m.start).unwrap_or(usize::MAX));
2819
2820        CorefDocument::with_id(&self.text, &self.id, chains)
2821    }
2822}
2823
2824// =============================================================================
2825// HTML Visualization (Brutalist/Functional Style)
2826// =============================================================================
2827
2828/// Generate an HTML visualization of a grounded document.
2829///
2830/// Brutalist design: monospace, dense tables, no decoration, raw data.
2831pub fn render_document_html(doc: &GroundedDocument) -> String {
2832    let mut html = String::new();
2833    let stats = doc.stats();
2834
2835    html.push_str(r#"<!DOCTYPE html>
2836<html>
2837<head>
2838<meta charset="UTF-8">
2839<meta name="color-scheme" content="dark light">
2840<title>grounded::GroundedDocument</title>
2841<style>
2842:root{
2843  /* Allow UA widgets (inputs/scrollbars) to match the theme */
2844  color-scheme: light dark;
2845  /* Dark (default) */
2846  --bg:#0a0a0a;
2847  --panel-bg:#0d0d0d;
2848  --text:#b0b0b0;
2849  --text-strong:#fff;
2850  --muted:#666;
2851  --border:#222;
2852  --border-strong:#333;
2853  --hover:#111;
2854  --input-bg:#080808;
2855  --active:#fff;
2856  --track-strong:rgba(255,255,255,0.35);
2857  --track-soft:rgba(255,255,255,0.18);
2858  /* Entity colors (dark) */
2859  --per-bg:#1a1a2e; --per-br:#4a4a8a; --per-tx:#8888cc;
2860  --org-bg:#1a2e1a; --org-br:#4a8a4a; --org-tx:#88cc88;
2861  --loc-bg:#2e2e1a; --loc-br:#8a8a4a; --loc-tx:#cccc88;
2862  --mis-bg:#1a1a1a; --mis-br:#4a4a4a; --mis-tx:#999;
2863  --dat-bg:#2e1a1a; --dat-br:#8a4a4a; --dat-tx:#cc8888;
2864  --badge-y-bg:#1a2e1a; --badge-y-tx:#4a8a4a; --badge-y-br:#2a4a2a;
2865  --badge-n-bg:#2e2e1a; --badge-n-tx:#8a8a4a; --badge-n-br:#4a4a2a;
2866}
2867@media (prefers-color-scheme: light){
2868  :root{
2869    --bg:#ffffff;
2870    --panel-bg:#f7f7f7;
2871    --text:#222;
2872    --text-strong:#000;
2873    --muted:#555;
2874    --border:#d6d6d6;
2875    --border-strong:#c6c6c6;
2876    --hover:#f0f0f0;
2877    --input-bg:#ffffff;
2878    --active:#000;
2879    --track-strong:rgba(0,0,0,0.25);
2880    --track-soft:rgba(0,0,0,0.12);
2881    /* Entity colors (light) */
2882    --per-bg:#e9e9ff; --per-br:#6c6cff; --per-tx:#2b2b7a;
2883    --org-bg:#e9f7e9; --org-br:#2f8a2f; --org-tx:#1f5a1f;
2884    --loc-bg:#fff7db; --loc-br:#8a7a2f; --loc-tx:#5a4d12;
2885    --mis-bg:#f2f2f2; --mis-br:#8a8a8a; --mis-tx:#333;
2886    --dat-bg:#ffe9e9; --dat-br:#8a2f2f; --dat-tx:#5a1f1f;
2887    --badge-y-bg:#e9f7e9; --badge-y-tx:#1f5a1f; --badge-y-br:#9ad19a;
2888    --badge-n-bg:#fff7db; --badge-n-tx:#5a4d12; --badge-n-br:#e2d39a;
2889  }
2890}
2891html[data-theme='dark']{
2892  --bg:#0a0a0a; --panel-bg:#0d0d0d; --text:#b0b0b0; --text-strong:#fff;
2893  --muted:#666; --border:#222; --border-strong:#333; --hover:#111;
2894  --input-bg:#080808; --active:#fff;
2895  --track-strong:rgba(255,255,255,0.35); --track-soft:rgba(255,255,255,0.18);
2896  --per-bg:#1a1a2e; --per-br:#4a4a8a; --per-tx:#8888cc;
2897  --org-bg:#1a2e1a; --org-br:#4a8a4a; --org-tx:#88cc88;
2898  --loc-bg:#2e2e1a; --loc-br:#8a8a4a; --loc-tx:#cccc88;
2899  --mis-bg:#1a1a1a; --mis-br:#4a4a4a; --mis-tx:#999;
2900  --dat-bg:#2e1a1a; --dat-br:#8a4a4a; --dat-tx:#cc8888;
2901  --badge-y-bg:#1a2e1a; --badge-y-tx:#4a8a4a; --badge-y-br:#2a4a2a;
2902  --badge-n-bg:#2e2e1a; --badge-n-tx:#8a8a4a; --badge-n-br:#4a4a2a;
2903}
2904html[data-theme='light']{
2905  --bg:#ffffff; --panel-bg:#f7f7f7; --text:#222; --text-strong:#000;
2906  --muted:#555; --border:#d6d6d6; --border-strong:#c6c6c6; --hover:#f0f0f0;
2907  --input-bg:#ffffff; --active:#000;
2908  --track-strong:rgba(0,0,0,0.25); --track-soft:rgba(0,0,0,0.12);
2909  --per-bg:#e9e9ff; --per-br:#6c6cff; --per-tx:#2b2b7a;
2910  --org-bg:#e9f7e9; --org-br:#2f8a2f; --org-tx:#1f5a1f;
2911  --loc-bg:#fff7db; --loc-br:#8a7a2f; --loc-tx:#5a4d12;
2912  --mis-bg:#f2f2f2; --mis-br:#8a8a8a; --mis-tx:#333;
2913  --dat-bg:#ffe9e9; --dat-br:#8a2f2f; --dat-tx:#5a1f1f;
2914  --badge-y-bg:#e9f7e9; --badge-y-tx:#1f5a1f; --badge-y-br:#9ad19a;
2915  --badge-n-bg:#fff7db; --badge-n-tx:#5a4d12; --badge-n-br:#e2d39a;
2916}
2917
2918*{box-sizing:border-box;margin:0;padding:0}
2919body{font:12px/1.4 monospace;background:var(--bg);color:var(--text);padding:8px}
2920h1,h2,h3{color:var(--text-strong);font-weight:normal;border-bottom:1px solid var(--border-strong);padding:4px 0;margin:16px 0 8px}
2921h1{font-size:14px}h2{font-size:12px}h3{font-size:11px;color:var(--muted)}
2922 a{color:inherit}
2923 a:hover{text-decoration:underline}
2924table{width:100%;border-collapse:collapse;font-size:11px;margin:4px 0}
2925th,td{padding:4px 8px;text-align:left;border:1px solid var(--border)}
2926th{background:var(--hover);color:var(--muted);font-weight:normal;text-transform:uppercase;font-size:10px}
2927tr:hover{background:var(--hover)}
2928.grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(300px,1fr));gap:8px}
2929.panel{border:1px solid var(--border);background:var(--panel-bg);padding:8px}
2930.panel-h{display:flex;align-items:center;gap:8px}
2931.toggle{cursor:pointer;user-select:none;color:var(--muted);border:1px solid var(--border);background:var(--bg);padding:2px 6px;font-size:10px}
2932.panel-collapsed table,.panel-collapsed .panel-body{display:none}
2933.toolbar{display:flex;gap:8px;align-items:center;margin:8px 0 0}
2934.toolbar input{width:100%;max-width:520px;background:var(--input-bg);border:1px solid var(--border);color:var(--text);padding:6px 8px;font:12px monospace}
2935.muted{color:var(--muted)}
2936.panel-body{white-space:pre-wrap;word-break:break-word}
2937.text-box{background:var(--input-bg);border:1px solid var(--border);padding:8px;white-space:pre-wrap;word-break:break-word;line-height:1.6}
2938.e{padding:1px 2px;border-bottom:1px solid}
2939.seg{cursor:pointer}
2940.e-per{background:var(--per-bg);border-color:var(--per-br);color:var(--per-tx)}
2941.e-org{background:var(--org-bg);border-color:var(--org-br);color:var(--org-tx)}
2942.e-loc{background:var(--loc-bg);border-color:var(--loc-br);color:var(--loc-tx)}
2943.e-misc{background:var(--mis-bg);border-color:var(--mis-br);color:var(--mis-tx)}
2944.e-date{background:var(--dat-bg);border-color:var(--dat-br);color:var(--dat-tx)}
2945.e-track{box-shadow:inset 0 0 0 1px var(--track-strong)}
2946.e-track-hover{box-shadow:inset 0 0 0 1px var(--track-soft)}
2947.e-active{outline:2px solid var(--active);outline-offset:1px}
2948.conf{color:var(--muted);font-size:10px}
2949.badge{display:inline-block;padding:1px 4px;font-size:9px;text-transform:uppercase}
2950.badge-y{background:var(--badge-y-bg);color:var(--badge-y-tx);border:1px solid var(--badge-y-br)}
2951.badge-n{background:var(--badge-n-bg);color:var(--badge-n-tx);border:1px solid var(--badge-n-br)}
2952.stats{display:flex;gap:16px;padding:8px 0;border-bottom:1px solid var(--border);margin-bottom:8px}
2953.stat{text-align:center}.stat-v{font-size:18px;color:var(--text-strong)}.stat-l{font-size:9px;color:var(--muted);text-transform:uppercase}
2954.id{color:var(--muted);font-size:9px}
2955.kb{color:var(--muted)}
2956.arrow{color:var(--muted)}
2957</style>
2958</head>
2959<body>
2960"#);
2961
2962    // Header with stats
2963    html.push_str(&format!(
2964        r#"<div class="panel-h" style="justify-content:space-between"><h1>doc_id="{}" len={}</h1><span class="toggle" id="theme-toggle" title="toggle theme (auto → dark → light)">theme: auto</span></div>"#,
2965        html_escape(&doc.id),
2966        doc.text.len()
2967    ));
2968
2969    html.push_str(r#"<div class="stats">"#);
2970    html.push_str(&format!(
2971        r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">signals</div></div>"#,
2972        stats.signal_count
2973    ));
2974    html.push_str(&format!(
2975        r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">tracks</div></div>"#,
2976        stats.track_count
2977    ));
2978    html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">identities</div></div>"#, stats.identity_count));
2979    html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{:.2}</div><div class="stat-l">avg_conf</div></div>"#, stats.avg_confidence));
2980    html.push_str(&format!(
2981        r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">linked</div></div>"#,
2982        stats.linked_track_count
2983    ));
2984    html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">untracked</div></div>"#, stats.untracked_count));
2985    if stats.iconic_count > 0 || stats.hybrid_count > 0 {
2986        html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}/{}/{}</div><div class="stat-l">sym/ico/hyb</div></div>"#,
2987            stats.symbolic_count, stats.iconic_count, stats.hybrid_count));
2988    }
2989    html.push_str(r#"</div>"#);
2990
2991    // Annotated text
2992    html.push_str(r#"<h2>text</h2>"#);
2993    html.push_str(r#"<div class="text-box">"#);
2994    html.push_str(&annotate_text_html(
2995        &doc.text,
2996        doc.signals(),
2997        &doc.signal_to_track,
2998    ));
2999    html.push_str(r#"</div>"#);
3000
3001    // Selection panel (filled by JS)
3002    html.push_str(
3003        r#"<h2>selection</h2><div class="panel" id="selection-panel" role="region" aria-label="selection"><div class="panel-h"><h3>selection</h3><span class="muted" id="selection-hint" role="status" aria-live="polite">click a mention / row to see coref track details</span></div><pre class="panel-body" id="selection-body" role="textbox" aria-readonly="true" aria-label="selection details">—</pre></div>"#,
3004    );
3005
3006    // Grid layout for three levels
3007    html.push_str(r#"<div class="grid">"#);
3008
3009    // Level 1: Signals table
3010    html.push_str(r#"<div class="panel" id="panel-signals"><div class="panel-h"><h3>signals (level 1)</h3><span class="toggle" data-toggle="panel-signals">toggle</span></div><div class="toolbar"><input id="signal-filter" type="text" placeholder="filter signals: id / label / surface (e.g. 'PER', 'S12', 'Paris')" /><span class="muted" id="signal-filter-count"></span></div><table id="signals-table">"#);
3011    html.push_str(r#"<tr><th>id</th><th>span</th><th>surface</th><th>label</th><th>conf</th><th>track</th></tr>"#);
3012    for signal in doc.signals() {
3013        let (span, start_opt, end_opt) = if let Some((s, e)) = signal.location.text_offsets() {
3014            (format!("[{},{})", s, e), Some(s), Some(e))
3015        } else {
3016            ("bbox".to_string(), None, None)
3017        };
3018        let track_id_num = doc.signal_to_track.get(&signal.id).copied();
3019        let track_id = track_id_num
3020            .map(|t| format!("T{}", t))
3021            .unwrap_or_else(|| "-".to_string());
3022        let track_attr = track_id_num
3023            .map(|t| format!(r#" data-track="{}""#, t))
3024            .unwrap_or_default();
3025        let offs_attr = match (start_opt, end_opt) {
3026            (Some(s), Some(e)) => format!(r#" data-start="{}" data-end="{}""#, s, e),
3027            _ => String::new(),
3028        };
3029        let neg = if signal.negated { " NEG" } else { "" };
3030        html.push_str(&format!(
3031            r#"<tr data-sid="S{sid}" data-label="{label}" data-surface="{surface}"{track_attr}{offs_attr} data-conf="{conf:.2}"><td class="id"><a href='#S{sid}'>S{sid}</a></td><td>{span}</td><td>{surface}</td><td>{label}{neg}</td><td class="conf">{conf:.2}</td><td class="id">{track}</td></tr>"#,
3032            sid = signal.id,
3033            span = span,
3034            surface = html_escape(&signal.surface),
3035            label = html_escape(signal.label.as_str()),
3036            neg = neg,
3037            conf = signal.confidence,
3038            track = track_id,
3039            track_attr = track_attr,
3040            offs_attr = offs_attr
3041        ));
3042    }
3043    html.push_str(r#"</table></div>"#);
3044
3045    // Level 2: Tracks table
3046    html.push_str(r#"<div class="panel" id="panel-tracks"><div class="panel-h"><h3>tracks (level 2)</h3><span class="toggle" data-toggle="panel-tracks">toggle</span></div><table id="tracks-table">"#);
3047    html.push_str(r#"<tr><th>id</th><th>canonical</th><th>type</th><th>|S|</th><th>signals</th><th>identity</th></tr>"#);
3048    for track in doc.tracks() {
3049        let entity_type = track
3050            .entity_type
3051            .as_ref()
3052            .map(|t| t.as_str())
3053            .unwrap_or("-");
3054        let signals: Vec<String> = track
3055            .signals
3056            .iter()
3057            .map(|s| format!("S{}", s.signal_id))
3058            .collect();
3059        let identity = doc
3060            .identity_for_track(track.id)
3061            .map(|i| format!("I{}", i.id))
3062            .unwrap_or_else(|| "-".to_string());
3063        let linked_badge = if track.identity_id.is_some() {
3064            r#"<span class="badge badge-y">y</span>"#
3065        } else {
3066            r#"<span class="badge badge-n">n</span>"#
3067        };
3068        html.push_str(&format!(
3069            r#"<tr data-tid="{tid}"><td class="id">T{tid}</td><td>{canonical_surface}</td><td>{etype}</td><td>{n}</td><td class="id">{sigs}</td><td class="id">{ident} {badge}</td></tr>"#,
3070            tid = track.id,
3071            canonical_surface = html_escape(&track.canonical_surface),
3072            etype = html_escape(entity_type),
3073            n = track.len(),
3074            sigs = html_escape(&signals.join(" ")),
3075            ident = identity,
3076            badge = linked_badge
3077        ));
3078    }
3079    html.push_str(r#"</table></div>"#);
3080
3081    // Level 3: Identities table
3082    html.push_str(r#"<div class="panel" id="panel-identities"><div class="panel-h"><h3>identities (level 3)</h3><span class="toggle" data-toggle="panel-identities">toggle</span></div><table>"#);
3083    html.push_str(r#"<tr><th>id</th><th>name</th><th>type</th><th>kb</th><th>kb_id</th><th>aliases</th></tr>"#);
3084    for identity in doc.identities() {
3085        let kb = identity.kb_name.as_deref().unwrap_or("-");
3086        let kb_id = identity.kb_id.as_deref().unwrap_or("-");
3087        let entity_type = identity
3088            .entity_type
3089            .as_ref()
3090            .map(|t| t.as_str())
3091            .unwrap_or("-");
3092        let aliases = if identity.aliases.is_empty() {
3093            "-".to_string()
3094        } else {
3095            identity.aliases.join(", ")
3096        };
3097        html.push_str(&format!(
3098            r#"<tr><td class="id">I{}</td><td>{}</td><td>{}</td><td class="kb">{}</td><td class="kb">{}</td><td>{}</td></tr>"#,
3099            identity.id, html_escape(&identity.canonical_name), entity_type, kb, kb_id, html_escape(&aliases)
3100        ));
3101    }
3102    html.push_str(r#"</table></div>"#);
3103
3104    html.push_str(r#"</div>"#); // end grid
3105
3106    // Signal-Track-Identity mapping (compact view)
3107    html.push_str(r#"<h2>hierarchy trace</h2><div class="panel"><table>"#);
3108    html.push_str(r#"<tr><th>signal</th><th></th><th>track</th><th></th><th>identity</th><th>kb_id</th></tr>"#);
3109    for signal in doc.signals() {
3110        let track = doc.track_for_signal(signal.id);
3111        let identity = doc.identity_for_signal(signal.id);
3112
3113        let track_str = track
3114            .map(|t| format!("T{} \"{}\"", t.id, html_escape(&t.canonical_surface)))
3115            .unwrap_or_else(|| "-".to_string());
3116        let identity_str = identity
3117            .map(|i| format!("I{} \"{}\"", i.id, html_escape(&i.canonical_name)))
3118            .unwrap_or_else(|| "-".to_string());
3119        let kb_str = identity
3120            .and_then(|i| i.kb_id.as_ref())
3121            .map(|s| s.as_str())
3122            .unwrap_or("-");
3123
3124        html.push_str(&format!(
3125            r#"<tr><td>S{} "{}"</td><td class="arrow">→</td><td>{}</td><td class="arrow">→</td><td>{}</td><td class="kb">{}</td></tr>"#,
3126            signal.id, html_escape(&signal.surface), track_str, identity_str, kb_str
3127        ));
3128    }
3129    html.push_str(r#"</table></div>"#);
3130
3131    // Minimal JS: click a signal row → highlight that mention in the text box.
3132    // Also support filtering signals by substring match.
3133    html.push_str(r#"<script>
3134(() => {
3135  // Index signal metadata from the signals table, and map signal/track → text elements.
3136  const signalMeta = new Map();
3137  document.querySelectorAll('#signals-table tr[data-sid]').forEach((row) => {
3138    const sid = row.getAttribute('data-sid');
3139    if (!sid) return;
3140    signalMeta.set(sid, {
3141      sid,
3142      label: row.getAttribute('data-label') || '',
3143      surface: row.getAttribute('data-surface') || '',
3144      conf: row.getAttribute('data-conf') || '',
3145      start: row.getAttribute('data-start'),
3146      end: row.getAttribute('data-end'),
3147      track: row.getAttribute('data-track'),
3148    });
3149  });
3150
3151  const signalEls = new Map();
3152  const addSignalEl = (sid, el) => {
3153    if (!sid || !el) return;
3154    const arr = signalEls.get(sid) || [];
3155    arr.push(el);
3156    signalEls.set(sid, arr);
3157  };
3158  // Old-style inline spans (non-overlapping renderer).
3159  document.querySelectorAll('span.e[data-sid]').forEach((el) => {
3160    addSignalEl(el.getAttribute('data-sid'), el);
3161  });
3162  // Segmented spans (overlap/discontinuous-safe renderer).
3163  document.querySelectorAll('span.seg[data-sids]').forEach((el) => {
3164    const raw = (el.getAttribute('data-sids') || '').trim();
3165    if (!raw) return;
3166    raw.split(/\s+/).filter(Boolean).forEach((sid) => addSignalEl(sid, el));
3167  });
3168
3169  const trackEls = new Map();
3170  for (const [sid, els] of signalEls.entries()) {
3171    const meta = signalMeta.get(sid);
3172    const tid = meta ? meta.track : null;
3173    if (!tid) continue;
3174    const arr = trackEls.get(tid) || [];
3175    els.forEach((el) => arr.push(el));
3176    trackEls.set(tid, arr);
3177  }
3178
3179  const selectionBody = document.getElementById('selection-body');
3180  const selectionHint = document.getElementById('selection-hint');
3181  const defaultHint = selectionHint ? (selectionHint.textContent || '') : '';
3182  const setSelection = (text) => {
3183    if (!selectionBody) return;
3184    selectionBody.textContent = text;
3185  };
3186  const setHint = (text) => {
3187    if (!selectionHint) return;
3188    selectionHint.textContent = text || defaultHint;
3189  };
3190
3191  // Theme toggle: auto (prefers-color-scheme) → dark → light.
3192  const themeBtn = document.getElementById('theme-toggle');
3193  const themeKey = 'anno-theme';
3194  const applyTheme = (theme) => {
3195    const t = theme || 'auto';
3196    if (t === 'auto') {
3197      delete document.documentElement.dataset.theme;
3198    } else {
3199      document.documentElement.dataset.theme = t;
3200    }
3201    if (themeBtn) themeBtn.textContent = `theme: ${t}`;
3202  };
3203  const readTheme = () => {
3204    try { return localStorage.getItem(themeKey) || 'auto'; } catch (_) { return 'auto'; }
3205  };
3206  const writeTheme = (t) => {
3207    try { localStorage.setItem(themeKey, t); } catch (_) { /* ignore */ }
3208  };
3209  applyTheme(readTheme());
3210  if (themeBtn) {
3211    themeBtn.addEventListener('click', () => {
3212      const cur = readTheme();
3213      const next = cur === 'auto' ? 'dark' : (cur === 'dark' ? 'light' : 'auto');
3214      writeTheme(next);
3215      applyTheme(next);
3216    });
3217  }
3218
3219  let activeSignalEls = [];
3220  let activeSignalRow = null;
3221  const clearActive = () => {
3222    if (activeSignalEls && activeSignalEls.length) {
3223      activeSignalEls.forEach((el) => el.classList.remove('e-active'));
3224    }
3225    if (activeSignalRow) activeSignalRow.classList.remove('e-active');
3226    activeSignalEls = [];
3227    activeSignalRow = null;
3228  };
3229
3230  let activeTrack = null;
3231  let hoverTrack = null;
3232
3233  const removeTrackClass = (tid, cls) => {
3234    if (!tid) return;
3235    const els = trackEls.get(tid);
3236    if (!els) return;
3237    els.forEach((el) => el.classList.remove(cls));
3238  };
3239
3240  const addTrackClass = (tid, cls) => {
3241    if (!tid) return;
3242    const els = trackEls.get(tid);
3243    if (!els) return;
3244    els.forEach((el) => el.classList.add(cls));
3245  };
3246
3247  const trackSize = (tid) => {
3248    const els = tid ? trackEls.get(tid) : null;
3249    return els ? els.length : 0;
3250  };
3251
3252  const getTrackSelectionText = (tid) => {
3253    if (!tid) return 'track: - (untracked)';
3254    const row = document.querySelector(`#tracks-table tr[data-tid='${tid}']`);
3255    if (!row) return `track T${tid}`;
3256    const cells = row.querySelectorAll('td');
3257    const canonical = (cells[1]?.textContent || '').trim();
3258    const etype = (cells[2]?.textContent || '').trim();
3259    const count = (cells[3]?.textContent || '').trim();
3260    const sigs = (cells[4]?.textContent || '').trim();
3261    const lines = [];
3262    lines.push(`track T${tid} canonical="${canonical}" type="${etype}" mentions=${count}`);
3263    if (sigs) lines.push(`track signals: ${sigs}`);
3264    return lines.join('\n');
3265  };
3266
3267  const renderTrackSelection = (tid) => setSelection(getTrackSelectionText(tid));
3268
3269  const renderSignalSelectionBySid = (sid) => {
3270    const meta = signalMeta.get(sid);
3271    const label = meta ? (meta.label || '') : '';
3272    const conf = meta ? (meta.conf || '') : '';
3273    const start = meta ? meta.start : null;
3274    const end = meta ? meta.end : null;
3275    const tid = meta ? meta.track : null;
3276    const lines = [];
3277    if (start !== null && end !== null) {
3278      lines.push(`signal ${sid} label=${label} conf=${conf} span=[${start},${end})`);
3279    } else {
3280      lines.push(`signal ${sid} label=${label} conf=${conf}`);
3281    }
3282    if (meta && meta.surface) lines.push(`surface: ${meta.surface}`);
3283    lines.push('');
3284    lines.push(getTrackSelectionText(tid));
3285    setSelection(lines.join('\n'));
3286  };
3287
3288  const setActiveTrack = (tid) => {
3289    const next = tid || null;
3290    if (activeTrack === next) return;
3291    removeTrackClass(activeTrack, 'e-track');
3292    activeTrack = next;
3293    if (activeTrack) addTrackClass(activeTrack, 'e-track');
3294    if (hoverTrack && activeTrack && hoverTrack === activeTrack) {
3295      removeTrackClass(hoverTrack, 'e-track-hover');
3296    }
3297  };
3298
3299  const setHoverTrack = (tid) => {
3300    const next = tid || null;
3301    if (hoverTrack === next) return;
3302    removeTrackClass(hoverTrack, 'e-track-hover');
3303    hoverTrack = next;
3304    if (!hoverTrack) {
3305      setHint('');
3306      return;
3307    }
3308    if (activeTrack && hoverTrack === activeTrack) {
3309      setHint(`selected track T${hoverTrack} (${trackSize(hoverTrack)} mentions)`);
3310      return;
3311    }
3312    addTrackClass(hoverTrack, 'e-track-hover');
3313    setHint(`hover track T${hoverTrack} (${trackSize(hoverTrack)} mentions)`);
3314  };
3315
3316  const emitToParentSpan = (start, end) => {
3317    try {
3318      if (!window.parent || window.parent === window) return;
3319      if (start === null || end === null) return;
3320      window.parent.postMessage({ type: 'anno:activate-span', start: Number(start), end: Number(end) }, '*');
3321    } catch (_) {
3322      // ignore: best-effort bridge for iframe containers
3323    }
3324  };
3325
3326  const activateBySpan = (start, end, emit) => {
3327    if (start === null || end === null || start === undefined || end === undefined) return;
3328    // Prefer an exact signal span if present; otherwise fall back to the table row metadata.
3329    const el = document.querySelector(`span.e[data-sid][data-start='${start}'][data-end='${end}']`);
3330    if (el) {
3331      const sid = el.getAttribute('data-sid');
3332      if (sid) activateSignal(sid, emit);
3333      return;
3334    }
3335    const row = document.querySelector(`#signals-table tr[data-start='${start}'][data-end='${end}']`);
3336    if (!row) return;
3337    const sid = row.getAttribute('data-sid');
3338    if (!sid) return;
3339    activateSignal(sid, emit);
3340  };
3341
3342  const activateSignal = (sid, emit) => {
3343    clearActive();
3344    const els = signalEls.get(sid) || [];
3345    if (!els.length) return;
3346    els.forEach((el) => el.classList.add('e-active'));
3347    activeSignalEls = els;
3348    const row = document.querySelector(`#signals-table tr[data-sid='${sid}']`);
3349    if (row) {
3350      row.classList.add('e-active');
3351      activeSignalRow = row;
3352    }
3353    const primaryEl = els[0];
3354    primaryEl.scrollIntoView({ block: 'center', behavior: 'smooth' });
3355    const meta = signalMeta.get(sid);
3356    const tid = meta ? meta.track : primaryEl.getAttribute('data-track');
3357    setActiveTrack(tid);
3358    renderSignalSelectionBySid(sid);
3359    if (emit && meta && meta.start !== null && meta.end !== null) {
3360      emitToParentSpan(meta.start, meta.end);
3361    }
3362  };
3363
3364  // Table click
3365  const signalsTable = document.getElementById('signals-table');
3366  if (signalsTable) {
3367    signalsTable.addEventListener('click', (ev) => {
3368      const a = ev.target && ev.target.closest ? ev.target.closest("a[href^='#S']") : null;
3369      const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-sid]') : null;
3370      const sid = (a && a.getAttribute('href') ? a.getAttribute('href').slice(1) : null) || (row ? row.getAttribute('data-sid') : null);
3371      if (!sid) return;
3372      ev.preventDefault();
3373      activateSignal(sid, true);
3374      history.replaceState(null, '', '#' + sid);
3375    });
3376
3377    // Hover a signals row → preview track highlight
3378    signalsTable.addEventListener('mouseover', (ev) => {
3379      const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-sid]') : null;
3380      if (!row) return;
3381      const tid = row.getAttribute('data-track');
3382      setHoverTrack(tid);
3383    });
3384    signalsTable.addEventListener('mouseout', (ev) => {
3385      const to = ev.relatedTarget;
3386      if (to && signalsTable.contains(to)) return;
3387      setHoverTrack(null);
3388    });
3389  }
3390
3391  // Clicking an inline entity should also toggle active highlight.
3392  const pickPrimarySid = (el) => {
3393    if (!el) return null;
3394    const p = el.getAttribute('data-primary');
3395    if (p) return p;
3396    const raw = (el.getAttribute('data-sids') || '').trim();
3397    if (!raw) return null;
3398    const sids = raw.split(/\s+/).filter(Boolean);
3399    if (!sids.length) return null;
3400    // Prefer the shortest mention span from metadata.
3401    let best = sids[0];
3402    let bestLen = null;
3403    for (const sid of sids) {
3404      const meta = signalMeta.get(sid);
3405      const s = meta && meta.start !== null ? Number(meta.start) : null;
3406      const e = meta && meta.end !== null ? Number(meta.end) : null;
3407      const len = (s !== null && e !== null) ? (e - s) : null;
3408      if (len === null) continue;
3409      if (bestLen === null || len < bestLen) {
3410        best = sid;
3411        bestLen = len;
3412      }
3413    }
3414    return best;
3415  };
3416
3417  document.addEventListener('click', (ev) => {
3418    const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3419    if (span) {
3420      activateSignal(span.getAttribute('data-sid'), true);
3421      return;
3422    }
3423    const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3424    if (!seg) return;
3425    activateSignal(pickPrimarySid(seg), true);
3426  });
3427
3428  // Hover an inline entity → preview highlight its track
3429  document.addEventListener('mouseover', (ev) => {
3430    const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3431    if (span) {
3432      setHoverTrack(span.getAttribute('data-track'));
3433      return;
3434    }
3435    const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3436    if (!seg) return;
3437    const sid = pickPrimarySid(seg);
3438    const meta = sid ? signalMeta.get(sid) : null;
3439    setHoverTrack(meta ? meta.track : null);
3440  });
3441  document.addEventListener('mouseout', (ev) => {
3442    const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3443    const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3444    if (!span && !seg) return;
3445    const to = ev.relatedTarget;
3446    if (to && to.closest && (to.closest('span.e[data-sid]') || to.closest('span.seg[data-sids]'))) return;
3447    setHoverTrack(null);
3448  });
3449
3450  // Clicking a track row → select track (highlight + details)
3451  const tracksTable = document.getElementById('tracks-table');
3452  if (tracksTable) {
3453    tracksTable.addEventListener('click', (ev) => {
3454      const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-tid]') : null;
3455      if (!row) return;
3456      const tid = row.getAttribute('data-tid');
3457      setActiveTrack(tid);
3458      renderTrackSelection(tid);
3459    });
3460    tracksTable.addEventListener('mouseover', (ev) => {
3461      const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-tid]') : null;
3462      if (!row) return;
3463      setHoverTrack(row.getAttribute('data-tid'));
3464    });
3465    tracksTable.addEventListener('mouseout', (ev) => {
3466      const to = ev.relatedTarget;
3467      if (to && tracksTable.contains(to)) return;
3468      setHoverTrack(null);
3469    });
3470  }
3471
3472  // Filter
3473  const input = document.getElementById('signal-filter');
3474  const countEl = document.getElementById('signal-filter-count');
3475  if (input && signalsTable) {
3476    const update = () => {
3477      const q = (input.value || '').trim().toLowerCase();
3478      let shown = 0;
3479      const rows = signalsTable.querySelectorAll('tr[data-sid]');
3480      rows.forEach(row => {
3481        const sid = (row.getAttribute('data-sid') || '').toLowerCase();
3482        const label = (row.getAttribute('data-label') || '').toLowerCase();
3483        const surface = (row.getAttribute('data-surface') || '').toLowerCase();
3484        const ok = !q || sid.includes(q) || label.includes(q) || surface.includes(q);
3485        row.style.display = ok ? '' : 'none';
3486        if (ok) shown += 1;
3487      });
3488      if (countEl) countEl.textContent = shown + ' shown';
3489    };
3490    input.addEventListener('input', update);
3491    update();
3492  }
3493
3494  // Panel toggles
3495  document.querySelectorAll('[data-toggle]').forEach(btn => {
3496    btn.addEventListener('click', () => {
3497      const id = btn.getAttribute('data-toggle');
3498      const panel = id ? document.getElementById(id) : null;
3499      if (!panel) return;
3500      panel.classList.toggle('panel-collapsed');
3501    });
3502  });
3503
3504  // If URL hash is #S123, focus it.
3505  const hash = (location.hash || '').slice(1);
3506  if (hash && hash.startsWith('S')) activateSignal(hash, false);
3507
3508  // Optional: allow parent pages (e.g., dataset explorers) to sync selection across iframes.
3509  window.addEventListener('message', (ev) => {
3510    const data = ev && ev.data ? ev.data : null;
3511    if (!data || data.type !== 'anno:activate-span') return;
3512    if (typeof data.start !== 'number' || typeof data.end !== 'number') return;
3513    activateBySpan(data.start, data.end, false);
3514  });
3515})();
3516</script>"#);
3517
3518    html.push_str(r#"</body></html>"#);
3519    html
3520}
3521
/// Escape `s` for inclusion in HTML text content or a quoted attribute value.
///
/// The five characters `&`, `<`, `>`, `"` and `'` are replaced by `&amp;`,
/// `&lt;`, `&gt;`, `&quot;` and `&#39;` respectively; every other character is
/// copied through unchanged. Escaping the single quote as well keeps the output
/// safe inside attributes delimited by either quote style.
///
/// The input is scanned once and the output is built in a single buffer.
fn html_escape(s: &str) -> String {
    // Escaped output is never shorter than the input, so this is a lower bound.
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            '"' => out.push_str("&quot;"),
            '\'' => out.push_str("&#39;"),
            other => out.push(other),
        }
    }
    out
}
3528
3529fn annotate_text_html(
3530    text: &str,
3531    signals: &[Signal<Location>],
3532    signal_to_track: &std::collections::HashMap<SignalId, TrackId>,
3533) -> String {
3534    let char_count = text.chars().count();
3535    if char_count == 0 {
3536        return String::new();
3537    }
3538
3539    #[derive(Debug, Clone)]
3540    struct SigMeta {
3541        sid: String,
3542        label: String,
3543        conf: f32,
3544        track_id: Option<TrackId>,
3545        covered_len: usize,
3546    }
3547
3548    #[derive(Debug, Clone)]
3549    struct Event {
3550        pos: usize,
3551        meta_idx: usize,
3552        delta: i32, // -1 end, +1 start
3553    }
3554
3555    // Collect text segments for each signal (supports discontinuous spans).
3556    let mut metas: Vec<SigMeta> = Vec::new();
3557    let mut events: Vec<Event> = Vec::new();
3558    let mut boundaries: Vec<usize> = vec![0, char_count];
3559
3560    for s in signals {
3561        let raw_segments: Vec<(usize, usize)> = match &s.location {
3562            Location::Text { start, end } => vec![(*start, *end)],
3563            Location::TextWithBbox { start, end, .. } => vec![(*start, *end)],
3564            Location::Discontinuous { segments } => segments.clone(),
3565            _ => Vec::new(),
3566        };
3567        if raw_segments.is_empty() {
3568            continue;
3569        }
3570
3571        let mut cleaned: Vec<(usize, usize)> = Vec::new();
3572        let mut covered_len = 0usize;
3573        for (start, end) in raw_segments {
3574            let start = start.min(char_count);
3575            let end = end.min(char_count);
3576            if start >= end {
3577                continue;
3578            }
3579            covered_len = covered_len.saturating_add(end - start);
3580            cleaned.push((start, end));
3581        }
3582        if cleaned.is_empty() {
3583            continue;
3584        }
3585
3586        let meta_idx = metas.len();
3587        let track_id = signal_to_track.get(&s.id).copied();
3588        metas.push(SigMeta {
3589            sid: format!("S{}", s.id),
3590            label: s.label.to_string(),
3591            conf: s.confidence,
3592            track_id,
3593            covered_len,
3594        });
3595
3596        for (start, end) in cleaned {
3597            boundaries.push(start);
3598            boundaries.push(end);
3599            events.push(Event {
3600                pos: start,
3601                meta_idx,
3602                delta: 1,
3603            });
3604            events.push(Event {
3605                pos: end,
3606                meta_idx,
3607                delta: -1,
3608            });
3609        }
3610    }
3611
3612    if metas.is_empty() {
3613        return html_escape(text);
3614    }
3615
3616    boundaries.sort_unstable();
3617    boundaries.dedup();
3618    events.sort_by(|a, b| a.pos.cmp(&b.pos).then_with(|| a.delta.cmp(&b.delta)));
3619
3620    let mut active_counts: Vec<u32> = vec![0; metas.len()];
3621    let mut active: Vec<usize> = Vec::new();
3622    let mut ev_idx = 0usize;
3623
3624    let mut result = String::new();
3625
3626    for bi in 0..boundaries.len().saturating_sub(1) {
3627        let pos = boundaries[bi];
3628        // Apply all events at this boundary.
3629        while ev_idx < events.len() && events[ev_idx].pos == pos {
3630            let e = &events[ev_idx];
3631            let idx = e.meta_idx;
3632            if e.delta < 0 {
3633                if active_counts[idx] > 0 {
3634                    active_counts[idx] -= 1;
3635                    if active_counts[idx] == 0 {
3636                        active.retain(|&x| x != idx);
3637                    }
3638                }
3639            } else {
3640                active_counts[idx] += 1;
3641                if active_counts[idx] == 1 {
3642                    active.push(idx);
3643                }
3644            }
3645            ev_idx += 1;
3646        }
3647
3648        let next = boundaries[bi + 1];
3649        if next <= pos {
3650            continue;
3651        }
3652
3653        let seg_text: String = text.chars().skip(pos).take(next - pos).collect();
3654        if active.is_empty() {
3655            result.push_str(&html_escape(&seg_text));
3656            continue;
3657        }
3658
3659        // Determine primary (for coloring + click default): shortest covered len, then highest conf.
3660        let primary_idx = active
3661            .iter()
3662            .copied()
3663            .min_by(|a, b| {
3664                metas[*a]
3665                    .covered_len
3666                    .cmp(&metas[*b].covered_len)
3667                    .then_with(|| {
3668                        metas[*b]
3669                            .conf
3670                            .partial_cmp(&metas[*a].conf)
3671                            .unwrap_or(std::cmp::Ordering::Equal)
3672                    })
3673            })
3674            .unwrap_or(active[0]);
3675        let primary = &metas[primary_idx];
3676
3677        let class = match primary.label.to_uppercase().as_str() {
3678            "PER" | "PERSON" => "e-per",
3679            "ORG" | "ORGANIZATION" | "COMPANY" => "e-org",
3680            "LOC" | "LOCATION" | "GPE" => "e-loc",
3681            "DATE" | "TIME" => "e-date",
3682            _ => "e-misc",
3683        };
3684
3685        let mut sids: Vec<&str> = active.iter().map(|i| metas[*i].sid.as_str()).collect();
3686        sids.sort_unstable();
3687        let data_sids = sids.join(" ");
3688
3689        let mut title = format!(
3690            "sids=[{}] primary={} [{}..{})",
3691            data_sids, primary.sid, pos, next
3692        );
3693        if let Some(t) = primary.track_id {
3694            title.push_str(&format!(" track=T{}", t));
3695        }
3696
3697        result.push_str(&format!(
3698            r#"<span class="e seg {class}" data-sids="{sids}" data-start="{start}" data-end="{end}" data-primary="{primary}" title="{title}">{text}</span>"#,
3699            class = class,
3700            sids = html_escape(&data_sids),
3701            start = pos,
3702            end = next,
3703            primary = html_escape(&primary.sid),
3704            title = html_escape(&title),
3705            text = html_escape(&seg_text),
3706        ));
3707    }
3708
3709    result
3710}
3711
3712// =============================================================================
3713// Eval Comparison HTML Rendering
3714// =============================================================================
3715
3716/// Comparison between gold (ground truth) and predicted entities.
3717#[derive(Debug, Clone)]
3718pub struct EvalComparison {
3719    /// Document text
3720    pub text: String,
3721    /// Gold/ground truth signals
3722    pub gold: Vec<Signal<Location>>,
3723    /// Predicted signals
3724    pub predicted: Vec<Signal<Location>>,
3725    /// Match results
3726    pub matches: Vec<EvalMatch>,
3727}
3728
3729/// Result of matching a gold or predicted signal.
3730#[derive(Debug, Clone)]
3731pub enum EvalMatch {
3732    /// Exact match: gold and predicted align perfectly.
3733    Correct {
3734        /// Gold signal ID
3735        gold_id: SignalId,
3736        /// Predicted signal ID
3737        pred_id: SignalId,
3738    },
3739    /// Type mismatch: same span, different label.
3740    TypeMismatch {
3741        /// Gold signal ID
3742        gold_id: SignalId,
3743        /// Predicted signal ID
3744        pred_id: SignalId,
3745        /// Gold label
3746        gold_label: String,
3747        /// Predicted label
3748        pred_label: String,
3749    },
3750    /// Boundary error: overlapping but not exact span.
3751    BoundaryError {
3752        /// Gold signal ID
3753        gold_id: SignalId,
3754        /// Predicted signal ID
3755        pred_id: SignalId,
3756        /// Intersection over Union
3757        iou: f64,
3758    },
3759    /// False positive: predicted with no gold match.
3760    Spurious {
3761        /// Predicted signal ID
3762        pred_id: SignalId,
3763    },
3764    /// False negative: gold with no prediction.
3765    Missed {
3766        /// Gold signal ID
3767        gold_id: SignalId,
3768    },
3769}
3770
3771impl EvalComparison {
3772    /// Create a comparison from gold and predicted entities.
3773    ///
3774    /// # Example
3775    ///
3776    /// ```rust
3777    /// use anno_core::core::grounded::{EvalComparison};
3778    /// use anno_core::{Signal, Location};
3779    ///
3780    /// let text = "Marie Curie won the Nobel Prize.";
3781    /// let gold = vec![
3782    ///     Signal::new(0, Location::text(0, 11), "Marie Curie", "PER", 1.0),
3783    ///     Signal::new(1, Location::text(20, 31), "Nobel Prize", "AWARD", 1.0),
3784    /// ];
3785    /// let pred = vec![
3786    ///     Signal::new(0, Location::text(0, 11), "Marie Curie", "PER", 0.95),
3787    /// ];
3788    /// let cmp = EvalComparison::compare(text, gold, pred);
3789    /// assert_eq!(cmp.matches.len(), 2); // 1 correct, 1 missed
3790    /// ```
3791    #[must_use]
3792    pub fn compare(
3793        text: &str,
3794        gold: Vec<Signal<Location>>,
3795        predicted: Vec<Signal<Location>>,
3796    ) -> Self {
3797        let mut matches = Vec::new();
3798        let mut gold_matched = vec![false; gold.len()];
3799        let mut pred_matched = vec![false; predicted.len()];
3800
3801        // First pass: find exact matches and type mismatches
3802        for (pi, pred) in predicted.iter().enumerate() {
3803            let pred_offsets = match pred.location.text_offsets() {
3804                Some(o) => o,
3805                None => continue,
3806            };
3807
3808            for (gi, g) in gold.iter().enumerate() {
3809                if gold_matched[gi] {
3810                    continue;
3811                }
3812                let gold_offsets = match g.location.text_offsets() {
3813                    Some(o) => o,
3814                    None => continue,
3815                };
3816
3817                // Exact span match
3818                if pred_offsets == gold_offsets {
3819                    if pred.label == g.label {
3820                        matches.push(EvalMatch::Correct {
3821                            gold_id: g.id,
3822                            pred_id: pred.id,
3823                        });
3824                    } else {
3825                        matches.push(EvalMatch::TypeMismatch {
3826                            gold_id: g.id,
3827                            pred_id: pred.id,
3828                            gold_label: g.label.to_string(),
3829                            pred_label: pred.label.to_string(),
3830                        });
3831                    }
3832                    gold_matched[gi] = true;
3833                    pred_matched[pi] = true;
3834                    break;
3835                }
3836            }
3837        }
3838
3839        // Second pass: find boundary errors (overlapping but not exact)
3840        for (pi, pred) in predicted.iter().enumerate() {
3841            if pred_matched[pi] {
3842                continue;
3843            }
3844            let pred_offsets = match pred.location.text_offsets() {
3845                Some(o) => o,
3846                None => continue,
3847            };
3848
3849            for (gi, g) in gold.iter().enumerate() {
3850                if gold_matched[gi] {
3851                    continue;
3852                }
3853                let gold_offsets = match g.location.text_offsets() {
3854                    Some(o) => o,
3855                    None => continue,
3856                };
3857
3858                // Check overlap
3859                if pred_offsets.0 < gold_offsets.1 && pred_offsets.1 > gold_offsets.0 {
3860                    let iou = pred.location.iou(&g.location).unwrap_or(0.0);
3861                    matches.push(EvalMatch::BoundaryError {
3862                        gold_id: g.id,
3863                        pred_id: pred.id,
3864                        iou,
3865                    });
3866                    gold_matched[gi] = true;
3867                    pred_matched[pi] = true;
3868                    break;
3869                }
3870            }
3871        }
3872
3873        // Remaining unmatched predictions are spurious
3874        for (pi, pred) in predicted.iter().enumerate() {
3875            if !pred_matched[pi] {
3876                matches.push(EvalMatch::Spurious { pred_id: pred.id });
3877            }
3878        }
3879
3880        // Remaining unmatched gold are missed
3881        for (gi, g) in gold.iter().enumerate() {
3882            if !gold_matched[gi] {
3883                matches.push(EvalMatch::Missed { gold_id: g.id });
3884            }
3885        }
3886
3887        Self {
3888            text: text.to_string(),
3889            gold,
3890            predicted,
3891            matches,
3892        }
3893    }
3894
3895    /// Count correct matches.
3896    #[must_use]
3897    pub fn correct_count(&self) -> usize {
3898        self.matches
3899            .iter()
3900            .filter(|m| matches!(m, EvalMatch::Correct { .. }))
3901            .count()
3902    }
3903
3904    /// Count errors (type mismatch + boundary + spurious + missed).
3905    #[must_use]
3906    pub fn error_count(&self) -> usize {
3907        self.matches.len() - self.correct_count()
3908    }
3909
3910    /// Calculate precision.
3911    #[must_use]
3912    pub fn precision(&self) -> f64 {
3913        if self.predicted.is_empty() {
3914            0.0
3915        } else {
3916            self.correct_count() as f64 / self.predicted.len() as f64
3917        }
3918    }
3919
3920    /// Calculate recall.
3921    #[must_use]
3922    pub fn recall(&self) -> f64 {
3923        if self.gold.is_empty() {
3924            0.0
3925        } else {
3926            self.correct_count() as f64 / self.gold.len() as f64
3927        }
3928    }
3929
3930    /// Calculate F1.
3931    #[must_use]
3932    pub fn f1(&self) -> f64 {
3933        let p = self.precision();
3934        let r = self.recall();
3935        if p + r > 0.0 {
3936            2.0 * p * r / (p + r)
3937        } else {
3938            0.0
3939        }
3940    }
3941}
3942
3943/// Render an eval comparison as HTML.
3944///
3945/// Shows gold vs predicted side by side with error highlighting.
3946pub fn render_eval_html(cmp: &EvalComparison) -> String {
3947    render_eval_html_with_title(cmp, "eval comparison")
3948}
3949
3950/// Render an eval comparison as HTML, with a custom title.
3951///
3952/// The title is used for both the page `<title>` and the top `<h1>`.
3953#[must_use]
3954pub fn render_eval_html_with_title(cmp: &EvalComparison, title: &str) -> String {
3955    let mut html = String::new();
3956    let title = html_escape(title);
3957
3958    html.push_str(
3959        r#"<!DOCTYPE html>
3960<html>
3961<head>
3962<meta charset="UTF-8">
3963<meta name="color-scheme" content="dark light">
3964"#,
3965    );
3966    html.push_str(&format!("<title>{}</title>", title));
3967    html.push_str(r#"
3968:root{
3969  color-scheme: light dark;
3970  --bg:#0a0a0a;
3971  --panel-bg:#0d0d0d;
3972  --text:#b0b0b0;
3973  --text-strong:#fff;
3974  --muted:#666;
3975  --border:#222;
3976  --border-strong:#333;
3977  --hover:#111;
3978  --input-bg:#080808;
3979  --active:#ddd;
3980  /* Eval entity colors (dark) */
3981  --gold-bg:#1a2e1a; --gold-br:#4a8a4a; --gold-tx:#88cc88;
3982  --pred-bg:#1a1a2e; --pred-br:#4a4a8a; --pred-tx:#8888cc;
3983  /* Match row borders */
3984  --m-ok:#4a8a4a;
3985  --m-type:#8a8a4a;
3986  --m-bound:#4a8a8a;
3987  --m-fp:#8a4a4a;
3988  --m-fn:#8a4a8a;
3989}
3990@media (prefers-color-scheme: light){
3991  :root{
3992    --bg:#ffffff;
3993    --panel-bg:#f7f7f7;
3994    --text:#222;
3995    --text-strong:#000;
3996    --muted:#555;
3997    --border:#d6d6d6;
3998    --border-strong:#c6c6c6;
3999    --hover:#f0f0f0;
4000    --input-bg:#ffffff;
4001    --active:#000;
4002    --gold-bg:#e9f7e9; --gold-br:#2f8a2f; --gold-tx:#1f5a1f;
4003    --pred-bg:#e9e9ff; --pred-br:#6c6cff; --pred-tx:#2b2b7a;
4004    --m-ok:#2f8a2f;
4005    --m-type:#8a7a2f;
4006    --m-bound:#2f7a8a;
4007    --m-fp:#8a2f2f;
4008    --m-fn:#6a2f8a;
4009  }
4010}
4011html[data-theme='dark']{
4012  --bg:#0a0a0a; --panel-bg:#0d0d0d; --text:#b0b0b0; --text-strong:#fff;
4013  --muted:#666; --border:#222; --border-strong:#333; --hover:#111; --input-bg:#080808; --active:#ddd;
4014  --gold-bg:#1a2e1a; --gold-br:#4a8a4a; --gold-tx:#88cc88;
4015  --pred-bg:#1a1a2e; --pred-br:#4a4a8a; --pred-tx:#8888cc;
4016  --m-ok:#4a8a4a; --m-type:#8a8a4a; --m-bound:#4a8a8a; --m-fp:#8a4a4a; --m-fn:#8a4a8a;
4017}
4018html[data-theme='light']{
4019  --bg:#ffffff; --panel-bg:#f7f7f7; --text:#222; --text-strong:#000;
4020  --muted:#555; --border:#d6d6d6; --border-strong:#c6c6c6; --hover:#f0f0f0; --input-bg:#ffffff; --active:#000;
4021  --gold-bg:#e9f7e9; --gold-br:#2f8a2f; --gold-tx:#1f5a1f;
4022  --pred-bg:#e9e9ff; --pred-br:#6c6cff; --pred-tx:#2b2b7a;
4023  --m-ok:#2f8a2f; --m-type:#8a7a2f; --m-bound:#2f7a8a; --m-fp:#8a2f2f; --m-fn:#6a2f8a;
4024}
4025
4026<style>
4027*{box-sizing:border-box;margin:0;padding:0}
4028body{font:12px/1.4 monospace;background:var(--bg);color:var(--text);padding:8px}
4029h1,h2{color:var(--text-strong);font-weight:normal;border-bottom:1px solid var(--border-strong);padding:4px 0;margin:16px 0 8px}
4030h1{font-size:14px}h2{font-size:12px}
4031table{width:100%;border-collapse:collapse;font-size:11px;margin:4px 0}
4032th,td{padding:4px 8px;text-align:left;border:1px solid var(--border)}
4033th{background:var(--hover);color:var(--muted);font-weight:normal;text-transform:uppercase;font-size:10px}
4034tr:hover{background:var(--hover)}
4035.grid{display:grid;grid-template-columns:1fr 1fr;gap:8px}
4036.panel{border:1px solid var(--border);background:var(--panel-bg);padding:8px}
4037.text-box{background:var(--input-bg);border:1px solid var(--border);padding:8px;white-space:pre-wrap;word-break:break-word;line-height:1.6}
4038.stats{display:flex;gap:24px;padding:8px 0;border-bottom:1px solid var(--border);margin-bottom:8px}
4039.stat{text-align:center}.stat-v{font-size:18px;color:var(--text-strong)}.stat-l{font-size:9px;color:var(--muted);text-transform:uppercase}
4040/* Entities */
4041.e{padding:1px 2px;border-bottom:2px solid}
4042.seg{cursor:pointer}
4043.e-gold{background:var(--gold-bg);border-color:var(--gold-br);color:var(--gold-tx)}
4044.e-pred{background:var(--pred-bg);border-color:var(--pred-br);color:var(--pred-tx)}
4045.e-active{outline:1px solid var(--active);outline-offset:1px}
4046/* Match types */
4047.correct{background:#1a2e1a;border-color:#4a8a4a}
4048.type-err{background:#2e2e1a;border-color:#8a8a4a}
4049.boundary{background:#1a2e2e;border-color:#4a8a8a}
4050.spurious{background:#2e1a1a;border-color:#8a4a4a}
4051.missed{background:#2e1a2e;border-color:#8a4a8a}
4052.match-row.correct{border-left:3px solid var(--m-ok)}
4053.match-row.type-err{border-left:3px solid var(--m-type)}
4054.match-row.boundary{border-left:3px solid var(--m-bound)}
4055.match-row.spurious{border-left:3px solid var(--m-fp)}
4056.match-row.missed{border-left:3px solid var(--m-fn)}
4057.match-row.active{outline:1px solid var(--muted)}
4058.sel{color:var(--muted);margin:6px 0 12px}
4059.metric{font-size:14px;color:var(--muted)}.metric b{color:var(--text-strong)}
4060</style>
4061</head>
4062<body>
4063"#);
4064
4065    // Header (with theme toggle)
4066    html.push_str(&format!(
4067        "<div class=\"panel-h\" style=\"justify-content:space-between\"><h1>{}</h1><span class=\"toggle\" id=\"theme-toggle\" title=\"toggle theme (auto → dark → light)\">theme: auto</span></div>",
4068        title
4069    ));
4070
4071    // Metrics bar
4072    html.push_str("<div class=\"stats\">");
4073    html.push_str(&format!(
4074        "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">gold</div></div>",
4075        cmp.gold.len()
4076    ));
4077    html.push_str(&format!(
4078        "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">predicted</div></div>",
4079        cmp.predicted.len()
4080    ));
4081    html.push_str(&format!(
4082        "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">correct</div></div>",
4083        cmp.correct_count()
4084    ));
4085    html.push_str(&format!(
4086        "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">errors</div></div>",
4087        cmp.error_count()
4088    ));
4089    html.push_str(&format!(
4090        "<div class=\"metric\">P=<b>{:.1}%</b> R=<b>{:.1}%</b> F1=<b>{:.1}%</b></div>",
4091        cmp.precision() * 100.0,
4092        cmp.recall() * 100.0,
4093        cmp.f1() * 100.0
4094    ));
4095    html.push_str("</div>");
4096
4097    // Simple selection readout (helps debugging + browser-based verification)
4098    html.push_str("<div id=\"selection\" class=\"sel\">click a match row to select spans</div>");
4099
4100    // Side-by-side text
4101    html.push_str("<div class=\"grid\">");
4102
4103    // Gold panel
4104    html.push_str("<div class=\"panel\"><h2>gold (ground truth)</h2><div class=\"text-box\">");
4105    let gold_spans: Vec<EvalHtmlSpan> = cmp
4106        .gold
4107        .iter()
4108        .map(|s| {
4109            let (start, end) = s.location.text_offsets().unwrap_or((0, 0));
4110            EvalHtmlSpan {
4111                start,
4112                end,
4113                label: s.label.to_string(),
4114                class: "e-gold",
4115                id: format!("G{}", s.id),
4116            }
4117        })
4118        .collect();
4119    html.push_str(&annotate_text_spans(&cmp.text, &gold_spans));
4120    html.push_str("</div></div>");
4121
4122    // Predicted panel
4123    html.push_str("<div class=\"panel\"><h2>predicted</h2><div class=\"text-box\">");
4124    let pred_spans: Vec<EvalHtmlSpan> = cmp
4125        .predicted
4126        .iter()
4127        .map(|s| {
4128            let (start, end) = s.location.text_offsets().unwrap_or((0, 0));
4129            EvalHtmlSpan {
4130                start,
4131                end,
4132                label: s.label.to_string(),
4133                class: "e-pred",
4134                id: format!("P{}", s.id),
4135            }
4136        })
4137        .collect();
4138    html.push_str(&annotate_text_spans(&cmp.text, &pred_spans));
4139    html.push_str("</div></div>");
4140
4141    html.push_str("</div>");
4142
4143    // Match table
4144    html.push_str("<h2>matches</h2><table>");
4145    html.push_str("<tr><th>type</th><th>gold</th><th>predicted</th><th>notes</th></tr>");
4146
4147    for (mi, m) in cmp.matches.iter().enumerate() {
4148        let (class, mtype, gold_text, pred_text, notes, gid, pid) = match m {
4149            EvalMatch::Correct { gold_id, pred_id } => {
4150                let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4151                let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4152                (
4153                    "correct",
4154                    "✓",
4155                    g.map(|s| format!("[{}] {}", s.label, s.surface()))
4156                        .unwrap_or_default(),
4157                    p.map(|s| format!("[{}] {}", s.label, s.surface()))
4158                        .unwrap_or_default(),
4159                    String::new(),
4160                    Some(format!("G{}", gold_id)),
4161                    Some(format!("P{}", pred_id)),
4162                )
4163            }
4164            EvalMatch::TypeMismatch {
4165                gold_id,
4166                pred_id,
4167                gold_label,
4168                pred_label,
4169            } => {
4170                let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4171                let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4172                (
4173                    "type-err",
4174                    "type",
4175                    g.map(|s| format!("[{}] {}", s.label, s.surface()))
4176                        .unwrap_or_default(),
4177                    p.map(|s| format!("[{}] {}", s.label, s.surface()))
4178                        .unwrap_or_default(),
4179                    format!("{} → {}", gold_label, pred_label),
4180                    Some(format!("G{}", gold_id)),
4181                    Some(format!("P{}", pred_id)),
4182                )
4183            }
4184            EvalMatch::BoundaryError {
4185                gold_id,
4186                pred_id,
4187                iou,
4188            } => {
4189                let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4190                let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4191                (
4192                    "boundary",
4193                    "bound",
4194                    g.map(|s| format!("[{}] \"{}\"", s.label, s.surface()))
4195                        .unwrap_or_default(),
4196                    p.map(|s| format!("[{}] \"{}\"", s.label, s.surface()))
4197                        .unwrap_or_default(),
4198                    format!("IoU={:.2}", iou),
4199                    Some(format!("G{}", gold_id)),
4200                    Some(format!("P{}", pred_id)),
4201                )
4202            }
4203            EvalMatch::Spurious { pred_id } => {
4204                let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4205                (
4206                    "spurious",
4207                    "FP",
4208                    String::new(),
4209                    p.map(|s| format!("[{}] {}", s.label, s.surface()))
4210                        .unwrap_or_default(),
4211                    "false positive".to_string(),
4212                    None,
4213                    Some(format!("P{}", pred_id)),
4214                )
4215            }
4216            EvalMatch::Missed { gold_id } => {
4217                let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4218                (
4219                    "missed",
4220                    "FN",
4221                    g.map(|s| format!("[{}] {}", s.label, s.surface()))
4222                        .unwrap_or_default(),
4223                    String::new(),
4224                    "false negative".to_string(),
4225                    Some(format!("G{}", gold_id)),
4226                    None,
4227                )
4228            }
4229        };
4230
4231        let mut data_attrs = String::new();
4232        if let Some(gid) = gid.as_deref() {
4233            data_attrs.push_str(&format!(" data-gid=\"{}\"", html_escape(gid)));
4234        }
4235        if let Some(pid) = pid.as_deref() {
4236            data_attrs.push_str(&format!(" data-pid=\"{}\"", html_escape(pid)));
4237        }
4238
4239        html.push_str(&format!(
4240            "<tr id=\"M{mid}\" class=\"match-row {class}\"{attrs}><td><a class=\"match-link\" href=\"#M{mid}\">{mtype}</a></td><td>{gold}</td><td>{pred}</td><td>{notes}</td></tr>",
4241            mid = mi,
4242            class = class,
4243            attrs = data_attrs,
4244            mtype = html_escape(mtype),
4245            gold = html_escape(&gold_text),
4246            pred = html_escape(&pred_text),
4247            notes = html_escape(&notes)
4248        ));
4249    }
4250    html.push_str("</table>");
4251
4252    html.push_str(
4253        r#"<script>
4254(() => {
4255  // Theme toggle: auto (prefers-color-scheme) → dark → light.
4256  const themeBtn = document.getElementById('theme-toggle');
4257  const themeKey = 'anno-theme';
4258  const applyTheme = (theme) => {
4259    const t = theme || 'auto';
4260    if (t === 'auto') {
4261      delete document.documentElement.dataset.theme;
4262    } else {
4263      document.documentElement.dataset.theme = t;
4264    }
4265    if (themeBtn) themeBtn.textContent = `theme: ${t}`;
4266  };
4267  const readTheme = () => {
4268    try { return localStorage.getItem(themeKey) || 'auto'; } catch (_) { return 'auto'; }
4269  };
4270  const writeTheme = (t) => {
4271    try { localStorage.setItem(themeKey, t); } catch (_) { /* ignore */ }
4272  };
4273  applyTheme(readTheme());
4274  if (themeBtn) {
4275    themeBtn.addEventListener('click', () => {
4276      const cur = readTheme();
4277      const next = cur === 'auto' ? 'dark' : (cur === 'dark' ? 'light' : 'auto');
4278      writeTheme(next);
4279      applyTheme(next);
4280    });
4281  }
4282
4283  function clearActive() {
4284    document.querySelectorAll(".e-active").forEach((el) => el.classList.remove("e-active"));
4285    document.querySelectorAll("tr.match-row.active").forEach((el) => el.classList.remove("active"));
4286  }
4287
4288  function findSpanEls(eid) {
4289    if (!eid) return [];
4290    // New segmented renderer: one span can be split across multiple elements.
4291    const els = Array.from(document.querySelectorAll(`span.e[data-eids~='${eid}']`));
4292    if (els.length) return els;
4293    // Back-compat: older HTML used a single element id.
4294    const single = document.getElementById(eid);
4295    return single ? [single] : [];
4296  }
4297
4298  function activate(gid, pid, row) {
4299    clearActive();
4300    const gEls = findSpanEls(gid);
4301    const pEls = findSpanEls(pid);
4302    const sel = document.getElementById("selection");
4303    gEls.forEach((el) => el.classList.add("e-active"));
4304    pEls.forEach((el) => el.classList.add("e-active"));
4305    if (row) row.classList.add("active");
4306    if (sel) {
4307      const parts = [];
4308      if (gEls.length) {
4309        const lbl = gEls[0].dataset && gEls[0].dataset.label ? ` [${gEls[0].dataset.label}]` : "";
4310        parts.push(`gold ${gid}${lbl}`);
4311      }
4312      if (pEls.length) {
4313        const lbl = pEls[0].dataset && pEls[0].dataset.label ? ` [${pEls[0].dataset.label}]` : "";
4314        parts.push(`pred ${pid}${lbl}`);
4315      }
4316      sel.textContent = parts.length ? parts.join("  |  ") : "no selection";
4317    }
4318    if (row && row.id) {
4319      // Keep deep links stable without triggering navigation jump.
4320      // NOTE: single quotes avoid the Rust raw-string delimiter issue with quote+hash.
4321      history.replaceState(null, "", '#' + row.id);
4322    }
4323    const target = gEls[0] || pEls[0];
4324    if (target) target.scrollIntoView({ behavior: "smooth", block: "center" });
4325  }
4326
4327  document.querySelectorAll("tr.match-row[data-gid], tr.match-row[data-pid]").forEach((tr) => {
4328    tr.addEventListener("click", () => activate(tr.dataset.gid, tr.dataset.pid, tr));
4329  });
4330
4331  document.querySelectorAll("a.match-link").forEach((a) => {
4332    a.addEventListener("click", (ev) => {
4333      ev.preventDefault();
4334      const tr = a.closest("tr.match-row");
4335      if (!tr) return;
4336      activate(tr.dataset.gid, tr.dataset.pid, tr);
4337    });
4338  });
4339
4340  // Auto-select a match row if the URL has a deep link (e.g. #M12).
4341  const hash = (location.hash || "").slice(1);
4342  if (hash && hash.startsWith("M")) {
4343    const tr = document.getElementById(hash);
4344    if (tr && tr.classList && tr.classList.contains("match-row")) {
4345      activate(tr.dataset.gid, tr.dataset.pid, tr);
4346    }
4347  }
4348})();
4349</script>"#,
4350    );
4351
4352    html.push_str("</body></html>");
4353    html
4354}
4355
4356/// Annotate text with multiple labeled spans.
4357#[derive(Debug, Clone)]
4358struct EvalHtmlSpan {
4359    start: usize,
4360    end: usize,
4361    label: String,
4362    class: &'static str,
4363    id: String,
4364}
4365
4366fn annotate_text_spans(text: &str, spans: &[EvalHtmlSpan]) -> String {
4367    let char_count = text.chars().count();
4368    if char_count == 0 || spans.is_empty() {
4369        return html_escape(text);
4370    }
4371
4372    #[derive(Debug, Clone)]
4373    struct Meta {
4374        id: String,
4375        label: String,
4376        class: &'static str,
4377        len: usize,
4378    }
4379    #[derive(Debug, Clone)]
4380    struct Event {
4381        pos: usize,
4382        meta_idx: usize,
4383        delta: i32,
4384    }
4385
4386    let mut metas: Vec<Meta> = Vec::with_capacity(spans.len());
4387    let mut events: Vec<Event> = Vec::new();
4388    let mut boundaries: Vec<usize> = vec![0, char_count];
4389
4390    for s in spans {
4391        let start = s.start.min(char_count);
4392        let end = s.end.min(char_count);
4393        if start >= end {
4394            continue;
4395        }
4396        let meta_idx = metas.len();
4397        metas.push(Meta {
4398            id: s.id.clone(),
4399            label: s.label.to_string(),
4400            class: s.class,
4401            len: end - start,
4402        });
4403        boundaries.push(start);
4404        boundaries.push(end);
4405        events.push(Event {
4406            pos: start,
4407            meta_idx,
4408            delta: 1,
4409        });
4410        events.push(Event {
4411            pos: end,
4412            meta_idx,
4413            delta: -1,
4414        });
4415    }
4416
4417    if metas.is_empty() {
4418        return html_escape(text);
4419    }
4420
4421    boundaries.sort_unstable();
4422    boundaries.dedup();
4423    events.sort_by(|a, b| a.pos.cmp(&b.pos).then_with(|| a.delta.cmp(&b.delta)));
4424
4425    let mut active_counts: Vec<u32> = vec![0; metas.len()];
4426    let mut active: Vec<usize> = Vec::new();
4427    let mut ev_idx = 0usize;
4428    let mut result = String::new();
4429
4430    for bi in 0..boundaries.len().saturating_sub(1) {
4431        let pos = boundaries[bi];
4432        while ev_idx < events.len() && events[ev_idx].pos == pos {
4433            let e = &events[ev_idx];
4434            let idx = e.meta_idx;
4435            if e.delta < 0 {
4436                if active_counts[idx] > 0 {
4437                    active_counts[idx] -= 1;
4438                    if active_counts[idx] == 0 {
4439                        active.retain(|&x| x != idx);
4440                    }
4441                }
4442            } else {
4443                active_counts[idx] += 1;
4444                if active_counts[idx] == 1 {
4445                    active.push(idx);
4446                }
4447            }
4448            ev_idx += 1;
4449        }
4450
4451        let next = boundaries[bi + 1];
4452        if next <= pos {
4453            continue;
4454        }
4455
4456        let seg_text: String = text.chars().skip(pos).take(next - pos).collect();
4457        if active.is_empty() {
4458            result.push_str(&html_escape(&seg_text));
4459            continue;
4460        }
4461
4462        let primary_idx = active
4463            .iter()
4464            .copied()
4465            .min_by_key(|i| metas[*i].len)
4466            .unwrap_or(active[0]);
4467        let primary = &metas[primary_idx];
4468        let mut eids: Vec<&str> = active.iter().map(|i| metas[*i].id.as_str()).collect();
4469        eids.sort_unstable();
4470        let data_eids = eids.join(" ");
4471
4472        let title = format!(
4473            "eids=[{}] primary={} [{}..{})",
4474            data_eids, primary.id, pos, next
4475        );
4476        result.push_str(&format!(
4477            "<span class=\"e seg {class}\" data-eids=\"{eids}\" data-label=\"{label}\" data-start=\"{start}\" data-end=\"{end}\" title=\"{title}\">{text}</span>",
4478            class = primary.class,
4479            eids = html_escape(&data_eids),
4480            label = html_escape(&primary.label),
4481            start = pos,
4482            end = next,
4483            title = html_escape(&title),
4484            text = html_escape(&seg_text)
4485        ));
4486    }
4487
4488    result
4489}
4490
4491// =============================================================================
4492// URL/Text Input Processing
4493// =============================================================================
4494
/// Settings applied when processing arbitrary input.
///
/// The derived `Default` gives an empty label list and a threshold of `0.0`.
#[derive(Clone, Debug, Default)]
pub struct ProcessOptions {
    /// Labels to extract; an empty list means all labels.
    pub labels: Vec<String>,
    /// Confidence threshold.
    pub threshold: f32,
}
4503
4504/// Result of processing input.
4505#[derive(Debug)]
4506pub struct ProcessResult {
4507    /// The document with signals
4508    pub document: GroundedDocument,
4509    /// Whether validation passed
4510    pub valid: bool,
4511    /// Any validation errors
4512    pub errors: Vec<SignalValidationError>,
4513}
4514
4515impl ProcessResult {
4516    /// Render as HTML.
4517    #[must_use]
4518    pub fn to_html(&self) -> String {
4519        render_document_html(&self.document)
4520    }
4521}
4522
4523// =============================================================================
4524// Corpus: Multi-Document Operations
4525// =============================================================================
4526
4527/// A corpus of grounded documents for cross-document operations.
4528///
4529/// Enables inter-document coreference resolution and entity linking
4530/// across multiple documents.
4531#[derive(Debug, Clone)]
4532pub struct Corpus {
4533    documents: std::collections::HashMap<String, GroundedDocument>,
4534    identities: std::collections::HashMap<IdentityId, Identity>,
4535    next_identity_id: IdentityId,
4536}
4537
4538impl Corpus {
4539    /// Create a new empty corpus.
4540    #[must_use]
4541    pub fn new() -> Self {
4542        Self {
4543            documents: std::collections::HashMap::new(),
4544            identities: std::collections::HashMap::new(),
4545            next_identity_id: IdentityId::ZERO,
4546        }
4547    }
4548
4549    /// Get all identities in the corpus.
4550    #[must_use]
4551    pub fn identities(&self) -> &std::collections::HashMap<IdentityId, Identity> {
4552        &self.identities
4553    }
4554
4555    /// Get an identity by ID.
4556    #[must_use]
4557    pub fn get_identity(&self, id: IdentityId) -> Option<&Identity> {
4558        self.identities.get(&id)
4559    }
4560
4561    /// Add an identity to the corpus and return its ID.
4562    ///
4563    /// This method assigns the next available identity ID and inserts the identity.
4564    /// Used by coalescing operations to create cross-document identities.
4565    pub fn add_identity(&mut self, mut identity: Identity) -> IdentityId {
4566        let id = self.next_identity_id;
4567        identity.id = id;
4568        self.identities.insert(id, identity);
4569        self.next_identity_id += 1;
4570        id
4571    }
4572
4573    /// Get the next identity ID that would be assigned.
4574    ///
4575    /// This is used by coalescing operations to reserve identity IDs.
4576    #[must_use]
4577    pub fn next_identity_id(&self) -> IdentityId {
4578        self.next_identity_id
4579    }
4580
4581    /// Get all documents in the corpus.
4582    ///
4583    /// Returns an iterator over all documents.
4584    pub fn documents(&self) -> impl Iterator<Item = &GroundedDocument> {
4585        self.documents.values()
4586    }
4587
4588    /// Get a document by ID.
4589    ///
4590    /// Returns `None` if the document doesn't exist.
4591    #[must_use]
4592    pub fn get_document(&self, doc_id: &str) -> Option<&GroundedDocument> {
4593        self.documents.get(doc_id)
4594    }
4595
4596    /// Get a mutable reference to a document by ID.
4597    ///
4598    /// Returns `None` if the document doesn't exist.
4599    pub fn get_document_mut(&mut self, doc_id: &str) -> Option<&mut GroundedDocument> {
4600        self.documents.get_mut(doc_id)
4601    }
4602
4603    /// Add a document to the corpus.
4604    ///
4605    /// If a document with the same ID already exists, it will be replaced.
4606    /// Returns the document ID.
4607    pub fn add_document(&mut self, document: GroundedDocument) -> String {
4608        let doc_id = document.id.clone();
4609        self.documents.insert(doc_id.clone(), document);
4610        doc_id
4611    }
4612
4613    /// Link a track to a knowledge base entity.
4614    ///
4615    /// This is the entity linking (NED) operation. It creates or updates
4616    /// an identity with KB information.
4617    ///
4618    /// # Parameters
4619    ///
4620    /// * `track_ref` - Reference to the track to link
4621    /// * `kb_name` - Knowledge base name (e.g., "wikidata")
4622    /// * `kb_id` - Knowledge base entity ID (e.g., "Q7186")
4623    /// * `canonical_name` - Canonical name from KB
4624    ///
4625    /// # Returns
4626    ///
4627    /// The identity ID (new or existing), or an error if the track reference is invalid.
4628    ///
4629    /// # Errors
4630    ///
4631    /// Returns `Error::TrackRef` if:
4632    /// - The document ID doesn't exist in the corpus
4633    /// - The track ID doesn't exist in the document
4634    pub fn link_track_to_kb(
4635        &mut self,
4636        track_ref: &TrackRef,
4637        kb_name: impl Into<String>,
4638        kb_id: impl Into<String>,
4639        canonical_name: impl Into<String>,
4640    ) -> super::Result<IdentityId> {
4641        use super::error::Error;
4642
4643        let doc = self.documents.get_mut(&track_ref.doc_id).ok_or_else(|| {
4644            Error::track_ref(format!(
4645                "Document '{}' not found in corpus",
4646                track_ref.doc_id
4647            ))
4648        })?;
4649        let track = doc.get_track(track_ref.track_id).ok_or_else(|| {
4650            Error::track_ref(format!(
4651                "Track {} not found in document '{}'",
4652                track_ref.track_id, track_ref.doc_id
4653            ))
4654        })?;
4655
4656        let kb_name_str = kb_name.into();
4657        let kb_id_str = kb_id.into();
4658        let canonical_name_str = canonical_name.into();
4659
4660        // Check if track already has an identity
4661        let identity_id = if let Some(existing_id) = track.identity_id {
4662            // Update existing identity with KB info if it exists in corpus
4663            if let Some(identity) = self.identities.get_mut(&existing_id) {
4664                identity.kb_id = Some(kb_id_str.clone());
4665                identity.kb_name = Some(kb_name_str.clone());
4666                identity.canonical_name = canonical_name_str.clone();
4667
4668                // Update source
4669                identity.source = Some(match identity.source.take() {
4670                    Some(IdentitySource::CrossDocCoref { track_refs }) => IdentitySource::Hybrid {
4671                        track_refs,
4672                        kb_name: kb_name_str.clone(),
4673                        kb_id: kb_id_str.clone(),
4674                    },
4675                    _ => IdentitySource::KnowledgeBase {
4676                        kb_name: kb_name_str.clone(),
4677                        kb_id: kb_id_str.clone(),
4678                    },
4679                });
4680
4681                existing_id
4682            } else {
4683                // Identity ID exists in document but not in corpus - this is inconsistent.
4684                // This can happen if:
4685                // 1. Document was added to corpus with pre-existing identities
4686                // 2. Identity was removed from corpus but document still references it
4687                //
4688                // Fix: Create new identity and update ALL references in the document
4689                // to ensure consistency between document and corpus state.
4690                let new_id = self.next_identity_id;
4691                self.next_identity_id += 1;
4692
4693                let identity = Identity {
4694                    id: new_id,
4695                    canonical_name: canonical_name_str,
4696                    entity_type: track.entity_type.clone(),
4697                    kb_id: Some(kb_id_str.clone()),
4698                    kb_name: Some(kb_name_str.clone()),
4699                    description: None,
4700                    embedding: track.embedding.clone(),
4701                    aliases: Vec::new(),
4702                    confidence: track.cluster_confidence,
4703                    source: Some(IdentitySource::KnowledgeBase {
4704                        kb_name: kb_name_str,
4705                        kb_id: kb_id_str,
4706                    }),
4707                };
4708
4709                self.identities.insert(new_id, identity);
4710                // Update the track's identity reference to point to the new identity
4711                // This ensures document and corpus are consistent
4712                doc.link_track_to_identity(track_ref.track_id, new_id);
4713                new_id
4714            }
4715        } else {
4716            // Create new identity
4717            let new_id = self.next_identity_id;
4718            self.next_identity_id += 1;
4719
4720            let identity = Identity {
4721                id: new_id,
4722                canonical_name: canonical_name_str,
4723                entity_type: track.entity_type.clone(),
4724                kb_id: Some(kb_id_str.clone()),
4725                kb_name: Some(kb_name_str.clone()),
4726                description: None,
4727                embedding: track.embedding.clone(),
4728                aliases: Vec::new(),
4729                confidence: track.cluster_confidence,
4730                source: Some(IdentitySource::KnowledgeBase {
4731                    kb_name: kb_name_str,
4732                    kb_id: kb_id_str,
4733                }),
4734            };
4735
4736            self.identities.insert(new_id, identity);
4737            doc.link_track_to_identity(track_ref.track_id, new_id);
4738            new_id
4739        };
4740
4741        Ok(identity_id)
4742    }
4743}
4744
4745impl Default for Corpus {
4746    fn default() -> Self {
4747        Self::new()
4748    }
4749}
4750
4751#[cfg(test)]
4752mod tests {
4753    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in test code
4754    use super::*;
4755    use crate::EntityCategory;
4756
4757    #[test]
4758    fn test_render_eval_html_has_interactive_hooks_and_is_unicode_safe() {
4759        // CJK example (multi-byte, no spaces)
4760        let text = "習近平在北京會見了普京。";
4761
4762        let gold: Vec<Signal<Location>> = vec![
4763            Signal::new(SignalId::new(0), Location::text(0, 3), "習近平", "PER", 1.0),
4764            Signal::new(SignalId::new(1), Location::text(4, 6), "北京", "LOC", 1.0),
4765        ];
4766
4767        // Intentionally introduce a type mismatch on 北京 to ensure a non-correct row exists.
4768        let predicted: Vec<Signal<Location>> = vec![
4769            Signal::new(SignalId::new(0), Location::text(0, 3), "習近平", "PER", 0.9),
4770            Signal::new(SignalId::new(1), Location::text(4, 6), "北京", "PER", 0.7),
4771        ];
4772
4773        let cmp = EvalComparison::compare(text, gold, predicted);
4774        let html = render_eval_html_with_title(&cmp, "test");
4775
4776        // Selection readout (useful for humans + enables browser-based verification)
4777        assert!(html.contains("id=\"selection\""));
4778
4779        // Span IDs must be stable and distinct between gold/pred (segmented renderer uses data-eids)
4780        assert!(html.contains("data-eids=\"G0\""));
4781        assert!(html.contains("data-eids=\"P0\""));
4782
4783        // Match rows must carry cross-links and be clickable
4784        assert!(html.contains("class=\"match-link\""));
4785        assert!(html.contains("href=\"#M0\""));
4786        assert!(html.contains("data-gid=\"G0\""));
4787        assert!(html.contains("data-pid=\"P0\""));
4788
4789        // Ensure we didn't break Unicode rendering
4790        assert!(html.contains("北京"));
4791    }
4792
4793    fn find_char_span(text: &str, needle: &str) -> Option<(usize, usize)> {
4794        let hay: Vec<char> = text.chars().collect();
4795        let pat: Vec<char> = needle.chars().collect();
4796        if pat.is_empty() || hay.len() < pat.len() {
4797            return None;
4798        }
4799        for i in 0..=(hay.len() - pat.len()) {
4800            if hay[i..(i + pat.len())] == pat[..] {
4801                return Some((i, i + pat.len()));
4802            }
4803        }
4804        None
4805    }
4806
4807    #[test]
4808    fn test_annotate_text_html_supports_overlaps_discontinuous_and_unicode() {
4809        // Intentionally include multiple scripts and an overlap + discontinuous mention.
4810        let text = "Marie Curie met Cher in Paris. 習近平在北京會見了普京。 \
4811التقى محمد بن سلمان في الرياض. Путин встретился с Си Цзиньпином в Москве. \
4812प्रधान मंत्री शर्मा दिल्ली में मिले। severe pain ... in abdomen.";
4813
4814        // Overlap: "Marie Curie" contains "Curie"
4815        let (m0s, m0e) = find_char_span(text, "Marie Curie").unwrap();
4816        let (m1s, m1e) = find_char_span(text, "Curie").unwrap();
4817
4818        // Discontinuous: "pain" + "abdomen"
4819        let pain = find_char_span(text, "pain").unwrap();
4820        let abdomen = find_char_span(text, "abdomen").unwrap();
4821
4822        let signals: Vec<Signal<Location>> = vec![
4823            Signal::new(
4824                SignalId::new(0),
4825                Location::text(m0s, m0e),
4826                "Marie Curie",
4827                "PER",
4828                0.9,
4829            ),
4830            Signal::new(
4831                SignalId::new(1),
4832                Location::text(m1s, m1e),
4833                "Curie",
4834                "PER",
4835                0.8,
4836            ),
4837            Signal::new(
4838                SignalId::new(2),
4839                Location::Discontinuous {
4840                    segments: vec![pain, abdomen],
4841                },
4842                "pain … abdomen",
4843                "SYMPTOM",
4844                0.7,
4845            ),
4846        ];
4847
4848        let html = annotate_text_html(text, &signals, &std::collections::HashMap::new());
4849
4850        // Overlap must be representable (segment(s) covered by both S0 and S1).
4851        assert!(html.contains("data-sids=\"S0 S1\"") || html.contains("data-sids=\"S1 S0\""));
4852
4853        // Discontinuous mention should be present in two places (at least one segment contains S2).
4854        assert!(html.contains("data-sids=\"S2\""));
4855
4856        // Unicode safety: the original text snippets should still appear.
4857        assert!(html.contains("北京"));
4858        assert!(html.contains("Москве"));
4859        assert!(html.contains("शर्मा"));
4860        assert!(html.contains("محمد"));
4861    }
4862
4863    #[test]
4864    fn test_location_text_iou() {
4865        let l1 = Location::text(0, 10);
4866        let l2 = Location::text(5, 15);
4867        let iou = l1.iou(&l2).unwrap();
4868        // Intersection: [5, 10) = 5 chars
4869        // Union: [0, 15) = 15 chars
4870        // IoU = 5/15 = 0.333...
4871        assert!((iou - 0.333).abs() < 0.01);
4872    }
4873
4874    #[test]
4875    fn test_location_bbox_iou() {
4876        let b1 = Location::bbox(0.0, 0.0, 0.5, 0.5);
4877        let b2 = Location::bbox(0.25, 0.25, 0.5, 0.5);
4878        let iou = b1.iou(&b2).unwrap();
4879        // Intersection: 0.25 * 0.25 = 0.0625
4880        // Union: 0.5*0.5 + 0.5*0.5 - 0.0625 = 0.4375
4881        // IoU = 0.0625/0.4375 ≈ 0.143
4882        assert!((iou - 0.143).abs() < 0.01);
4883    }
4884
4885    #[test]
4886    fn test_location_different_types_no_iou() {
4887        let text = Location::text(0, 10);
4888        let bbox = Location::bbox(0.0, 0.0, 0.5, 0.5);
4889        assert!(text.iou(&bbox).is_none());
4890    }
4891
4892    #[test]
4893    fn test_signal_creation() {
4894        let signal: Signal<Location> =
4895            Signal::new(0, Location::text(0, 11), "Marie Curie", "Person", 0.95);
4896        assert_eq!(signal.surface, "Marie Curie");
4897        assert_eq!(signal.label, "Person".into());
4898        assert!((signal.confidence - 0.95).abs() < 0.001);
4899        assert!(!signal.negated);
4900    }
4901
4902    #[test]
4903    fn test_signal_with_linguistic_features() {
4904        let signal: Signal<Location> =
4905            Signal::new(0, Location::text(0, 10), "not a doctor", "Occupation", 0.8)
4906                .negated()
4907                .with_quantifier(Quantifier::Existential)
4908                .with_modality(Modality::Symbolic);
4909
4910        assert!(signal.negated);
4911        assert_eq!(signal.quantifier, Some(Quantifier::Existential));
4912        assert!(signal.modality.supports_linguistic_features());
4913    }
4914
4915    #[test]
4916    fn test_track_formation() {
4917        let mut track = Track::new(0, "Marie Curie");
4918        track.add_signal(0, 0);
4919        track.add_signal(1, 1);
4920        track.add_signal(2, 2);
4921
4922        assert_eq!(track.len(), 3);
4923        assert!(!track.is_singleton());
4924        assert!(!track.is_empty());
4925    }
4926
4927    #[test]
4928    fn test_identity_creation() {
4929        let identity = Identity::from_kb(0, "Marie Curie", "wikidata", "Q7186")
4930            .with_type("Person")
4931            .with_embedding(vec![0.1, 0.2, 0.3]);
4932
4933        assert_eq!(identity.canonical_name, "Marie Curie");
4934        assert_eq!(identity.kb_id, Some("Q7186".to_string()));
4935        assert_eq!(identity.kb_name, Some("wikidata".to_string()));
4936        assert!(identity.embedding.is_some());
4937    }
4938
4939    #[test]
4940    fn test_grounded_document_hierarchy() {
4941        let mut doc = GroundedDocument::new(
4942            "doc1",
4943            "Marie Curie won the Nobel Prize. She was a physicist.",
4944        );
4945
4946        // Add signals (Level 1)
4947        let s1 = doc.add_signal(Signal::new(
4948            0,
4949            Location::text(0, 12),
4950            "Marie Curie",
4951            "Person",
4952            0.95,
4953        ));
4954        let s2 = doc.add_signal(Signal::new(
4955            1,
4956            Location::text(38, 41),
4957            "She",
4958            "Person",
4959            0.88,
4960        ));
4961        let s3 = doc.add_signal(Signal::new(
4962            2,
4963            Location::text(17, 29),
4964            "Nobel Prize",
4965            "Award",
4966            0.92,
4967        ));
4968
4969        // Form tracks (Level 2)
4970        let mut track1 = Track::new(0, "Marie Curie");
4971        track1.add_signal(s1, 0);
4972        track1.add_signal(s2, 1);
4973        let track1_id = doc.add_track(track1);
4974
4975        let mut track2 = Track::new(1, "Nobel Prize");
4976        track2.add_signal(s3, 0);
4977        doc.add_track(track2);
4978
4979        // Add identity (Level 3)
4980        let identity = Identity::from_kb(0, "Marie Curie", "wikidata", "Q7186");
4981        let identity_id = doc.add_identity(identity);
4982        doc.link_track_to_identity(track1_id, identity_id);
4983
4984        // Verify hierarchy traversal
4985        assert_eq!(doc.signals().len(), 3);
4986        assert_eq!(doc.tracks().count(), 2);
4987        assert_eq!(doc.identities().count(), 1);
4988
4989        // Signal → Track
4990        let track = doc.track_for_signal(s1).unwrap();
4991        assert_eq!(track.canonical_surface, "Marie Curie");
4992        assert_eq!(track.len(), 2);
4993
4994        // Track → Identity
4995        let identity = doc.identity_for_track(track1_id).unwrap();
4996        assert_eq!(identity.kb_id, Some("Q7186".to_string()));
4997
4998        // Signal → Identity (transitive)
4999        let identity = doc.identity_for_signal(s1).unwrap();
5000        assert_eq!(identity.canonical_name, "Marie Curie");
5001    }
5002
5003    #[test]
5004    fn test_modality_features() {
5005        assert!(Modality::Symbolic.supports_linguistic_features());
5006        assert!(!Modality::Symbolic.supports_geometric_features());
5007
5008        assert!(!Modality::Iconic.supports_linguistic_features());
5009        assert!(Modality::Iconic.supports_geometric_features());
5010
5011        assert!(Modality::Hybrid.supports_linguistic_features());
5012        assert!(Modality::Hybrid.supports_geometric_features());
5013    }
5014
5015    #[test]
5016    fn test_location_from_span() {
5017        let span = Span::Text { start: 0, end: 10 };
5018        let location = Location::from(&span);
5019        assert_eq!(location.text_offsets(), Some((0, 10)));
5020
5021        let span = Span::BoundingBox {
5022            x: 0.1,
5023            y: 0.2,
5024            width: 0.3,
5025            height: 0.4,
5026            page: Some(1),
5027        };
5028        let location = Location::from(&span);
5029        assert!(matches!(location, Location::BoundingBox { .. }));
5030    }
5031
5032    #[test]
5033    fn test_entity_roundtrip() {
5034        use super::EntityType;
5035
5036        let entities = vec![
5037            Entity::new("Marie Curie", EntityType::Person, 0, 12, 0.95),
5038            Entity::new(
5039                "Nobel Prize",
5040                EntityType::custom("Award", EntityCategory::Creative),
5041                17,
5042                29,
5043                0.92,
5044            ),
5045        ];
5046
5047        let doc =
5048            GroundedDocument::from_entities("doc1", "Marie Curie won the Nobel Prize.", &entities);
5049        let converted = doc.to_entities();
5050
5051        assert_eq!(converted.len(), 2);
5052        assert_eq!(converted[0].text, "Marie Curie");
5053        assert_eq!(converted[1].text, "Nobel Prize");
5054    }
5055
5056    #[test]
5057    fn test_signal_confidence_threshold() {
5058        let signal: Signal<Location> = Signal::new(0, Location::text(0, 10), "test", "Type", 0.75);
5059        assert!(signal.is_confident(0.5));
5060        assert!(signal.is_confident(0.75));
5061        assert!(!signal.is_confident(0.8));
5062    }
5063
5064    #[test]
5065    fn test_document_filtering() {
5066        let mut doc = GroundedDocument::new("doc1", "Test text");
5067
5068        // Add signals with different confidences and labels
5069        doc.add_signal(Signal::new(0, Location::text(0, 4), "high", "Person", 0.95));
5070        doc.add_signal(Signal::new(1, Location::text(5, 8), "low", "Person", 0.3));
5071        doc.add_signal(Signal::new(
5072            2,
5073            Location::text(9, 12),
5074            "org",
5075            "Organization",
5076            0.8,
5077        ));
5078
5079        // Filter by confidence
5080        let confident = doc.confident_signals(0.5);
5081        assert_eq!(confident.len(), 2);
5082
5083        // Filter by label
5084        let persons = doc.signals_with_label("Person");
5085        assert_eq!(persons.len(), 2);
5086
5087        let orgs = doc.signals_with_label("Organization");
5088        assert_eq!(orgs.len(), 1);
5089    }
5090
5091    #[test]
5092    fn test_untracked_signals() {
5093        let mut doc = GroundedDocument::new("doc1", "Test");
5094
5095        let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "a", "T", 0.9));
5096        let s2 = doc.add_signal(Signal::new(1, Location::text(5, 8), "b", "T", 0.9));
5097        let _s3 = doc.add_signal(Signal::new(2, Location::text(9, 12), "c", "T", 0.9));
5098
5099        // Only track s1 and s2
5100        let mut track = Track::new(0, "a");
5101        track.add_signal(s1, 0);
5102        track.add_signal(s2, 1);
5103        doc.add_track(track);
5104
5105        // s3 should be untracked
5106        assert_eq!(doc.untracked_signal_count(), 1);
5107        let untracked = doc.untracked_signals();
5108        assert_eq!(untracked.len(), 1);
5109        assert_eq!(untracked[0].surface, "c");
5110    }
5111
5112    #[test]
5113    fn test_linked_unlinked_tracks() {
5114        let mut doc = GroundedDocument::new("doc1", "Test");
5115
5116        let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "a", "T", 0.9));
5117        let s2 = doc.add_signal(Signal::new(1, Location::text(5, 8), "b", "T", 0.9));
5118
5119        let mut track1 = Track::new(0, "a");
5120        track1.add_signal(s1, 0);
5121        let track1_id = doc.add_track(track1);
5122
5123        let mut track2 = Track::new(1, "b");
5124        track2.add_signal(s2, 0);
5125        doc.add_track(track2);
5126
5127        // Link only track1 to an identity
5128        let identity = Identity::new(0, "Entity A");
5129        let identity_id = doc.add_identity(identity);
5130        doc.link_track_to_identity(track1_id, identity_id);
5131
5132        assert_eq!(doc.linked_tracks().count(), 1);
5133        assert_eq!(doc.unlinked_tracks().count(), 1);
5134    }
5135
5136    #[test]
5137    fn test_location_overlaps() {
5138        let l1 = Location::text(0, 10);
5139        let l2 = Location::text(5, 15);
5140        let l3 = Location::text(15, 20);
5141
5142        assert!(l1.overlaps(&l2));
5143        assert!(!l1.overlaps(&l3));
5144        assert!(!l2.overlaps(&l3)); // [5,15) and [15,20) don't overlap
5145
5146        // Bounding boxes
5147        let b1 = Location::bbox(0.0, 0.0, 0.5, 0.5);
5148        let b2 = Location::bbox(0.4, 0.4, 0.5, 0.5);
5149        let b3 = Location::bbox(0.6, 0.6, 0.2, 0.2);
5150
5151        assert!(b1.overlaps(&b2));
5152        assert!(!b1.overlaps(&b3));
5153    }
5154
5155    #[test]
5156    fn test_iou_edge_cases() {
5157        // No overlap
5158        let l1 = Location::text(0, 5);
5159        let l2 = Location::text(10, 15);
5160        assert_eq!(l1.iou(&l2), Some(0.0));
5161
5162        // Complete overlap (identical)
5163        let l3 = Location::text(0, 10);
5164        let l4 = Location::text(0, 10);
5165        assert_eq!(l3.iou(&l4), Some(1.0));
5166
5167        // One contains the other
5168        let l5 = Location::text(0, 20);
5169        let l6 = Location::text(5, 15);
5170        let iou = l5.iou(&l6).unwrap();
5171        // Intersection: 10, Union: 20
5172        assert!((iou - 0.5).abs() < 0.001);
5173    }
5174
    // Note: Tests that depend on `anno::eval::coref` types have been moved to the `anno` crate
    // (test_coref_chain_conversion, test_from_coref_document, test_coref_roundtrip).
5177
5178    #[test]
5179    fn test_document_stats() {
5180        let mut doc = GroundedDocument::new("doc1", "Test document with entities.");
5181
5182        // Add signals with varying properties
5183        let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9));
5184        let mut negated = Signal::new(0, Location::text(5, 13), "document", "Type", 0.8);
5185        negated.negated = true;
5186        let s2 = doc.add_signal(negated);
5187        let _s3 = doc.add_signal(Signal::new(
5188            0,
5189            Location::text(19, 27),
5190            "entities",
5191            "Type",
5192            0.7,
5193        ));
5194
5195        // Create one track with 2 signals
5196        let mut track = Track::new(0, "Test");
5197        track.add_signal(s1, 0);
5198        track.add_signal(s2, 1);
5199        doc.add_track(track);
5200
5201        // Add identity for the track
5202        let identity = Identity::new(0, "Test Entity");
5203        let identity_id = doc.add_identity(identity);
5204        doc.link_track_to_identity(0, identity_id);
5205
5206        let stats = doc.stats();
5207
5208        assert_eq!(stats.signal_count, 3);
5209        assert_eq!(stats.track_count, 1);
5210        assert_eq!(stats.identity_count, 1);
5211        assert_eq!(stats.linked_track_count, 1);
5212        assert_eq!(stats.untracked_count, 1); // s3 is untracked
5213        assert_eq!(stats.negated_count, 1);
5214        assert!((stats.avg_confidence - 0.8).abs() < 0.01); // (0.9 + 0.8 + 0.7) / 3
5215        assert!((stats.avg_track_size - 2.0).abs() < 0.01);
5216    }
5217
5218    #[test]
5219    fn test_batch_operations() {
5220        let mut doc = GroundedDocument::new("doc1", "Test document.");
5221
5222        // Batch add signals
5223        let signals = vec![
5224            Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9),
5225            Signal::new(0, Location::text(5, 13), "document", "Type", 0.8),
5226        ];
5227        let ids = doc.add_signals(signals);
5228
5229        assert_eq!(ids.len(), 2);
5230        assert_eq!(doc.signals().len(), 2);
5231
5232        // Create track from signal IDs
5233        let track_id = doc.create_track_from_signals("Test", &ids);
5234        assert!(track_id.is_some());
5235
5236        let track = doc.get_track(track_id.unwrap()).unwrap();
5237        assert_eq!(track.len(), 2);
5238        assert_eq!(track.canonical_surface, "Test");
5239    }
5240
5241    #[test]
5242    fn test_merge_tracks() {
5243        let mut doc = GroundedDocument::new("doc1", "John Smith works at Acme. He is great.");
5244
5245        // Add signals
5246        let s1 = doc.add_signal(Signal::new(
5247            0,
5248            Location::text(0, 10),
5249            "John Smith",
5250            "Person",
5251            0.9,
5252        ));
5253        let s2 = doc.add_signal(Signal::new(0, Location::text(26, 28), "He", "Person", 0.8));
5254
5255        // Create two separate tracks
5256        let mut track1 = Track::new(0, "John Smith");
5257        track1.add_signal(s1, 0);
5258        let track1_id = doc.add_track(track1);
5259
5260        let mut track2 = Track::new(0, "He");
5261        track2.add_signal(s2, 0);
5262        let track2_id = doc.add_track(track2);
5263
5264        assert_eq!(doc.tracks().count(), 2);
5265
5266        // Merge tracks
5267        let merged_id = doc.merge_tracks(&[track1_id, track2_id]);
5268        assert!(merged_id.is_some());
5269
5270        // Should now have only 1 track with 2 signals
5271        assert_eq!(doc.tracks().count(), 1);
5272        let merged = doc.get_track(merged_id.unwrap()).unwrap();
5273        assert_eq!(merged.len(), 2);
5274        assert_eq!(merged.canonical_surface, "John Smith"); // From first track
5275    }
5276
5277    #[test]
5278    fn test_find_overlapping_pairs() {
5279        let mut doc = GroundedDocument::new("doc1", "New York City is great.");
5280
5281        // Add overlapping signals (nested entity)
5282        doc.add_signal(Signal::new(
5283            0,
5284            Location::text(0, 13),
5285            "New York City",
5286            "Location",
5287            0.9,
5288        ));
5289        doc.add_signal(Signal::new(
5290            0,
5291            Location::text(0, 8),
5292            "New York",
5293            "Location",
5294            0.85,
5295        ));
5296        doc.add_signal(Signal::new(0, Location::text(17, 22), "great", "Adj", 0.7)); // Not overlapping
5297
5298        let pairs = doc.find_overlapping_signal_pairs();
5299
5300        // Should find one overlapping pair (New York City & New York)
5301        assert_eq!(pairs.len(), 1);
5302    }
5303
5304    #[test]
5305    fn test_signals_in_range() {
5306        let mut doc = GroundedDocument::new("doc1", "John went to Paris and Berlin last year.");
5307
5308        doc.add_signal(Signal::new(0, Location::text(0, 4), "John", "Person", 0.9));
5309        doc.add_signal(Signal::new(
5310            0,
5311            Location::text(13, 18),
5312            "Paris",
5313            "Location",
5314            0.9,
5315        ));
5316        doc.add_signal(Signal::new(
5317            0,
5318            Location::text(23, 29),
5319            "Berlin",
5320            "Location",
5321            0.9,
5322        ));
5323        doc.add_signal(Signal::new(
5324            0,
5325            Location::text(30, 39),
5326            "last year",
5327            "Date",
5328            0.8,
5329        ));
5330
5331        // Find signals in the "Paris and Berlin" section
5332        let in_range = doc.signals_in_range(10, 30);
5333        assert_eq!(in_range.len(), 2); // Paris and Berlin
5334
5335        let surfaces: Vec<_> = in_range.iter().map(|s| &s.surface).collect();
5336        assert!(surfaces.contains(&&"Paris".to_string()));
5337        assert!(surfaces.contains(&&"Berlin".to_string()));
5338    }
5339
5340    #[test]
5341    fn test_modality_filtering() {
5342        let mut doc = GroundedDocument::new("doc1", "Test");
5343
5344        // Add text signal
5345        let mut text_signal = Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9);
5346        text_signal.modality = Modality::Symbolic;
5347        doc.add_signal(text_signal);
5348
5349        // Add visual signal
5350        let mut visual_signal =
5351            Signal::new(0, Location::bbox(0.0, 0.0, 0.5, 0.5), "Box", "Type", 0.8);
5352        visual_signal.modality = Modality::Iconic;
5353        doc.add_signal(visual_signal);
5354
5355        assert_eq!(doc.text_signals().len(), 1);
5356        assert_eq!(doc.visual_signals().len(), 1);
5357        assert_eq!(doc.signals_by_modality(Modality::Hybrid).len(), 0);
5358    }
5359
5360    #[test]
5361    fn test_quantifier_variants() {
5362        // Ensure all quantifier variants work
5363        let quantifiers = [
5364            Quantifier::Universal,
5365            Quantifier::Existential,
5366            Quantifier::None,
5367            Quantifier::Definite,
5368            Quantifier::Bare,
5369        ];
5370
5371        for q in quantifiers {
5372            let signal: Signal<Location> =
5373                Signal::new(0, Location::text(0, 5), "test", "Type", 0.9).with_quantifier(q);
5374
5375            assert_eq!(signal.quantifier, Some(q));
5376        }
5377    }
5378
5379    #[test]
5380    fn test_location_modality_derivation() {
5381        assert_eq!(Location::text(0, 10).modality(), Modality::Symbolic);
5382        assert_eq!(
5383            Location::bbox(0.0, 0.0, 0.5, 0.5).modality(),
5384            Modality::Iconic
5385        );
5386
5387        let temporal = Location::Temporal {
5388            start_sec: 0.0,
5389            end_sec: 5.0,
5390            frame: None,
5391        };
5392        assert_eq!(temporal.modality(), Modality::Iconic);
5393
5394        let genomic = Location::Genomic {
5395            contig: "chr1".into(),
5396            start: 0,
5397            end: 1000,
5398            strand: Some('+'),
5399        };
5400        assert_eq!(genomic.modality(), Modality::Symbolic);
5401
5402        let hybrid = Location::TextWithBbox {
5403            start: 0,
5404            end: 10,
5405            bbox: Box::new(Location::bbox(0.0, 0.0, 0.5, 0.5)),
5406        };
5407        assert_eq!(hybrid.modality(), Modality::Hybrid);
5408    }
5409
    // Note: The CrossDocCluster conversion test was moved to the `anno` crate
    // because CrossDocCluster is defined in anno/src/eval/cdcr.rs.
5412}
5413
5414// =============================================================================
5415// Property-Based Tests
5416// =============================================================================
5417//
5418// These tests verify invariants that should hold for ALL valid inputs,
5419// not just specific examples. They catch edge cases that unit tests miss.
5420
5421#[cfg(test)]
5422mod proptests {
5423    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in property tests
5424    use super::*;
5425    use proptest::prelude::*;
5426
5427    // -------------------------------------------------------------------------
5428    // Strategies for generating test data
5429    // -------------------------------------------------------------------------
5430
5431    /// Generate valid confidence values in [0, 1].
5432    fn confidence_strategy() -> impl Strategy<Value = f32> {
5433        0.0f32..=1.0
5434    }
5435
5436    /// Generate signal labels.
5437    fn label_strategy() -> impl Strategy<Value = String> {
5438        prop_oneof![
5439            Just("Person".to_string()),
5440            Just("Organization".to_string()),
5441            Just("Location".to_string()),
5442            Just("Date".to_string()),
5443            "[A-Z][a-z]{2,10}".prop_map(|s| s),
5444        ]
5445    }
5446
5447    /// Generate surface forms (entity text).
5448    fn surface_strategy() -> impl Strategy<Value = String> {
5449        "[A-Za-z ]{1,50}".prop_map(|s| s.trim().to_string())
5450    }
5451
5452    // -------------------------------------------------------------------------
5453    // IoU Properties (Intersection over Union)
5454    // -------------------------------------------------------------------------
5455
5456    proptest! {
5457        /// IoU is symmetric: iou(a, b) == iou(b, a)
5458        #[test]
5459        fn iou_symmetric(
5460            start1 in 0usize..1000,
5461            len1 in 1usize..500,
5462            start2 in 0usize..1000,
5463            len2 in 1usize..500,
5464        ) {
5465            let a = Location::text(start1, start1 + len1);
5466            let b = Location::text(start2, start2 + len2);
5467
5468            let iou_ab = a.iou(&b);
5469            let iou_ba = b.iou(&a);
5470
5471            prop_assert_eq!(iou_ab, iou_ba, "IoU must be symmetric");
5472        }
5473
5474        /// IoU is bounded: 0 <= iou <= 1
5475        #[test]
5476        fn iou_bounded(
5477            start1 in 0usize..1000,
5478            len1 in 1usize..500,
5479            start2 in 0usize..1000,
5480            len2 in 1usize..500,
5481        ) {
5482            let a = Location::text(start1, start1 + len1);
5483            let b = Location::text(start2, start2 + len2);
5484
5485            if let Some(iou) = a.iou(&b) {
5486                prop_assert!(iou >= 0.0, "IoU must be non-negative: got {}", iou);
5487                prop_assert!(iou <= 1.0, "IoU must be at most 1: got {}", iou);
5488            }
5489        }
5490
5491        /// Self-IoU is 1: iou(a, a) == 1
5492        #[test]
5493        fn iou_self_identity(start in 0usize..1000, len in 1usize..500) {
5494            let loc = Location::text(start, start + len);
5495            let iou = loc.iou(&loc).unwrap();
5496            prop_assert!(
5497                (iou - 1.0).abs() < 1e-6,
5498                "Self-IoU must be 1.0, got {}",
5499                iou
5500            );
5501        }
5502
5503        /// Non-overlapping locations have IoU = 0
5504        #[test]
5505        fn iou_non_overlapping_zero(
5506            start1 in 0usize..500,
5507            len1 in 1usize..100,
5508        ) {
5509            let end1 = start1 + len1;
5510            let start2 = end1 + 100; // Guaranteed gap
5511            let len2 = 50;
5512
5513            let a = Location::text(start1, end1);
5514            let b = Location::text(start2, start2 + len2);
5515
5516            let iou = a.iou(&b).expect("bbox iou should be defined");
5517            prop_assert!(
5518                iou.abs() < 1e-6,
5519                "Non-overlapping IoU must be 0, got {}",
5520                iou
5521            );
5522        }
5523
5524        /// BoundingBox IoU is also symmetric and bounded
5525        #[test]
5526        fn bbox_iou_symmetric_bounded(
5527            x1 in 0.0f32..0.8,
5528            y1 in 0.0f32..0.8,
5529            w1 in 0.05f32..0.2,
5530            h1 in 0.05f32..0.2,
5531            x2 in 0.0f32..0.8,
5532            y2 in 0.0f32..0.8,
5533            w2 in 0.05f32..0.2,
5534            h2 in 0.05f32..0.2,
5535        ) {
5536            let a = Location::bbox(x1, y1, w1, h1);
5537            let b = Location::bbox(x2, y2, w2, h2);
5538
5539            let iou_ab = a.iou(&b);
5540            let iou_ba = b.iou(&a);
5541
5542            // Symmetry
5543            prop_assert_eq!(iou_ab, iou_ba, "BBox IoU must be symmetric");
5544
5545            // Bounded
5546            if let Some(iou) = iou_ab {
5547                prop_assert!(
5548                    (0.0..=1.0).contains(&iou),
5549                    "BBox IoU out of bounds: {}",
5550                    iou
5551                );
5552            }
5553        }
5554    }
5555
5556    // -------------------------------------------------------------------------
5557    // Signal Properties
5558    // -------------------------------------------------------------------------
5559
5560    proptest! {
5561        /// Confidence is always clamped to [0, 1]
5562        #[test]
5563        fn signal_confidence_clamped(raw_conf in -10.0f32..10.0) {
5564            let signal: Signal<Location> = Signal::new(
5565                0,
5566                Location::text(0, 10),
5567                "test",
5568                "Type",
5569                raw_conf,
5570            );
5571
5572            prop_assert!(signal.confidence >= 0.0, "Confidence below 0: {}", signal.confidence);
5573            prop_assert!(signal.confidence <= 1.0, "Confidence above 1: {}", signal.confidence);
5574        }
5575
5576        /// Signal with valid inputs preserves surface and label
5577        #[test]
5578        fn signal_preserves_data(
5579            surface in surface_strategy(),
5580            label in label_strategy(),
5581            conf in confidence_strategy(),
5582            start in 0usize..1000,
5583            len in 1usize..100,
5584        ) {
5585            let signal: Signal<Location> = Signal::new(
5586                0,
5587                Location::text(start, start + len),
5588                &surface,
5589                label.as_str(),
5590                conf,
5591            );
5592
5593            prop_assert_eq!(&signal.surface, &surface);
5594            let want = crate::TypeLabel::from(label.as_str());
5595            prop_assert_eq!(signal.label, want);
5596        }
5597
5598        /// Negation is idempotent: negated().negated() still has negated=true
5599        /// (Note: our API doesn't have an "un-negate", so calling negated() twice
5600        /// just keeps it negated - this tests that it doesn't toggle)
5601        #[test]
5602        fn signal_negation_stable(conf in confidence_strategy()) {
5603            let signal: Signal<Location> = Signal::new(
5604                0,
5605                Location::text(0, 10),
5606                "test",
5607                "Type",
5608                conf,
5609            )
5610            .negated();
5611
5612            prop_assert!(signal.negated, "Signal should be negated after .negated()");
5613        }
5614
5615        /// Symbolic modality supports linguistic features
5616        #[test]
5617        fn symbolic_supports_linguistic(
5618            start in 0usize..1000,
5619            len in 1usize..100,
5620        ) {
5621            let loc = Location::text(start, start + len);
5622            prop_assert!(
5623                loc.modality().supports_linguistic_features(),
5624                "Text locations must support linguistic features"
5625            );
5626        }
5627
5628        /// Iconic modality supports geometric features
5629        #[test]
5630        fn iconic_supports_geometric(
5631            x in 0.0f32..0.9,
5632            y in 0.0f32..0.9,
5633            w in 0.01f32..0.5,
5634            h in 0.01f32..0.5,
5635        ) {
5636            let loc = Location::bbox(x, y, w, h);
5637            prop_assert!(
5638                loc.modality().supports_geometric_features(),
5639                "BBox locations must support geometric features"
5640            );
5641        }
5642    }
5643
5644    // -------------------------------------------------------------------------
5645    // Track Properties
5646    // -------------------------------------------------------------------------
5647
5648    proptest! {
5649        /// Track length increases with each added signal
5650        #[test]
5651        fn track_length_monotonic(signal_count in 1usize..20) {
5652            let mut track = Track::new(0, "test");
5653
5654            for i in 0..signal_count {
5655                track.add_signal(i, i as u32);
5656                prop_assert_eq!(
5657                    track.len(),
5658                    i + 1,
5659                    "Track length should be {} after adding {} signals",
5660                    i + 1,
5661                    i + 1
5662                );
5663            }
5664        }
5665
5666        /// Track is never empty after adding a signal
5667        #[test]
5668        fn track_not_empty_after_add(canonical in surface_strategy()) {
5669            let mut track = Track::new(0, &canonical);
5670            prop_assert!(track.is_empty(), "New track should be empty");
5671
5672            track.add_signal(0, 0);
5673            prop_assert!(!track.is_empty(), "Track should not be empty after add");
5674        }
5675
5676        /// Track positions are stored correctly
5677        #[test]
5678        fn track_positions_stored(signal_count in 1usize..10) {
5679            let mut track = Track::new(0, "test");
5680
5681            for i in 0..signal_count {
5682                track.add_signal(i, i as u32);
5683            }
5684
5685            for (idx, signal_ref) in track.signals.iter().enumerate() {
5686                prop_assert_eq!(
5687                    signal_ref.position as usize,
5688                    idx,
5689                    "Signal position mismatch at index {}",
5690                    idx
5691                );
5692            }
5693        }
5694    }
5695
5696    // -------------------------------------------------------------------------
5697    // GroundedDocument Properties
5698    // -------------------------------------------------------------------------
5699
5700    proptest! {
5701        /// Signal IDs are unique and monotonically increasing
5702        #[test]
5703        fn document_signal_ids_monotonic(signal_count in 1usize..20) {
5704            let mut doc = GroundedDocument::new("test", "test text");
5705
5706            let mut prev_id: Option<SignalId> = None;
5707            for i in 0..signal_count {
5708                let id = doc.add_signal(Signal::new(
5709                    999, // Should be overwritten
5710                    Location::text(i * 10, i * 10 + 5),
5711                    format!("entity_{}", i),
5712                    "Type",
5713                    0.9,
5714                ));
5715
5716                if let Some(prev) = prev_id {
5717                    prop_assert!(id > prev, "Signal IDs should be monotonically increasing");
5718                }
5719                prev_id = Some(id);
5720            }
5721        }
5722
5723        /// Track membership is consistent: if signal is in track, track_for_signal returns that track
5724        #[test]
5725        fn document_track_membership_consistent(signal_count in 1usize..5) {
5726            let mut doc = GroundedDocument::new("test", "test text");
5727
5728            // Add signals
5729            let mut signal_ids = Vec::new();
5730            for i in 0..signal_count {
5731                let id = doc.add_signal(Signal::new(
5732                    0,
5733                    Location::text(i * 10, i * 10 + 5),
5734                    format!("entity_{}", i),
5735                    "Type",
5736                    0.9,
5737                ));
5738                signal_ids.push(id);
5739            }
5740
5741            // Create track with all signals
5742            let mut track = Track::new(0, "canonical");
5743            for (pos, &id) in signal_ids.iter().enumerate() {
5744                track.add_signal(id, pos as u32);
5745            }
5746            let track_id = doc.add_track(track);
5747
5748            // Verify membership
5749            for &signal_id in &signal_ids {
5750                let found_track = doc.track_for_signal(signal_id);
5751                prop_assert!(found_track.is_some(), "Signal should be in a track");
5752                prop_assert_eq!(
5753                    found_track.unwrap().id,
5754                    track_id,
5755                    "Signal should be in the correct track"
5756                );
5757            }
5758        }
5759
5760        /// Identity linking is transitive: signal → track → identity
5761        #[test]
5762        fn document_identity_transitivity(signal_count in 1usize..3) {
5763            let mut doc = GroundedDocument::new("test", "test text");
5764
5765            // Add signals
5766            let mut signal_ids = Vec::new();
5767            for i in 0..signal_count {
5768                let id = doc.add_signal(Signal::new(
5769                    0,
5770                    Location::text(i * 10, i * 10 + 5),
5771                    format!("entity_{}", i),
5772                    "Type",
5773                    0.9,
5774                ));
5775                signal_ids.push(id);
5776            }
5777
5778            // Create track and identity
5779            let mut track = Track::new(0, "canonical");
5780            for (pos, &id) in signal_ids.iter().enumerate() {
5781                track.add_signal(id, pos as u32);
5782            }
5783            let track_id = doc.add_track(track);
5784
5785            let identity = Identity::from_kb(0, "Entity", "wikidata", "Q123");
5786            let identity_id = doc.add_identity(identity);
5787            doc.link_track_to_identity(track_id, identity_id);
5788
5789            // Verify transitivity
5790            for &signal_id in &signal_ids {
5791                let identity = doc.identity_for_signal(signal_id);
5792                prop_assert!(identity.is_some(), "Should find identity through signal");
5793                prop_assert_eq!(
5794                    identity.unwrap().id,
5795                    identity_id,
5796                    "Should find correct identity"
5797                );
5798            }
5799        }
5800
5801        /// Untracked signals are correctly identified
5802        #[test]
5803        fn document_untracked_signals(total in 2usize..10, tracked in 0usize..10) {
5804            let tracked = tracked.min(total - 1); // Ensure at least one untracked
5805            let mut doc = GroundedDocument::new("test", "test text");
5806
5807            // Add all signals
5808            let mut signal_ids = Vec::new();
5809            for i in 0..total {
5810                let id = doc.add_signal(Signal::new(
5811                    0,
5812                    Location::text(i * 10, i * 10 + 5),
5813                    format!("entity_{}", i),
5814                    "Type",
5815                    0.9,
5816                ));
5817                signal_ids.push(id);
5818            }
5819
5820            // Track only some signals
5821            let mut track = Track::new(0, "canonical");
5822            for (pos, &id) in signal_ids.iter().take(tracked).enumerate() {
5823                track.add_signal(id, pos as u32);
5824            }
5825            if tracked > 0 {
5826                doc.add_track(track);
5827            }
5828
5829            // Verify counts
5830            prop_assert_eq!(
5831                doc.untracked_signal_count(),
5832                total - tracked,
5833                "Wrong untracked count"
5834            );
5835        }
5836    }
5837
5838    // -------------------------------------------------------------------------
5839    // Roundtrip / Conversion Properties
5840    // -------------------------------------------------------------------------
5841
5842    proptest! {
5843        /// Entity → GroundedDocument → Entities preserves core data
5844        #[test]
5845        fn entity_roundtrip_preserves_text(
5846            text in surface_strategy(),
5847            start in 0usize..1000,
5848            len in 1usize..100,
5849            conf in 0.0f64..=1.0,
5850        ) {
5851            use super::EntityType;
5852
5853            let end = start + len;
5854            let entity = super::Entity::new(&text, EntityType::Person, start, end, conf);
5855
5856            let doc = GroundedDocument::from_entities("test", "x".repeat(end + 10), &[entity]);
5857            let converted = doc.to_entities();
5858
5859            prop_assert_eq!(converted.len(), 1, "Should have exactly one entity");
5860            prop_assert_eq!(&converted[0].text, &text, "Text should be preserved");
5861            prop_assert_eq!(converted[0].start, start, "Start should be preserved");
5862            prop_assert_eq!(converted[0].end, end, "End should be preserved");
5863        }
5864
5865        // Note: Property test that depends on anno::eval::coref types has been moved to anno crate
5866        // (coref_roundtrip_preserves_count)
5867    }
5868
5869    // -------------------------------------------------------------------------
5870    // Modality Invariants
5871    // -------------------------------------------------------------------------
5872
/// Modality feature support is consistent with semiotic theory.
///
/// The check takes no generated input, so it is a plain unit test: wrapping it
/// in `proptest!` with a dummy strategy only re-ran the same deterministic
/// assertions once per generated case.
#[test]
fn modality_feature_consistency() {
    // Iconic: supports geometric, not linguistic
    assert!(Modality::Iconic.supports_geometric_features());
    assert!(!Modality::Iconic.supports_linguistic_features());

    // Symbolic: supports linguistic, not geometric
    assert!(Modality::Symbolic.supports_linguistic_features());
    assert!(!Modality::Symbolic.supports_geometric_features());

    // Hybrid: supports both
    assert!(Modality::Hybrid.supports_linguistic_features());
    assert!(Modality::Hybrid.supports_geometric_features());
}
5890
5891    // -------------------------------------------------------------------------
5892    // Location Overlap Properties
5893    // -------------------------------------------------------------------------
5894
5895    proptest! {
5896        /// Overlap is symmetric: overlaps(a, b) == overlaps(b, a)
5897        #[test]
5898        fn overlap_symmetric(
5899            start1 in 0usize..1000,
5900            len1 in 1usize..100,
5901            start2 in 0usize..1000,
5902            len2 in 1usize..100,
5903        ) {
5904            let a = Location::text(start1, start1 + len1);
5905            let b = Location::text(start2, start2 + len2);
5906
5907            prop_assert_eq!(
5908                a.overlaps(&b),
5909                b.overlaps(&a),
5910                "Overlap must be symmetric"
5911            );
5912        }
5913
5914        /// A location always overlaps with itself
5915        #[test]
5916        fn overlap_reflexive(start in 0usize..1000, len in 1usize..100) {
5917            let loc = Location::text(start, start + len);
5918            prop_assert!(loc.overlaps(&loc), "Location must overlap with itself");
5919        }
5920
5921        /// If IoU > 0, then overlaps is true
5922        #[test]
5923        fn iou_implies_overlap(
5924            start1 in 0usize..500,
5925            len1 in 1usize..100,
5926            start2 in 0usize..500,
5927            len2 in 1usize..100,
5928        ) {
5929            let a = Location::text(start1, start1 + len1);
5930            let b = Location::text(start2, start2 + len2);
5931
5932            if let Some(iou) = a.iou(&b) {
5933                if iou > 0.0 {
5934                    prop_assert!(
5935                        a.overlaps(&b),
5936                        "IoU > 0 should imply overlap"
5937                    );
5938                }
5939            }
5940        }
5941    }
5942
5943    // -------------------------------------------------------------------------
5944    // DocumentStats Properties
5945    // -------------------------------------------------------------------------
5946
5947    proptest! {
5948        /// Stats signal count matches actual count
5949        #[test]
5950        fn stats_signal_count_accurate(signal_count in 0usize..20) {
5951            let mut doc = GroundedDocument::new("test", "test");
5952            for i in 0..signal_count {
5953                doc.add_signal(Signal::new(
5954                    0,
5955                    Location::text(i * 10, i * 10 + 5),
5956                    "entity",
5957                    "Type",
5958                    0.9,
5959                ));
5960            }
5961
5962            let stats = doc.stats();
5963            prop_assert_eq!(stats.signal_count, signal_count);
5964        }
5965
5966        /// Stats track count matches actual count
5967        #[test]
5968        fn stats_track_count_accurate(track_count in 0usize..10) {
5969            let mut doc = GroundedDocument::new("test", "test");
5970            for i in 0..track_count {
5971                let id = doc.add_signal(Signal::new(
5972                    0,
5973                    Location::text(i * 10, i * 10 + 5),
5974                    "entity",
5975                    "Type",
5976                    0.9,
5977                ));
5978                let mut track = Track::new(0, format!("track_{}", i));
5979                track.add_signal(id, 0);
5980                doc.add_track(track);
5981            }
5982
5983            let stats = doc.stats();
5984            prop_assert_eq!(stats.track_count, track_count);
5985        }
5986
5987        /// Avg confidence is in [0, 1]
5988        #[test]
5989        fn stats_avg_confidence_bounded(
5990            confidences in proptest::collection::vec(0.0f32..=1.0, 1..10)
5991        ) {
5992            let mut doc = GroundedDocument::new("test", "test");
5993            for (i, conf) in confidences.iter().enumerate() {
5994                doc.add_signal(Signal::new(
5995                    0,
5996                    Location::text(i * 10, i * 10 + 5),
5997                    "entity",
5998                    "Type",
5999                    *conf,
6000                ));
6001            }
6002
6003            let stats = doc.stats();
6004            prop_assert!(stats.avg_confidence >= 0.0);
6005            prop_assert!(stats.avg_confidence <= 1.0);
6006        }
6007    }
6008
6009    // -------------------------------------------------------------------------
6010    // Batch Operations Properties
6011    // -------------------------------------------------------------------------
6012
6013    proptest! {
6014        /// add_signals returns correct number of IDs
6015        #[test]
6016        fn batch_add_returns_all_ids(count in 1usize..10) {
6017            let mut doc = GroundedDocument::new("test", "test");
6018            let signals: Vec<Signal<Location>> = (0..count)
6019                .map(|i| Signal::new(0, Location::text(i * 10, i * 10 + 5), "e", "T", 0.9))
6020                .collect();
6021
6022            let ids = doc.add_signals(signals);
6023            prop_assert_eq!(ids.len(), count);
6024            prop_assert_eq!(doc.signals().len(), count);
6025        }
6026
6027        /// create_track_from_signals creates valid track
6028        #[test]
6029        fn create_track_valid(signal_count in 1usize..5) {
6030            let mut doc = GroundedDocument::new("test", "test");
6031            let mut signal_ids = Vec::new();
6032            for i in 0..signal_count {
6033                let id = doc.add_signal(Signal::new(
6034                    0,
6035                    Location::text(i * 10, i * 10 + 5),
6036                    "entity",
6037                    "Type",
6038                    0.9,
6039                ));
6040                signal_ids.push(id);
6041            }
6042
6043            let track_id = doc.create_track_from_signals("canonical", &signal_ids);
6044            prop_assert!(track_id.is_some());
6045
6046            let track = doc.get_track(track_id.unwrap());
6047            prop_assert!(track.is_some());
6048            prop_assert_eq!(track.unwrap().len(), signal_count);
6049        }
6050
6051        /// Empty signal list returns None for track creation
6052        #[test]
6053        fn create_track_empty_returns_none(_dummy in 0..1) {
6054            let mut doc = GroundedDocument::new("test", "test");
6055            let track_id = doc.create_track_from_signals("canonical", &[]);
6056            prop_assert!(track_id.is_none());
6057        }
6058    }
6059
6060    // -------------------------------------------------------------------------
6061    // Filtering Properties
6062    // -------------------------------------------------------------------------
6063
6064    proptest! {
6065        /// signals_in_range returns only signals within range
6066        #[test]
6067        fn signals_in_range_within_bounds(
6068            range_start in 0usize..100,
6069            range_len in 10usize..50,
6070        ) {
6071            let range_end = range_start + range_len;
6072            let mut doc = GroundedDocument::new("test", "x".repeat(200));
6073
6074            // Add signals: some inside, some outside
6075            doc.add_signal(Signal::new(0, Location::text(range_start + 2, range_start + 5), "inside", "T", 0.9));
6076            doc.add_signal(Signal::new(0, Location::text(0, 5), "before", "T", 0.9));
6077            doc.add_signal(Signal::new(0, Location::text(190, 195), "after", "T", 0.9));
6078
6079            let in_range = doc.signals_in_range(range_start, range_end);
6080
6081            for signal in &in_range {
6082                if let Some((start, end)) = signal.location.text_offsets() {
6083                    prop_assert!(start >= range_start, "Signal start {} < range start {}", start, range_start);
6084                    prop_assert!(end <= range_end, "Signal end {} > range end {}", end, range_end);
6085                }
6086            }
6087        }
6088
6089        /// overlapping_signals is symmetric: if A overlaps B, then B's overlaps includes A's location
6090        #[test]
6091        fn overlapping_signals_symmetric(
6092            start1 in 10usize..50,
6093            len1 in 5usize..20,
6094            start2 in 10usize..50,
6095            len2 in 5usize..20,
6096        ) {
6097            let mut doc = GroundedDocument::new("test", "x".repeat(100));
6098
6099            let loc1 = Location::text(start1, start1 + len1);
6100            let loc2 = Location::text(start2, start2 + len2);
6101
6102            doc.add_signal(Signal::new(0, loc1.clone(), "A", "T", 0.9));
6103            doc.add_signal(Signal::new(0, loc2.clone(), "B", "T", 0.9));
6104
6105            let overlaps_loc1 = doc.overlapping_signals(&loc1);
6106            let overlaps_loc2 = doc.overlapping_signals(&loc2);
6107
6108            // If loc1 overlaps loc2, both should find each other
6109            if loc1.overlaps(&loc2) {
6110                prop_assert!(overlaps_loc1.len() >= 2, "Should find both when overlapping");
6111                prop_assert!(overlaps_loc2.len() >= 2, "Should find both when overlapping");
6112            }
6113        }
6114    }
6115
6116    // -------------------------------------------------------------------------
6117    // Invariant: Modality count consistency
6118    // -------------------------------------------------------------------------
6119
6120    proptest! {
6121        /// Sum of modality counts equals total signal count
6122        #[test]
6123        fn modality_counts_sum_to_total(
6124            symbolic_count in 0usize..5,
6125            iconic_count in 0usize..5,
6126        ) {
6127            let mut doc = GroundedDocument::new("test", "test");
6128
6129            // Add symbolic signals
6130            for i in 0..symbolic_count {
6131                let mut signal = Signal::new(
6132                    0,
6133                    Location::text(i * 10, i * 10 + 5),
6134                    "entity",
6135                    "Type",
6136                    0.9,
6137                );
6138                signal.modality = Modality::Symbolic;
6139                doc.add_signal(signal);
6140            }
6141
6142            // Add iconic signals
6143            for i in 0..iconic_count {
6144                let mut signal = Signal::new(
6145                    0,
6146                    Location::bbox(i as f32 * 0.1, 0.0, 0.05, 0.05),
6147                    "entity",
6148                    "Type",
6149                    0.9,
6150                );
6151                signal.modality = Modality::Iconic;
6152                doc.add_signal(signal);
6153            }
6154
6155            let stats = doc.stats();
6156            prop_assert_eq!(
6157                stats.symbolic_count + stats.iconic_count + stats.hybrid_count,
6158                stats.signal_count,
6159                "Modality counts should sum to total"
6160            );
6161        }
6162    }
6163
6164    // -------------------------------------------------------------------------
6165    // Invariant: Signal-Text Offset Consistency
6166    // -------------------------------------------------------------------------
6167
6168    proptest! {
6169        /// Signals created via from_text are always valid
6170        #[test]
6171        fn from_text_always_valid(
6172            text in "[a-zA-Z ]{20,100}",
6173            surface_start in 0usize..15,
6174            surface_len in 1usize..8,
6175        ) {
6176            let text_char_len = text.chars().count();
6177            let surface_end = (surface_start + surface_len).min(text_char_len);
6178            let surface_start = surface_start.min(surface_end.saturating_sub(1));
6179
6180            if surface_start < surface_end && surface_end <= text_char_len {
6181                let surface: String = text.chars()
6182                    .skip(surface_start)
6183                    .take(surface_end - surface_start)
6184                    .collect();
6185
6186                if !surface.is_empty() {
6187                    // from_text should find the surface and create a valid signal
6188                    if let Some(signal) = Signal::<Location>::from_text(&text, &surface, "Test", 0.9) {
6189                        // The created signal MUST be valid
6190                        prop_assert!(
6191                            signal.validate_against(&text).is_none(),
6192                            "Signal created via from_text must be valid"
6193                        );
6194                    }
6195                }
6196            }
6197        }
6198
6199        /// Validated add never allows invalid signals
6200        #[test]
6201        fn validated_add_rejects_invalid(
6202            text in "[a-z]{10,50}",
6203            wrong_surface in "[A-Z]{3,10}",
6204        ) {
6205            let mut doc = GroundedDocument::new("test", &text);
6206
6207            // Create a signal with offsets pointing to different text than surface
6208            let signal = Signal::new(
6209                0,
6210                Location::text(0, wrong_surface.chars().count().min(text.chars().count())),
6211                wrong_surface.clone(),
6212                "Test",
6213                0.9,
6214            );
6215
6216            // If text doesn't actually contain wrong_surface at offset 0,
6217            // validated add should reject it
6218            let expected: String = text.chars().take(wrong_surface.chars().count()).collect();
6219            if expected != wrong_surface {
6220                let result = doc.add_signal_validated(signal);
6221                prop_assert!(result.is_err(), "Should reject signal with mismatched surface");
6222            }
6223        }
6224
6225        /// Round-trip: add_signal_from_text creates retrievable signals
6226        #[test]
6227        fn round_trip_signal_from_text(
6228            prefix in "[a-z]{5,20}",
6229            entity in "[A-Z][a-z]{3,10}",
6230            suffix in "[a-z]{5,20}",
6231        ) {
6232            let text = format!("{} {} {}", prefix, entity, suffix);
6233            let mut doc = GroundedDocument::new("test", &text);
6234
6235            let id = doc.add_signal_from_text(&entity, "Entity", 0.9);
6236            prop_assert!(id.is_some(), "Should find entity in text");
6237
6238            let signal = doc.signals().iter().find(|s| s.id == id.unwrap());
6239            prop_assert!(signal.is_some(), "Should retrieve added signal");
6240
6241            let signal = signal.unwrap();
6242            prop_assert_eq!(signal.surface(), entity.as_str(), "Surface should match");
6243
6244            // Validation MUST pass
6245            prop_assert!(
6246                doc.is_valid(),
6247                "Document should be valid after from_text add"
6248            );
6249        }
6250
6251        /// Multiple occurrences: nth variant finds correct occurrence
6252        #[test]
6253        fn nth_occurrence_finds_correct(
6254            entity in "[A-Z][a-z]{2,5}",
6255            sep in " [a-z]+ ",
6256        ) {
6257            // Create text with multiple occurrences
6258            let text = format!("{}{}{}{}{}", entity, sep, entity, sep, entity);
6259            let mut doc = GroundedDocument::new("test", &text);
6260
6261            // Find each occurrence
6262            for n in 0..3 {
6263                let id = doc.add_signal_from_text_nth(&entity, "Entity", 0.9, n);
6264                prop_assert!(id.is_some(), "Should find occurrence {}", n);
6265            }
6266
6267            // 4th occurrence shouldn't exist
6268            let id = doc.add_signal_from_text_nth(&entity, "Entity", 0.9, 3);
6269            prop_assert!(id.is_none(), "Should NOT find 4th occurrence");
6270
6271            // All signals should be valid
6272            prop_assert!(doc.is_valid(), "All signals should be valid");
6273
6274            // Check offsets are distinct
6275            let offsets: Vec<_> = doc.signals()
6276                .iter()
6277                .filter_map(|s| s.text_offsets())
6278                .collect();
6279            let unique: std::collections::HashSet<_> = offsets.iter().collect();
6280            prop_assert_eq!(offsets.len(), unique.len(), "Each occurrence should have distinct offset");
6281        }
6282    }
6283
6284    // =========================================================================
6285    // TrackStats Tests
6286    // =========================================================================
6287
/// Stats for a two-mention track: chain length, surface variation, spread and
/// confidence aggregates.
#[test]
fn test_track_stats_basic() {
    let text = "John met Mary. He said hello. John left.";
    let text_len = text.chars().count();
    let mut doc = GroundedDocument::new("test", text);

    // The two "John" mentions, at offsets 0..4 and 30..34.
    let first_john = doc.add_signal(Signal::new(0, Location::text(0, 4), "John", "Person", 0.95));
    let second_john = doc.add_signal(Signal::new(
        0,
        Location::text(30, 34),
        "John",
        "Person",
        0.90,
    ));

    // A single track linking both mentions.
    let track_id = doc.add_track(Track::new(0, String::from("John")));
    doc.add_signal_to_track(first_john, track_id, 0);
    doc.add_signal_to_track(second_john, track_id, 1);

    let stats = doc
        .get_track(track_id)
        .unwrap()
        .compute_stats(&doc, text_len);

    assert_eq!(stats.chain_length, 2, "Two mentions");
    assert_eq!(stats.variation_count, 1, "One unique surface form");
    assert!(stats.spread > 0, "Spread should be positive");
    assert!(stats.relative_spread > 0.0 && stats.relative_spread < 1.0);
    assert!((stats.min_confidence - 0.90).abs() < 0.01);
    assert!((stats.max_confidence - 0.95).abs() < 0.01);
    assert!((stats.mean_confidence - 0.925).abs() < 0.01);
}
6321
/// Stats for a one-mention track: zero spread, identical first and last
/// position, and equal min/max confidence.
#[test]
fn test_track_stats_singleton() {
    let text = "Paris is beautiful.";
    let text_len = text.chars().count();
    let mut doc = GroundedDocument::new("test", text);

    let paris = Signal::new(0, Location::text(0, 5), "Paris", "Location", 0.88);
    let signal_id = doc.add_signal(paris);

    let track_id = doc.add_track(Track::new(0, String::from("Paris")));
    doc.add_signal_to_track(signal_id, track_id, 0);

    let stats = doc
        .get_track(track_id)
        .unwrap()
        .compute_stats(&doc, text_len);

    assert_eq!(stats.chain_length, 1);
    assert_eq!(stats.spread, 0, "Singleton has zero spread");
    assert_eq!(stats.first_position, stats.last_position);
    assert!((stats.min_confidence - stats.max_confidence).abs() < 0.001);
}
6346}