Skip to main content

anno_core/core/
grounded.rs

1//! Grounded entity representation with unified Signal → Track → Identity hierarchy.
2//!
3//! # Research Motivation
4//!
5//! Note: `anno` is text-first. The broad `Location` substrate here is intentionally future-facing.
6//! See `docs/LOCATION.md` in the repo for the philosophy and practical guidance.
7//!
8//! Traditional NER systems conflate three distinct levels of entity processing:
9//!
10//! 1. **Signal Detection** (Level 1): "There's something here" - localization + classification
11//! 2. **Track Formation** (Level 2): "These mentions are the same entity within this document"
12//! 3. **Identity Resolution** (Level 3): "This entity is Q7186 in Wikidata"
13//!
14//! This conflation causes issues:
15//! - Embedding models struggle when a single `Entity` type represents both mentions and KB entries
16//! - Cross-document coreference requires different similarity metrics than within-document
17//! - The "modal gap" between text spans and KB entities creates representation mismatches
18//!
19//! # The Isomorphism: Vision Detection ↔ NER
20//!
21//! ```text
22//! ┌─────────────────────────────────────────────────────────────────────────┐
23//! │                    VISION                    TEXT (NER)                 │
24//! ├─────────────────────────────────────────────────────────────────────────┤
25//! │ Localization Unit  │ BoundingBox (x,y,w,h)  │ TextSpan (start,end)     │
26//! │ Signal             │ Detection              │ Mention                  │
27//! │ Track (Level 2)    │ Tracklet (MOT)         │ CorefChain              │
28//! │ Identity (Level 3) │ Face Recognition       │ Entity Linking          │
29//! │ Region Proposal    │ RPN / DETR queries     │ Span enumeration        │
30//! │ Modality           │ Iconic (physics)       │ Symbolic (convention)   │
31//! └─────────────────────────────────────────────────────────────────────────┘
32//! ```
33//!
34//! The key insight: **detection is modality-agnostic**. Whether detecting "Steve Jobs"
35//! in text or a face in an image, the fundamental operation is:
36//!
37//! ```text
38//! Detection = Localization (where?) × Classification (what?)
39//! ```
40//!
41//! # Semiotic Gap: Icon vs Symbol
42//!
43//! A crucial nuance distinguishes text from vision:
44//!
45//! - **Iconic signs** (vision): The signifier physically resembles the signified.
46//!   A photo of a cat looks like a cat. Detection is about physics/geometry.
47//!
48//! - **Symbolic signs** (text): The signifier is arbitrary convention.
49//!   "cat" doesn't look like a cat. Detection requires learning cultural codes.
50//!
51//! This explains why text NER requires more sophisticated linguistic features
52//! (negation, quantification, recursion) that have no visual analogue.
53//!
54//! # Architecture: Entity-Centric Representation
55//!
56//! ```text
57//! ┌─────────────────────────────────────────────────────────────────────────┐
58//! │                      GroundedDocument                                   │
59//! ├─────────────────────────────────────────────────────────────────────────┤
60//! │                                                                         │
61//! │  identities: HashMap<IdentityId, Identity>                              │
62//! │       │                                                                 │
63//! │       └──► Identity { kb_id, canonical_name, embedding, ... }           │
64//! │                 │                                                       │
65//! │  tracks: HashMap<TrackId, Track<S>>                                     │
66//! │       │                                                                 │
67//! │       └──► Track { identity_id, signals: Vec<SignalRef>, ... }          │
68//! │                 │                                                       │
69//! │  signals: Vec<Signal<S>>                                                │
70//! │       │                                                                 │
71//! │       └──► Signal { location: S, label, confidence, ... }               │
72//! │                                                                         │
73//! └─────────────────────────────────────────────────────────────────────────┘
74//! ```
75//!
76//! This entity-centric design enables:
77//! - Efficient streaming/incremental coreference (signals → tracks incrementally)
78//! - Clear separation of detection, clustering, and linking
79//! - Unified treatment of text and visual signals
80//!
81//! # References
82//!
83//! - GLiNER: Bi-encoder span-label matching for zero-shot NER
84//! - DETR: End-to-end object detection with transformers
85//! - Pix2Seq: "Everything is a token" - bounding boxes as spatial tokens
86//! - CDLKT: Cross-document Language-Knowledge Transfer
87//! - Groma: Grounded multimodal assistant
88
89use super::confidence::Confidence;
90use super::entity::{
91    DiscontinuousSpan, Entity, EntityType, HierarchicalConfidence, Provenance, Span,
92};
93use serde::{Deserialize, Serialize};
94use std::collections::HashMap;
95
96// =============================================================================
97// Modality: The Semiotic Distinction
98// =============================================================================
99
100/// The semiotic modality of a signal source.
101///
102/// This captures a fundamental distinction in how meaning is encoded:
103///
104/// - **Iconic**: Physical resemblance (photos, audio waveforms)
105/// - **Symbolic**: Arbitrary convention (text, notation)
106/// - **Indexical**: Causal connection (smoke → fire, but rare in our domain)
107///
108/// # Why This Matters
109///
110/// The modality affects what linguistic features are relevant:
111///
112/// | Feature | Iconic (Vision) | Symbolic (Text) |
113/// |---------|-----------------|-----------------|
114/// | Negation | No analogue | "not a doctor" |
115/// | Quantification | Approximate | "every/some/no" |
116/// | Recursion | Rare | Nested NPs |
117/// | Compositionality | Limited | Full |
118///
119/// Detection in iconic modalities is more about geometry and physics.
120/// Detection in symbolic modalities requires cultural/linguistic knowledge.
121#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
122pub enum Modality {
123    /// Iconic sign: signifier resembles signified (images, audio).
124    /// Detection is primarily geometric/physical.
125    Iconic,
126    /// Symbolic sign: arbitrary convention (text, notation).
127    /// Detection requires linguistic/cultural knowledge.
128    #[default]
129    Symbolic,
130    /// Hybrid: OCR text in images, captions, etc.
131    /// Has both iconic (visual layout) and symbolic (text content) aspects.
132    Hybrid,
133}
134
135// =============================================================================
136// Location: The Universal Localization Unit
137// =============================================================================
138
139/// A location in text.
140///
141/// Two variants:
142/// - `Text`: contiguous character span `[start, end)`
143/// - `Discontinuous`: non-contiguous character regions
144///
145/// Use [`to_span()`](Self::to_span) to convert `Text` to [`entity::Span`].
146///
147/// [`entity::Span`]: super::entity::Span
148#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
149pub enum Location {
150    /// Text span: 1D interval [start, end) in character offsets.
151    Text {
152        /// Start character offset (inclusive)
153        start: usize,
154        /// End character offset (exclusive)
155        end: usize,
156    },
157    /// Discontinuous text span: non-contiguous regions.
158    Discontinuous {
159        /// Multiple text intervals
160        segments: Vec<(usize, usize)>,
161    },
162}
163
164impl Location {
165    /// Create a text location.
166    #[must_use]
167    pub const fn text(start: usize, end: usize) -> Self {
168        Self::Text { start, end }
169    }
170
171    /// Get the modality of this location.
172    #[must_use]
173    pub const fn modality(&self) -> Modality {
174        match self {
175            Self::Text { .. } | Self::Discontinuous { .. } => Modality::Symbolic,
176        }
177    }
178
179    /// Get text offsets if this is a text location.
180    #[must_use]
181    pub fn text_offsets(&self) -> Option<(usize, usize)> {
182        match self {
183            Self::Text { start, end } => Some((*start, *end)),
184            Self::Discontinuous { segments } => {
185                let start = segments.iter().map(|(s, _)| *s).min()?;
186                let end = segments.iter().map(|(_, e)| *e).max()?;
187                Some((start, end))
188            }
189        }
190    }
191
192    /// Check if two locations overlap.
193    #[must_use]
194    pub fn overlaps(&self, other: &Self) -> bool {
195        match (self, other) {
196            (Self::Text { start: s1, end: e1 }, Self::Text { start: s2, end: e2 }) => {
197                s1 < e2 && s2 < e1
198            }
199            _ => false, // Different types don't overlap
200        }
201    }
202
203    /// Calculate IoU (Intersection over Union) for compatible location types.
204    ///
205    /// Returns None if the locations are incompatible (e.g., text vs bbox).
206    #[must_use]
207    pub fn iou(&self, other: &Self) -> Option<f64> {
208        match (self, other) {
209            (Self::Text { start: s1, end: e1 }, Self::Text { start: s2, end: e2 }) => {
210                let intersection_start = (*s1).max(*s2);
211                let intersection_end = (*e1).min(*e2);
212                if intersection_start >= intersection_end {
213                    return Some(0.0);
214                }
215                let intersection = (intersection_end - intersection_start) as f64;
216                let union = ((*e1).max(*e2) - (*s1).min(*s2)) as f64;
217                if union == 0.0 {
218                    Some(0.0)
219                } else {
220                    Some(intersection / union)
221                }
222            }
223            _ => None,
224        }
225    }
226}
227
228impl Default for Location {
229    fn default() -> Self {
230        Self::Text { start: 0, end: 0 }
231    }
232}
233
234impl From<&Span> for Location {
235    fn from(span: &Span) -> Self {
236        match span {
237            Span::Text { start, end } => Self::Text {
238                start: *start,
239                end: *end,
240            },
241            // BoundingBox and Hybrid spans have no Location equivalent;
242            // extract text offsets where available, otherwise default.
243            Span::BoundingBox { .. } => Self::Text { start: 0, end: 0 },
244            Span::Hybrid { start, end, .. } => Self::Text {
245                start: *start,
246                end: *end,
247            },
248        }
249    }
250}
251
252impl From<Span> for Location {
253    fn from(span: Span) -> Self {
254        Self::from(&span)
255    }
256}
257
258/// Convert `Location` to `Span` where possible.
259///
260/// - `Location::Text` -> `Span::Text`
261/// - `Location::Discontinuous` -> `None` (use `DiscontinuousSpan` instead)
262impl Location {
263    /// Try to convert this Location to a Span.
264    ///
265    /// Returns `None` for `Location::Discontinuous`.
266    #[must_use]
267    pub fn to_span(&self) -> Option<Span> {
268        match self {
269            Self::Text { start, end } => Some(Span::Text {
270                start: *start,
271                end: *end,
272            }),
273            Self::Discontinuous { .. } => None,
274        }
275    }
276}
277
278// =============================================================================
279// Signal (Level 1): Raw Detection
280// =============================================================================
281
282// SignalId is now a newtype in super::types::ids for type safety
283pub use super::types::SignalId;
284
285/// A raw detection signal: the atomic unit of entity extraction.
286///
287/// # The Detection Equation
288///
289/// Every signal is the product of two factors:
290///
291/// ```text
292/// Signal = Localization × Classification
293///        = "where is it?" × "what is it?"
294/// ```
295///
296/// This is true whether detecting faces in images, named entities in text,
297/// or objects in LiDAR point clouds.
298///
299/// # Design Philosophy
300///
301/// Signals are intentionally minimal. They capture:
302/// 1. **Where**: Location in the source medium
303/// 2. **What**: Classification label + confidence
304/// 3. **Provenance**: How it was detected
305///
306/// What they explicitly do NOT capture:
307/// - Coreference relationships (→ Track)
308/// - Knowledge base links (→ Identity)
309/// - Semantic embeddings (computed lazily if needed)
310///
311/// This separation enables efficient streaming pipelines where signals
312/// are produced incrementally and consumed by downstream track/identity
313/// formation without blocking.
314#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
315pub struct Signal<L = Location> {
316    /// Unique identifier within the document
317    pub id: SignalId,
318    /// Location in the source medium
319    pub location: L,
320    /// Surface form (the actual text or image patch)
321    pub surface: String,
322    /// Classification label (e.g., "Person", "Organization", "PER").
323    ///
324    /// Stored as a `TypeLabel` to support both core taxonomy types and domain-specific labels.
325    pub label: super::types::TypeLabel,
326    /// Detection confidence in [0, 1]
327    pub confidence: Confidence,
328    /// Hierarchical confidence if available (linkage/type/boundary)
329    pub hierarchical: Option<HierarchicalConfidence>,
330    /// Provenance: which detector produced this signal
331    pub provenance: Option<Provenance>,
332    /// Semiotic modality (derived from location, but can be overridden)
333    pub modality: Modality,
334    /// Normalized form (e.g., "Jan 15" → "2024-01-15")
335    pub normalized: Option<String>,
336    /// Whether this signal is negated (e.g., "not a doctor")
337    pub negated: bool,
338    /// Quantification if applicable (e.g., "every employee")
339    pub quantifier: Option<Quantifier>,
340}
341
342/// Quantification type for symbolic signals.
343///
344/// Only meaningful for text/symbolic modality where linguistic
345/// quantification is possible. Has no visual analogue.
346#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
347#[non_exhaustive]
348pub enum Quantifier {
349    /// Universal: "every", "all", "each"
350    Universal,
351    /// Existential: "some", "a", "certain"
352    Existential,
353    /// Negation: "no", "none"
354    None,
355    /// Specific: definite reference ("the")
356    Definite,
357    /// Approximate: "approximately", "about", "roughly"
358    Approximate,
359    /// Lower bound: "at least", "no fewer than"
360    MinBound,
361    /// Upper bound: "at most", "no more than", "up to"
362    MaxBound,
363    /// Bare: no explicit quantifier
364    Bare,
365}
366
367impl<L> Signal<L> {
368    /// Create a new signal.
369    ///
370    /// # Arguments
371    ///
372    /// * `id` - Unique identifier (will be overwritten when added to a document)
373    /// * `location` - Where this signal was detected
374    /// * `surface` - The actual text/content of the detection
375    /// * `label` - Classification label (e.g., "Person", "Organization")
376    /// * `confidence` - Detection confidence in `[0, 1]`
377    #[must_use]
378    pub fn new(
379        id: impl Into<SignalId>,
380        location: L,
381        surface: impl Into<String>,
382        label: impl Into<super::types::TypeLabel>,
383        confidence: f32,
384    ) -> Self {
385        Self {
386            id: id.into(),
387            location,
388            surface: surface.into(),
389            label: label.into(),
390            confidence: Confidence::new(confidence as f64),
391            hierarchical: None,
392            provenance: None,
393            modality: Modality::default(),
394            normalized: None,
395            negated: false,
396            quantifier: None,
397        }
398    }
399
400    /// Get the classification label as a string.
401    #[must_use]
402    pub fn label(&self) -> &str {
403        self.label.as_str()
404    }
405
406    /// Get the classification label as a type-safe `TypeLabel`.
407    #[must_use]
408    pub fn type_label(&self) -> super::types::TypeLabel {
409        self.label.clone()
410    }
411
412    /// Get the surface form.
413    #[must_use]
414    pub fn surface(&self) -> &str {
415        &self.surface
416    }
417
418    /// Check if this signal is above a confidence threshold.
419    #[must_use]
420    pub fn is_confident(&self, threshold: Confidence) -> bool {
421        self.confidence >= threshold
422    }
423
424    /// Set the modality.
425    #[must_use]
426    pub fn with_modality(mut self, modality: Modality) -> Self {
427        self.modality = modality;
428        self
429    }
430
431    /// Mark as negated.
432    #[must_use]
433    pub fn negated(mut self) -> Self {
434        self.negated = true;
435        self
436    }
437
438    /// Set quantifier.
439    #[must_use]
440    pub fn with_quantifier(mut self, q: Quantifier) -> Self {
441        self.quantifier = Some(q);
442        self
443    }
444
445    /// Set provenance.
446    #[must_use]
447    pub fn with_provenance(mut self, p: Provenance) -> Self {
448        self.provenance = Some(p);
449        self
450    }
451}
452
453impl Signal<Location> {
454    /// Get text offsets if this is a text signal.
455    #[must_use]
456    pub fn text_offsets(&self) -> Option<(usize, usize)> {
457        self.location.text_offsets()
458    }
459
460    /// Validate that this signal's location matches its surface text.
461    ///
462    /// Returns `None` if valid, or a description of the mismatch.
463    ///
464    /// # Example
465    ///
466    /// ```rust
467    /// use anno_core::{Signal, Location};
468    ///
469    /// let text = "Lynn Conway worked at IBM.";
470    /// let good = Signal::new(0, Location::text(0, 11), "Lynn Conway", "PER", 0.9);
471    /// assert!(good.validate_against(text).is_none());
472    ///
473    /// let bad = Signal::new(0, Location::text(0, 5), "Lynn Conway", "PER", 0.9);
474    /// assert!(bad.validate_against(text).is_some());
475    /// ```
476    #[must_use]
477    pub fn validate_against(&self, source_text: &str) -> Option<SignalValidationError> {
478        let (start, end) = self.location.text_offsets()?;
479
480        let char_count = source_text.chars().count();
481
482        // Check bounds
483        if end > char_count {
484            return Some(SignalValidationError::OutOfBounds {
485                signal_id: self.id,
486                end,
487                text_len: char_count,
488            });
489        }
490
491        if start >= end {
492            return Some(SignalValidationError::InvalidSpan {
493                signal_id: self.id,
494                start,
495                end,
496            });
497        }
498
499        // Extract actual text at offsets
500        let actual: String = source_text.chars().skip(start).take(end - start).collect();
501
502        if actual != self.surface {
503            return Some(SignalValidationError::TextMismatch {
504                signal_id: self.id,
505                expected: self.surface.clone(),
506                actual,
507                start,
508                end,
509            });
510        }
511
512        None
513    }
514
515    /// Check if this signal is valid against the given source text.
516    #[must_use]
517    pub fn is_valid(&self, source_text: &str) -> bool {
518        self.validate_against(source_text).is_none()
519    }
520
521    /// Create a signal by finding text in source (safe construction).
522    ///
523    /// Returns `None` if the surface text is not found in source.
524    ///
525    /// # Example
526    ///
527    /// ```rust
528    /// use anno_core::{Signal, Location};
529    ///
530    /// let text = "Lynn Conway worked at IBM.";
531    /// let signal = Signal::<Location>::from_text(text, "Lynn Conway", "PER", 0.95);
532    /// assert!(signal.is_some());
533    /// assert_eq!(signal.expect("signal should exist").text_offsets(), Some((0, 11)));
534    /// ```
535    #[must_use]
536    pub fn from_text(
537        source: &str,
538        surface: &str,
539        label: impl Into<super::types::TypeLabel>,
540        confidence: f32,
541    ) -> Option<Self> {
542        Self::from_text_nth(source, surface, label, confidence, 0)
543    }
544
545    /// Create a signal by finding the nth occurrence of text in source.
546    #[must_use]
547    pub fn from_text_nth(
548        source: &str,
549        surface: &str,
550        label: impl Into<super::types::TypeLabel>,
551        confidence: f32,
552        occurrence: usize,
553    ) -> Option<Self> {
554        // Find nth occurrence using char offsets
555        for (count, (byte_idx, _)) in source.match_indices(surface).enumerate() {
556            if count == occurrence {
557                // Convert byte offset to char offset
558                let start = source[..byte_idx].chars().count();
559                let end = start + surface.chars().count();
560
561                return Some(Self::new(
562                    SignalId::ZERO,
563                    Location::text(start, end),
564                    surface,
565                    label,
566                    confidence,
567                ));
568            }
569        }
570
571        None
572    }
573}
574
575/// Validation error for a signal.
576#[derive(Debug, Clone, PartialEq)]
577pub enum SignalValidationError {
578    /// Signal's end offset exceeds text length.
579    OutOfBounds {
580        /// Signal ID
581        signal_id: SignalId,
582        /// End offset that exceeds text
583        end: usize,
584        /// Actual text length in chars
585        text_len: usize,
586    },
587    /// Signal has invalid span (start >= end).
588    InvalidSpan {
589        /// Signal ID
590        signal_id: SignalId,
591        /// Start offset
592        start: usize,
593        /// End offset
594        end: usize,
595    },
596    /// Signal's surface text doesn't match text at offsets.
597    TextMismatch {
598        /// Signal ID
599        signal_id: SignalId,
600        /// Surface text stored in signal
601        expected: String,
602        /// Actual text found at offsets
603        actual: String,
604        /// Start offset
605        start: usize,
606        /// End offset
607        end: usize,
608    },
609}
610
611impl std::fmt::Display for SignalValidationError {
612    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
613        match self {
614            Self::OutOfBounds {
615                signal_id,
616                end,
617                text_len,
618            } => {
619                write!(
620                    f,
621                    "S{}: end offset {} exceeds text length {}",
622                    signal_id, end, text_len
623                )
624            }
625            Self::InvalidSpan {
626                signal_id,
627                start,
628                end,
629            } => {
630                write!(f, "S{}: invalid span [{}, {})", signal_id, start, end)
631            }
632            Self::TextMismatch {
633                signal_id,
634                expected,
635                actual,
636                start,
637                end,
638            } => {
639                write!(
640                    f,
641                    "S{}: text mismatch at [{}, {}): expected '{}', found '{}'",
642                    signal_id, start, end, expected, actual
643                )
644            }
645        }
646    }
647}
648
649impl std::error::Error for SignalValidationError {}
650
651/// Convert an [`Entity`] to a [`Signal<Location>`].
652///
653/// Uses `Location::Text` for the span and preserves `normalized`, `provenance`,
654/// and `hierarchical_confidence` fields. Discontinuous and visual spans are not
655/// handled; use [`GroundedDocument::from_entities`] for full fidelity.
656impl From<&Entity> for Signal<Location> {
657    fn from(e: &Entity) -> Self {
658        let mut signal = Signal::new(
659            SignalId::ZERO,
660            Location::text(e.start(), e.end()),
661            &e.text,
662            e.entity_type.as_label(),
663            f32::from(e.confidence),
664        );
665        signal.normalized = e.normalized.clone();
666        signal.provenance = e.provenance.clone();
667        signal.hierarchical = e.hierarchical_confidence;
668        signal
669    }
670}
671
672// =============================================================================
673// Track (Level 2): Within-Document Coreference
674// =============================================================================
675
676// TrackId is now a newtype in super::types::ids for type safety
677pub use super::types::TrackId;
678
679/// A reference to a signal within a track.
680#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
681pub struct SignalRef {
682    /// Signal ID
683    pub signal_id: SignalId,
684    /// Position in document order (for antecedent relationships)
685    pub position: u32,
686}
687
688/// A reference to a track in a specific document.
689///
690/// Used for cross-document operations where we need to reference
691/// tracks without copying them. This enables efficient inter-document
692/// coreference resolution.
693#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
694pub struct TrackRef {
695    /// Document ID containing the track
696    pub doc_id: String,
697    /// Track ID within that document
698    pub track_id: TrackId,
699}
700
701/// A track: a cluster of signals referring to the same entity within a document.
702///
703/// # Terminology Mapping
704///
705/// | Vision | NLP |
706/// |--------|-----|
707/// | Tracklet | CorefChain |
708/// | Object track | Entity cluster |
709/// | Re-identification | Coreference resolution |
710///
711/// # Design Philosophy
712///
713/// Tracks are the bridge between raw signals and global identities.
714/// They answer: "which signals in THIS document refer to the same entity?"
715///
716/// Key properties:
717/// - **Document-scoped**: A track only exists within one document
718/// - **Homogeneous type**: All signals in a track should have compatible types
719/// - **Representative**: The track has a "canonical" signal (usually the first proper mention)
720#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
721pub struct Track {
722    /// Unique identifier within the document
723    pub id: TrackId,
724    /// Signal references in this track (document order)
725    pub signals: Vec<SignalRef>,
726    /// Entity type (consensus from signals).
727    ///
728    /// This is a `TypeLabel` to support both core taxonomy types and domain-specific labels.
729    pub entity_type: Option<super::types::TypeLabel>,
730    /// Canonical surface form (the "best" name for this entity)
731    pub canonical_surface: String,
732    /// Link to global identity (Level 3), if resolved
733    pub identity_id: Option<IdentityId>,
734    /// Confidence that signals are correctly clustered
735    pub cluster_confidence: Confidence,
736    /// Optional embedding for track-level representation
737    /// (aggregated from signal embeddings)
738    pub embedding: Option<Vec<f32>>,
739}
740
741impl Track {
742    /// Create a new track.
743    #[must_use]
744    pub fn new(id: impl Into<TrackId>, canonical_surface: impl Into<String>) -> Self {
745        Self {
746            id: id.into(),
747            signals: Vec::new(),
748            entity_type: None,
749            canonical_surface: canonical_surface.into(),
750            identity_id: None,
751            cluster_confidence: Confidence::ONE,
752            embedding: None,
753        }
754    }
755
756    /// Add a signal to this track.
757    pub fn add_signal(&mut self, signal_id: impl Into<SignalId>, position: u32) {
758        let signal_id = signal_id.into();
759        self.signals.push(SignalRef {
760            signal_id,
761            position,
762        });
763    }
764
765    /// Get the number of mentions in this track.
766    #[must_use]
767    pub fn len(&self) -> usize {
768        self.signals.len()
769    }
770
771    /// Check if this track is empty.
772    #[must_use]
773    pub fn is_empty(&self) -> bool {
774        self.signals.is_empty()
775    }
776
777    /// Check if this is a singleton (single mention).
778    #[must_use]
779    pub fn is_singleton(&self) -> bool {
780        self.signals.len() == 1
781    }
782
783    /// Get the track's unique identifier.
784    #[must_use]
785    pub const fn id(&self) -> TrackId {
786        self.id
787    }
788
789    /// Get the signal references in this track.
790    #[must_use]
791    pub fn signals(&self) -> &[SignalRef] {
792        &self.signals
793    }
794
795    /// Get the canonical surface form.
796    #[must_use]
797    pub fn canonical_surface(&self) -> &str {
798        &self.canonical_surface
799    }
800
801    /// Get the linked identity ID, if any.
802    #[must_use]
803    pub const fn identity_id(&self) -> Option<IdentityId> {
804        self.identity_id
805    }
806
807    /// Get the cluster confidence score.
808    #[must_use]
809    pub const fn cluster_confidence(&self) -> Confidence {
810        self.cluster_confidence
811    }
812
813    /// Set the cluster confidence score.
814    pub fn set_cluster_confidence(&mut self, confidence: f32) {
815        self.cluster_confidence = Confidence::new(confidence as f64);
816    }
817
818    /// Link this track to a global identity (mutable setter).
819    pub fn set_identity_id(&mut self, identity_id: IdentityId) {
820        self.identity_id = Some(identity_id);
821    }
822
823    /// Unlink this track from its identity.
824    pub fn clear_identity_id(&mut self) {
825        self.identity_id = None;
826    }
827
828    /// Link this track to a global identity.
829    #[must_use]
830    pub fn with_identity(mut self, identity_id: IdentityId) -> Self {
831        self.identity_id = Some(identity_id);
832        self
833    }
834
835    /// Set the entity type from a string.
836    ///
837    /// For new code, prefer [`Self::with_type_label`] which provides type safety.
838    #[must_use]
839    pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
840        let s = entity_type.into();
841        self.entity_type = Some(super::types::TypeLabel::from(s.as_str()));
842        self
843    }
844
845    /// Set the entity type using a type-safe label.
846    ///
847    /// This is the preferred method for new code as it provides type safety
848    /// and integrates with the core `EntityType` taxonomy.
849    ///
850    /// # Example
851    ///
852    /// ```rust
853    /// use anno_core::{Track, TypeLabel, EntityType};
854    ///
855    /// let track = Track::new(0, "Marie Curie")
856    ///     .with_type_label(TypeLabel::Core(EntityType::Person));
857    /// ```
858    #[must_use]
859    pub fn with_type_label(mut self, label: super::types::TypeLabel) -> Self {
860        self.entity_type = Some(label);
861        self
862    }
863
864    /// Get the entity type as a type-safe label.
865    ///
866    /// This converts the internal string representation to a `TypeLabel`,
867    /// attempting to parse it as a core `EntityType` first.
868    #[must_use]
869    pub fn type_label(&self) -> Option<super::types::TypeLabel> {
870        self.entity_type.clone()
871    }
872
873    /// Set the embedding for this track.
874    #[must_use]
875    pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
876        self.embedding = Some(embedding);
877        self
878    }
879
880    /// Get the spread (distance from first to last mention).
881    ///
882    /// Requires document to resolve signal positions.
883    pub fn compute_spread(&self, doc: &GroundedDocument) -> Option<usize> {
884        if self.signals.is_empty() {
885            return Some(0);
886        }
887
888        let positions: Vec<usize> = self
889            .signals
890            .iter()
891            .filter_map(|sr| {
892                doc.signals
893                    .iter()
894                    .find(|s| s.id == sr.signal_id)
895                    .and_then(|s| s.location.text_offsets())
896                    .map(|(start, _)| start)
897            })
898            .collect();
899
900        if positions.is_empty() {
901            return None;
902        }
903
904        let min_pos = *positions.iter().min().expect("positions non-empty");
905        let max_pos = *positions.iter().max().expect("positions non-empty");
906        Some(max_pos.saturating_sub(min_pos))
907    }
908
909    /// Collect all surface form variations from signals.
910    ///
911    /// Requires document to resolve signal surfaces.
912    pub fn collect_variations(&self, doc: &GroundedDocument) -> Vec<String> {
913        let mut variations: std::collections::HashSet<String> = std::collections::HashSet::new();
914
915        for sr in &self.signals {
916            if let Some(signal) = doc.signals.iter().find(|s| s.id == sr.signal_id) {
917                variations.insert(signal.surface.clone());
918            }
919        }
920
921        variations.into_iter().collect()
922    }
923
924    /// Get confidence statistics across all signals.
925    ///
926    /// Returns (min, max, mean) confidence values.
927    pub fn confidence_stats(&self, doc: &GroundedDocument) -> Option<(f32, f32, f32)> {
928        let confidences: Vec<f32> = self
929            .signals
930            .iter()
931            .filter_map(|sr| {
932                doc.signals
933                    .iter()
934                    .find(|s| s.id == sr.signal_id)
935                    .map(|s| s.confidence.value() as f32)
936            })
937            .collect();
938
939        if confidences.is_empty() {
940            return None;
941        }
942
943        let min = confidences.iter().cloned().fold(f32::INFINITY, f32::min);
944        let max = confidences
945            .iter()
946            .cloned()
947            .fold(f32::NEG_INFINITY, f32::max);
948        let mean = confidences.iter().sum::<f32>() / confidences.len() as f32;
949
950        Some((min, max, mean))
951    }
952
953    /// Compute aggregate statistics for this track.
954    ///
955    /// Returns a `TrackStats` struct with comprehensive aggregate features.
956    pub fn compute_stats(&self, doc: &GroundedDocument, text_len: usize) -> TrackStats {
957        let chain_length = self.signals.len();
958        let spread = self.compute_spread(doc).unwrap_or(0);
959        let variations = self.collect_variations(doc);
960        let (min_conf, max_conf, mean_conf) = self.confidence_stats(doc).unwrap_or((0.0, 0.0, 0.0));
961
962        // Compute first/last positions
963        let positions: Vec<usize> = self
964            .signals
965            .iter()
966            .filter_map(|sr| {
967                doc.signals
968                    .iter()
969                    .find(|s| s.id == sr.signal_id)
970                    .and_then(|s| s.location.text_offsets())
971                    .map(|(start, _)| start)
972            })
973            .collect();
974
975        let first_position = positions.iter().min().copied().unwrap_or(0);
976        let last_position = positions.iter().max().copied().unwrap_or(0);
977        let relative_spread = if text_len > 0 {
978            spread as f64 / text_len as f64
979        } else {
980            0.0
981        };
982
983        TrackStats {
984            chain_length,
985            variation_count: variations.len(),
986            variations,
987            spread,
988            relative_spread,
989            first_position,
990            last_position,
991            min_confidence: Confidence::new(min_conf as f64),
992            max_confidence: Confidence::new(max_conf as f64),
993            mean_confidence: Confidence::new(mean_conf as f64),
994            has_embedding: self.embedding.is_some(),
995        }
996    }
997}
998
999/// Aggregate statistics for a track (coreference chain).
1000#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
1001pub struct TrackStats {
1002    /// Number of mentions in the track.
1003    pub chain_length: usize,
1004    /// Number of unique surface form variations.
1005    pub variation_count: usize,
1006    /// All surface form variations.
1007    pub variations: Vec<String>,
1008    /// Spread in characters (first to last mention).
1009    pub spread: usize,
1010    /// Spread as fraction of document length.
1011    pub relative_spread: f64,
1012    /// Position of first mention.
1013    pub first_position: usize,
1014    /// Position of last mention.
1015    pub last_position: usize,
1016    /// Minimum confidence across mentions.
1017    pub min_confidence: Confidence,
1018    /// Maximum confidence across mentions.
1019    pub max_confidence: Confidence,
1020    /// Mean confidence across mentions.
1021    pub mean_confidence: Confidence,
1022    /// Whether this track has an embedding.
1023    pub has_embedding: bool,
1024}
1025
1026// =============================================================================
1027// Identity (Level 3): Cross-Document Entity Linking
1028// =============================================================================
1029
1030// IdentityId is now a newtype in super::types::ids for type safety
1031pub use super::types::IdentityId;
1032
1033/// Source of identity formation.
1034///
1035/// Tracks how an identity was created, which affects how it should be
1036/// used and what operations are valid on it.
1037#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
1038pub enum IdentitySource {
1039    /// Created from cross-document track clustering (inter-doc coref).
1040    /// No KB link yet - this is pure clustering.
1041    CrossDocCoref {
1042        /// Tracks that were clustered to form this identity
1043        track_refs: Vec<TrackRef>,
1044    },
1045    /// Linked from knowledge base (entity linking/NED).
1046    /// Single track or identity linked to KB.
1047    KnowledgeBase {
1048        /// Knowledge base name (e.g., "wikidata")
1049        kb_name: String,
1050        /// Knowledge base ID (e.g., "Q7186")
1051        kb_id: String,
1052    },
1053    /// Both: clustered from tracks AND linked to KB.
1054    /// This is the most complete identity.
1055    Hybrid {
1056        /// Tracks that were clustered
1057        track_refs: Vec<TrackRef>,
1058        /// Knowledge base name
1059        kb_name: String,
1060        /// Knowledge base ID
1061        kb_id: String,
1062    },
1063}
1064
1065/// A global identity: a real-world entity linked to a knowledge base.
1066///
1067/// # The Modal Gap
1068///
1069/// There's a fundamental representational gap between:
1070/// - **Text mentions**: Contextual, variable surface forms ("Marie Curie", "she", "the scientist")
1071/// - **KB entities**: Canonical, static representations (Q7186 in Wikidata)
1072///
1073/// Bridging this gap requires:
1074/// 1. Learning aligned embeddings (text encoder ↔ KB encoder)
1075/// 2. Type consistency constraints
1076/// 3. Cross-encoder re-ranking for hard cases
1077///
1078/// # Design Philosophy
1079///
1080/// Identities are the "global truth" that tracks point to. They represent:
1081/// - A canonical name and description
1082/// - A knowledge base reference (if available)
1083/// - An embedding in the entity space (for similarity/clustering)
1084///
1085/// Identities can exist without KB links (for novel entities not in the KB).
1086#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
1087pub struct Identity {
1088    /// Unique identifier
1089    pub id: IdentityId,
1090    /// Canonical name (the "official" name)
1091    pub canonical_name: String,
1092    /// Entity type/category.
1093    ///
1094    /// Stored as a `TypeLabel` to support both core and custom (domain) labels.
1095    pub entity_type: Option<super::types::TypeLabel>,
1096    /// Knowledge base reference (e.g., "Q7186" for Wikidata)
1097    pub kb_id: Option<String>,
1098    /// Knowledge base name (e.g., "wikidata", "umls")
1099    pub kb_name: Option<String>,
1100    /// Description from knowledge base
1101    pub description: Option<String>,
1102    /// Entity embedding in the KB/entity space
1103    /// This is aligned with the text encoder space for similarity computation
1104    pub embedding: Option<Vec<f32>>,
1105    /// Alias names (other known surface forms)
1106    pub aliases: Vec<String>,
1107    /// Confidence that this identity is correctly resolved
1108    pub confidence: Confidence,
1109    /// Source of identity formation (how it was created)
1110    #[serde(default, skip_serializing_if = "Option::is_none")]
1111    pub source: Option<IdentitySource>,
1112}
1113
1114impl Identity {
1115    /// Create a new identity.
1116    #[must_use]
1117    pub fn new(id: impl Into<IdentityId>, canonical_name: impl Into<String>) -> Self {
1118        Self {
1119            id: id.into(),
1120            canonical_name: canonical_name.into(),
1121            entity_type: None,
1122            kb_id: None,
1123            kb_name: None,
1124            description: None,
1125            embedding: None,
1126            aliases: Vec::new(),
1127            confidence: Confidence::ONE,
1128            source: None,
1129        }
1130    }
1131
1132    /// Create an identity from a knowledge base entry.
1133    #[must_use]
1134    pub fn from_kb(
1135        id: impl Into<IdentityId>,
1136        canonical_name: impl Into<String>,
1137        kb_name: impl Into<String>,
1138        kb_id: impl Into<String>,
1139    ) -> Self {
1140        let kb_name_str = kb_name.into();
1141        let kb_id_str = kb_id.into();
1142        Self {
1143            id: id.into(),
1144            canonical_name: canonical_name.into(),
1145            entity_type: None,
1146            kb_id: Some(kb_id_str.clone()),
1147            kb_name: Some(kb_name_str.clone()),
1148            description: None,
1149            embedding: None,
1150            aliases: Vec::new(),
1151            confidence: Confidence::ONE,
1152            source: Some(IdentitySource::KnowledgeBase {
1153                kb_name: kb_name_str,
1154                kb_id: kb_id_str,
1155            }),
1156        }
1157    }
1158
1159    /// Add an alias.
1160    pub fn add_alias(&mut self, alias: impl Into<String>) {
1161        self.aliases.push(alias.into());
1162    }
1163
1164    /// Get the identity's unique identifier.
1165    #[must_use]
1166    pub const fn id(&self) -> IdentityId {
1167        self.id
1168    }
1169
1170    /// Get the canonical name.
1171    #[must_use]
1172    pub fn canonical_name(&self) -> &str {
1173        &self.canonical_name
1174    }
1175
1176    /// Get the KB ID, if linked.
1177    #[must_use]
1178    pub fn kb_id(&self) -> Option<&str> {
1179        self.kb_id.as_deref()
1180    }
1181
1182    /// Get the KB name, if linked.
1183    #[must_use]
1184    pub fn kb_name(&self) -> Option<&str> {
1185        self.kb_name.as_deref()
1186    }
1187
1188    /// Get the aliases.
1189    #[must_use]
1190    pub fn aliases(&self) -> &[String] {
1191        &self.aliases
1192    }
1193
1194    /// Get the confidence score.
1195    #[must_use]
1196    pub const fn confidence(&self) -> Confidence {
1197        self.confidence
1198    }
1199
1200    /// Set the confidence score.
1201    pub fn set_confidence(&mut self, confidence: f32) {
1202        self.confidence = Confidence::new(confidence as f64);
1203    }
1204
1205    /// Get the identity source.
1206    #[must_use]
1207    pub fn source(&self) -> Option<&IdentitySource> {
1208        self.source.as_ref()
1209    }
1210
1211    /// Set the embedding.
1212    #[must_use]
1213    pub fn with_embedding(mut self, embedding: Vec<f32>) -> Self {
1214        self.embedding = Some(embedding);
1215        self
1216    }
1217
1218    /// Set the entity type from a string.
1219    ///
1220    /// For new code, prefer [`Self::with_type_label`] which provides type safety.
1221    #[must_use]
1222    pub fn with_type(mut self, entity_type: impl Into<String>) -> Self {
1223        let s = entity_type.into();
1224        self.entity_type = Some(super::types::TypeLabel::from(s.as_str()));
1225        self
1226    }
1227
1228    /// Set the entity type using a type-safe label.
1229    ///
1230    /// This is the preferred method for new code as it provides type safety
1231    /// and integrates with the core `EntityType` taxonomy.
1232    #[must_use]
1233    pub fn with_type_label(mut self, label: super::types::TypeLabel) -> Self {
1234        self.entity_type = Some(label);
1235        self
1236    }
1237
1238    /// Get the entity type as a type-safe label.
1239    ///
1240    /// This converts the internal string representation to a `TypeLabel`,
1241    /// attempting to parse it as a core `EntityType` first.
1242    #[must_use]
1243    pub fn type_label(&self) -> Option<super::types::TypeLabel> {
1244        self.entity_type.clone()
1245    }
1246
1247    /// Set description.
1248    #[must_use]
1249    pub fn with_description(mut self, description: impl Into<String>) -> Self {
1250        self.description = Some(description.into());
1251        self
1252    }
1253
1254    // Note: from_cross_doc_cluster moved to anno crate (see anno/src/eval/cdcr.rs)
1255}
1256
1257// =============================================================================
1258// GroundedDocument: The Container
1259// =============================================================================
1260
1261/// Wire format for [`GroundedDocument`] — contains only the persisted fields.
1262/// Internal indexes are rebuilt automatically via [`GroundedDocument::rebuild_indexes`]
1263/// during deserialization.
1264#[derive(Deserialize)]
1265struct GroundedDocumentWire {
1266    id: String,
1267    text: String,
1268    signals: Vec<Signal<Location>>,
1269    tracks: HashMap<TrackId, Track>,
1270    identities: HashMap<IdentityId, Identity>,
1271}
1272
1273impl From<GroundedDocumentWire> for GroundedDocument {
1274    fn from(wire: GroundedDocumentWire) -> Self {
1275        let mut doc = Self {
1276            id: wire.id,
1277            text: wire.text,
1278            signals: wire.signals,
1279            tracks: wire.tracks,
1280            identities: wire.identities,
1281            signal_to_track: HashMap::new(),
1282            track_to_identity: HashMap::new(),
1283            next_signal_id: SignalId::ZERO,
1284            next_track_id: TrackId::ZERO,
1285            next_identity_id: IdentityId::ZERO,
1286        };
1287        doc.rebuild_indexes();
1288        doc
1289    }
1290}
1291
1292/// A document with grounded entity annotations using the three-level hierarchy.
1293///
1294/// # Entity-Centric Design
1295///
1296/// Traditional document representations store entities as a flat list.
1297/// This design uses an entity-centric representation where:
1298///
1299/// 1. **Signals** are the atomic detections (Level 1)
1300/// 2. **Tracks** cluster signals into within-document entities (Level 2)
1301/// 3. **Identities** link tracks to global KB entities (Level 3)
1302///
1303/// This enables efficient:
1304/// - Streaming signal processing (add signals incrementally)
1305/// - Incremental coreference (cluster signals as they arrive)
1306/// - Lazy entity linking (resolve identities only when needed)
1307///
1308/// # Usage
1309///
1310/// ```rust
1311/// use anno_core::{GroundedDocument, Signal, Track, Identity, Location};
1312///
1313/// let mut doc = GroundedDocument::new("doc1", "Marie Curie won the Nobel Prize. She was a physicist.");
1314///
1315/// // Add signals (Level 1)
1316/// doc.add_signal(Signal::new(0, Location::text(0, 11), "Marie Curie", "Person", 0.95));
1317/// doc.add_signal(Signal::new(1, Location::text(33, 36), "She", "Person", 0.88));
1318///
1319/// // Form track (Level 2)
1320/// let mut track = Track::new(0, "Marie Curie");
1321/// track.add_signal(0, 0);
1322/// track.add_signal(1, 1);
1323/// doc.add_track(track);
1324///
1325/// // Link identity (Level 3)
1326/// let identity = Identity::from_kb(0, "Marie Curie", "wikidata", "Q7186");
1327/// doc.add_identity(identity);
1328/// doc.link_track_to_identity(0, 0);
1329/// ```
1330///
1331/// # Invariants
1332///
1333/// `GroundedDocument` maintains internal indices (`signal_to_track`, `track_to_identity`)
1334/// that must be consistent with the public collections. The following invariants hold:
1335///
1336/// 1. **Signal ID uniqueness**: All signals in `signals` have distinct `id` values.
1337/// 2. **Track signal references**: Every `SignalRef` in a `Track.signals` points to
1338///    a valid signal ID in `signals`.
1339/// 3. **Signal-to-track consistency**: If `signal_to_track[s] == t`, then the track `t`
1340///    contains a `SignalRef` pointing to `s`.
1341/// 4. **Track-to-identity consistency**: If `track_to_identity[t] == i`, then
1342///    `tracks[t].identity_id == Some(i)` and `identities` contains `i`.
1343/// 5. **Signal offsets validity**: Signal text locations should match `self.text`.
1344///
1345/// **Prefer mutation via provided methods** (`add_signal`, `add_track`, `add_signal_to_track`,
1346/// `link_track_to_identity`) rather than direct field manipulation to preserve invariants.
1347///
1348/// Use [`validate_invariants()`](Self::validate_invariants) to check structural consistency
1349/// after external modifications.
1350///
1351/// ## Serialization
1352///
1353/// Internal indexes (`signal_to_track`, `track_to_identity`, counter fields) are **not**
1354/// serialized. They are rebuilt automatically on deserialization via [`rebuild_indexes`](Self::rebuild_indexes).
1355#[derive(Debug, Clone, Serialize, Deserialize)]
1356#[serde(from = "GroundedDocumentWire")]
1357pub struct GroundedDocument {
1358    /// Document identifier
1359    id: String,
1360    /// Raw text content
1361    text: String,
1362    /// Level 1: Raw signals (detections)
1363    signals: Vec<Signal<Location>>,
1364    /// Level 2: Tracks (within-document coreference chains)
1365    tracks: HashMap<TrackId, Track>,
1366    /// Level 3: Global identities (KB-linked entities)
1367    identities: HashMap<IdentityId, Identity>,
1368    /// Index: signal_id → track_id (for efficient lookup).
1369    /// Not serialized; rebuilt on deserialization.
1370    #[serde(skip)]
1371    signal_to_track: HashMap<SignalId, TrackId>,
1372    /// Index: track_id → identity_id (for efficient lookup).
1373    /// Not serialized; rebuilt on deserialization.
1374    #[serde(skip)]
1375    track_to_identity: HashMap<TrackId, IdentityId>,
1376    /// Next signal ID (for auto-incrementing).
1377    /// Not serialized; rebuilt on deserialization.
1378    #[serde(skip)]
1379    next_signal_id: SignalId,
1380    /// Next track ID.
1381    /// Not serialized; rebuilt on deserialization.
1382    #[serde(skip)]
1383    next_track_id: TrackId,
1384    /// Next identity ID.
1385    /// Not serialized; rebuilt on deserialization.
1386    #[serde(skip)]
1387    next_identity_id: IdentityId,
1388}
1389
1390impl GroundedDocument {
1391    /// Create a new grounded document.
1392    #[must_use]
1393    pub fn new(id: impl Into<String>, text: impl Into<String>) -> Self {
1394        Self {
1395            id: id.into(),
1396            text: text.into(),
1397            signals: Vec::new(),
1398            tracks: HashMap::new(),
1399            identities: HashMap::new(),
1400            signal_to_track: HashMap::new(),
1401            track_to_identity: HashMap::new(),
1402            next_signal_id: SignalId::ZERO,
1403            next_track_id: TrackId::ZERO,
1404            next_identity_id: IdentityId::ZERO,
1405        }
1406    }
1407
1408    /// Get the document identifier.
1409    #[must_use]
1410    pub fn id(&self) -> &str {
1411        &self.id
1412    }
1413
1414    /// Get the raw text content.
1415    #[must_use]
1416    pub fn text(&self) -> &str {
1417        &self.text
1418    }
1419
1420    /// Get a mutable reference to the signals vector.
1421    pub fn signals_mut(&mut self) -> &mut Vec<Signal<Location>> {
1422        &mut self.signals
1423    }
1424
1425    /// Get the tracks map.
1426    #[must_use]
1427    pub fn tracks_map(&self) -> &HashMap<TrackId, Track> {
1428        &self.tracks
1429    }
1430
1431    /// Get a mutable reference to the tracks map.
1432    ///
1433    /// After mutating tracks, call [`rebuild_indexes`](Self::rebuild_indexes) to
1434    /// keep internal indexes consistent.
1435    pub fn tracks_map_mut(&mut self) -> &mut HashMap<TrackId, Track> {
1436        &mut self.tracks
1437    }
1438
1439    /// Get the identities map.
1440    #[must_use]
1441    pub fn identities_map(&self) -> &HashMap<IdentityId, Identity> {
1442        &self.identities
1443    }
1444
1445    /// Get a mutable reference to the identities map.
1446    ///
1447    /// After mutating identities, call [`rebuild_indexes`](Self::rebuild_indexes) to
1448    /// keep internal indexes consistent.
1449    pub fn identities_map_mut(&mut self) -> &mut HashMap<IdentityId, Identity> {
1450        &mut self.identities
1451    }
1452
1453    /// Rebuild all internal indexes from the data fields.
1454    ///
1455    /// Call this after deserializing a `GroundedDocument` or after mutating
1456    /// via `signals_mut()`, `tracks_map_mut()`, or `identities_map_mut()`. The method recomputes:
1457    /// - `signal_to_track` from each track's signal list
1458    /// - `track_to_identity` from each track's `identity_id`
1459    /// - `next_signal_id`, `next_track_id`, `next_identity_id` counters
1460    pub fn rebuild_indexes(&mut self) {
1461        self.signal_to_track.clear();
1462        self.track_to_identity.clear();
1463
1464        for (&track_id, track) in &self.tracks {
1465            for sig_ref in &track.signals {
1466                self.signal_to_track.insert(sig_ref.signal_id, track_id);
1467            }
1468            if let Some(identity_id) = track.identity_id {
1469                self.track_to_identity.insert(track_id, identity_id);
1470            }
1471        }
1472
1473        self.next_signal_id = self
1474            .signals
1475            .iter()
1476            .map(|s| s.id)
1477            .max()
1478            .map_or(SignalId::ZERO, |id| id + 1);
1479        self.next_track_id = self
1480            .tracks
1481            .keys()
1482            .copied()
1483            .max()
1484            .map_or(TrackId::ZERO, |id| id + 1);
1485        self.next_identity_id = self
1486            .identities
1487            .keys()
1488            .copied()
1489            .max()
1490            .map_or(IdentityId::ZERO, |id| id + 1);
1491    }
1492
1493    // -------------------------------------------------------------------------
1494    // Signal operations (Level 1)
1495    // -------------------------------------------------------------------------
1496
1497    /// Add a signal and return its ID.
1498    pub fn add_signal(&mut self, mut signal: Signal<Location>) -> SignalId {
1499        let id = self.next_signal_id;
1500        signal.id = id;
1501        self.signals.push(signal);
1502        self.next_signal_id += 1;
1503        id
1504    }
1505
1506    /// Get a signal by ID.
1507    #[must_use]
1508    pub fn get_signal(&self, id: impl Into<SignalId>) -> Option<&Signal<Location>> {
1509        let id = id.into();
1510        self.signals.iter().find(|s| s.id == id)
1511    }
1512
1513    /// Get all signals.
1514    pub fn signals(&self) -> &[Signal<Location>] {
1515        &self.signals
1516    }
1517
1518    // -------------------------------------------------------------------------
1519    // Track operations (Level 2)
1520    // -------------------------------------------------------------------------
1521
1522    /// Add a track and return its ID.
1523    pub fn add_track(&mut self, mut track: Track) -> TrackId {
1524        let id = self.next_track_id;
1525        track.id = id;
1526
1527        // Update signal → track index
1528        for signal_ref in &track.signals {
1529            self.signal_to_track.insert(signal_ref.signal_id, id);
1530        }
1531
1532        self.tracks.insert(id, track);
1533        self.next_track_id += 1;
1534        id
1535    }
1536
1537    /// Get a track by ID.
1538    #[must_use]
1539    pub fn get_track(&self, id: impl Into<TrackId>) -> Option<&Track> {
1540        self.tracks.get(&id.into())
1541    }
1542
1543    /// Get a mutable reference to a track by ID.
1544    #[must_use]
1545    pub fn get_track_mut(&mut self, id: impl Into<TrackId>) -> Option<&mut Track> {
1546        self.tracks.get_mut(&id.into())
1547    }
1548
1549    /// Add a signal to an existing track.
1550    ///
1551    /// This properly updates the signal_to_track index.
1552    /// Returns true if the signal was added, false if track doesn't exist.
1553    pub fn add_signal_to_track(
1554        &mut self,
1555        signal_id: impl Into<SignalId>,
1556        track_id: impl Into<TrackId>,
1557        position: u32,
1558    ) -> bool {
1559        let signal_id = signal_id.into();
1560        let track_id = track_id.into();
1561        if let Some(track) = self.tracks.get_mut(&track_id) {
1562            track.add_signal(signal_id, position);
1563            self.signal_to_track.insert(signal_id, track_id);
1564            true
1565        } else {
1566            false
1567        }
1568    }
1569
1570    /// Get the track containing a signal.
1571    #[must_use]
1572    pub fn track_for_signal(&self, signal_id: SignalId) -> Option<&Track> {
1573        let track_id = self.signal_to_track.get(&signal_id)?;
1574        self.tracks.get(track_id)
1575    }
1576
1577    /// Get all tracks.
1578    pub fn tracks(&self) -> impl Iterator<Item = &Track> {
1579        self.tracks.values()
1580    }
1581
1582    // -------------------------------------------------------------------------
1583    // Identity operations (Level 3)
1584    // -------------------------------------------------------------------------
1585
1586    /// Add an identity and return its ID.
1587    pub fn add_identity(&mut self, mut identity: Identity) -> IdentityId {
1588        let id = self.next_identity_id;
1589        identity.id = id;
1590        self.identities.insert(id, identity);
1591        self.next_identity_id += 1;
1592        id
1593    }
1594
1595    /// Link a track to an identity.
1596    pub fn link_track_to_identity(
1597        &mut self,
1598        track_id: impl Into<TrackId>,
1599        identity_id: impl Into<IdentityId>,
1600    ) {
1601        let track_id = track_id.into();
1602        let identity_id = identity_id.into();
1603        if let Some(track) = self.tracks.get_mut(&track_id) {
1604            track.identity_id = Some(identity_id);
1605            self.track_to_identity.insert(track_id, identity_id);
1606        }
1607    }
1608
1609    /// Get an identity by ID.
1610    #[must_use]
1611    pub fn get_identity(&self, id: IdentityId) -> Option<&Identity> {
1612        self.identities.get(&id)
1613    }
1614
1615    /// Get the identity for a track.
1616    #[must_use]
1617    pub fn identity_for_track(&self, track_id: TrackId) -> Option<&Identity> {
1618        let identity_id = self.track_to_identity.get(&track_id)?;
1619        self.identities.get(identity_id)
1620    }
1621
1622    /// Get the identity for a signal (transitively through track).
1623    #[must_use]
1624    pub fn identity_for_signal(&self, signal_id: SignalId) -> Option<&Identity> {
1625        let track_id = self.signal_to_track.get(&signal_id)?;
1626        self.identity_for_track(*track_id)
1627    }
1628
1629    /// Get all identities.
1630    pub fn identities(&self) -> impl Iterator<Item = &Identity> {
1631        self.identities.values()
1632    }
1633
1634    /// Get a TrackRef for a track in this document.
1635    ///
1636    /// Returns `None` if the track doesn't exist in this document.
1637    /// This validates that the track is still present (tracks can be removed).
1638    #[must_use]
1639    pub fn track_ref(&self, track_id: TrackId) -> Option<TrackRef> {
1640        // Validate that the track actually exists
1641        if self.tracks.contains_key(&track_id) {
1642            Some(TrackRef {
1643                doc_id: self.id.clone(),
1644                track_id,
1645            })
1646        } else {
1647            None
1648        }
1649    }
1650
1651    // -------------------------------------------------------------------------
1652    // Conversion utilities
1653    // -------------------------------------------------------------------------
1654
1655    /// Convert to legacy Entity format for backwards compatibility.
1656    #[must_use]
1657    pub fn to_entities(&self) -> Vec<Entity> {
1658        self.signals
1659            .iter()
1660            .map(|signal| {
1661                let (start, end) = signal.location.text_offsets().unwrap_or((0, 0));
1662                let track = self.track_for_signal(signal.id);
1663                let identity = track.and_then(|t| self.identity_for_track(t.id));
1664
1665                {
1666                    let mut entity = Entity::new(
1667                        signal.surface.clone(),
1668                        EntityType::from_label(signal.label.as_str()),
1669                        start,
1670                        end,
1671                        signal.confidence,
1672                    );
1673                    entity.normalized = signal.normalized.clone();
1674                    entity.provenance = signal.provenance.clone();
1675                    entity.kb_id = identity.and_then(|i| i.kb_id.clone());
1676                    entity.canonical_id = track.map(|t| super::types::CanonicalId::new(t.id.get()));
1677                    entity.hierarchical_confidence = signal.hierarchical;
1678                    if let Location::Discontinuous { segments } = &signal.location {
1679                        entity.set_discontinuous_span(DiscontinuousSpan::new(
1680                            segments.iter().map(|(s, e)| (*s)..(*e)).collect(),
1681                        ));
1682                    }
1683                    entity
1684                }
1685            })
1686            .collect()
1687    }
1688
1689    /// Create from legacy Entity slice.
1690    #[must_use]
1691    pub fn from_entities(
1692        id: impl Into<String>,
1693        text: impl Into<String>,
1694        entities: &[Entity],
1695    ) -> Self {
1696        let mut doc = Self::new(id, text);
1697
1698        // Group entities by canonical_id to form tracks.
1699        //
1700        // IMPORTANT: Entities without a `canonical_id` are *not* coreferent by default.
1701        // They must each form their own singleton track (otherwise all NER mentions would
1702        // collapse into one giant track).
1703        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1704        enum TrackKey {
1705            Canonical(super::types::CanonicalId),
1706            Singleton(usize),
1707        }
1708
1709        let mut tracks_map: HashMap<TrackKey, Vec<SignalId>> = HashMap::new();
1710        let mut signal_to_entity_idx: HashMap<SignalId, usize> = HashMap::new();
1711
1712        for (idx, entity) in entities.iter().enumerate() {
1713            let location = if let Some(disc) = &entity.discontinuous_span {
1714                Location::Discontinuous {
1715                    segments: disc.segments().iter().map(|r| (r.start, r.end)).collect(),
1716                }
1717            } else if let Some(visual) = &entity.visual_span {
1718                Location::from(visual)
1719            } else {
1720                Location::text(entity.start(), entity.end())
1721            };
1722
1723            let mut signal = Signal::new(
1724                SignalId::new(idx as u64),
1725                location,
1726                &entity.text,
1727                entity.entity_type.as_label(),
1728                f32::from(entity.confidence),
1729            );
1730            signal.normalized = entity.normalized.clone();
1731            signal.provenance = entity.provenance.clone();
1732            signal.hierarchical = entity.hierarchical_confidence;
1733
1734            let signal_id = doc.add_signal(signal);
1735            signal_to_entity_idx.insert(signal_id, idx);
1736
1737            let key = match entity.canonical_id {
1738                Some(cid) => TrackKey::Canonical(cid),
1739                None => TrackKey::Singleton(idx),
1740            };
1741            tracks_map.entry(key).or_default().push(signal_id);
1742        }
1743
1744        // Create tracks from grouped signals
1745        for (_key, signal_ids) in tracks_map {
1746            if let Some(first_signal) = signal_ids.first().and_then(|id| doc.get_signal(*id)) {
1747                let mut track = Track::new(doc.next_track_id, &first_signal.surface);
1748                track.entity_type =
1749                    Some(super::types::TypeLabel::from(first_signal.label.as_str()));
1750
1751                for (pos, &signal_id) in signal_ids.iter().enumerate() {
1752                    track.add_signal(signal_id, pos as u32);
1753                }
1754
1755                // If any member entity is linked to a KB entry, create an identity and link it.
1756                // (We intentionally do this even for singleton tracks without canonical_id.)
1757                let kb_id = signal_ids.iter().find_map(|sid| {
1758                    let ent_idx = signal_to_entity_idx.get(sid).copied()?;
1759                    entities.get(ent_idx)?.kb_id.clone()
1760                });
1761                if let Some(kb_id) = kb_id {
1762                    let identity = Identity::from_kb(
1763                        doc.next_identity_id,
1764                        &track.canonical_surface,
1765                        "unknown",
1766                        kb_id,
1767                    );
1768                    let identity_id = doc.add_identity(identity);
1769                    track = track.with_identity(identity_id);
1770                }
1771
1772                doc.add_track(track);
1773            }
1774        }
1775
1776        doc
1777    }
1778
1779    /// Get signals filtered by label.
1780    #[must_use]
1781    pub fn signals_with_label(&self, label: &str) -> Vec<&Signal<Location>> {
1782        let want = super::types::TypeLabel::from(label);
1783        self.signals.iter().filter(|s| s.label == want).collect()
1784    }
1785
1786    /// Get signals above a confidence threshold.
1787    #[must_use]
1788    pub fn confident_signals(&self, threshold: Confidence) -> Vec<&Signal<Location>> {
1789        self.signals
1790            .iter()
1791            .filter(|s| s.confidence >= threshold)
1792            .collect()
1793    }
1794
1795    /// Get tracks that are linked to an identity.
1796    pub fn linked_tracks(&self) -> impl Iterator<Item = &Track> {
1797        self.tracks.values().filter(|t| t.identity_id.is_some())
1798    }
1799
1800    /// Get tracks that are NOT linked to any identity (need resolution).
1801    pub fn unlinked_tracks(&self) -> impl Iterator<Item = &Track> {
1802        self.tracks.values().filter(|t| t.identity_id.is_none())
1803    }
1804
1805    /// Count of signals that are not yet assigned to any track.
1806    #[must_use]
1807    pub fn untracked_signal_count(&self) -> usize {
1808        self.signals
1809            .iter()
1810            .filter(|s| !self.signal_to_track.contains_key(&s.id))
1811            .count()
1812    }
1813
1814    /// Get untracked signals (need coreference resolution).
1815    #[must_use]
1816    pub fn untracked_signals(&self) -> Vec<&Signal<Location>> {
1817        self.signals
1818            .iter()
1819            .filter(|s| !self.signal_to_track.contains_key(&s.id))
1820            .collect()
1821    }
1822
1823    // -------------------------------------------------------------------------
1824    // Advanced Query Methods
1825    // -------------------------------------------------------------------------
1826
1827    /// Get signals filtered by modality.
1828    #[must_use]
1829    pub fn signals_by_modality(&self, modality: Modality) -> Vec<&Signal<Location>> {
1830        self.signals
1831            .iter()
1832            .filter(|s| s.modality == modality)
1833            .collect()
1834    }
1835
1836    /// Get all text-based signals (symbolic modality).
1837    #[must_use]
1838    pub fn text_signals(&self) -> Vec<&Signal<Location>> {
1839        self.signals_by_modality(Modality::Symbolic)
1840    }
1841
1842    /// Get all visual signals (iconic modality).
1843    #[must_use]
1844    pub fn visual_signals(&self) -> Vec<&Signal<Location>> {
1845        self.signals_by_modality(Modality::Iconic)
1846    }
1847
1848    /// Find signals that overlap with a given location.
1849    #[must_use]
1850    pub fn overlapping_signals(&self, location: &Location) -> Vec<&Signal<Location>> {
1851        self.signals
1852            .iter()
1853            .filter(|s| s.location.overlaps(location))
1854            .collect()
1855    }
1856
1857    /// Find signals within a text range.
1858    #[must_use]
1859    pub fn signals_in_range(&self, start: usize, end: usize) -> Vec<&Signal<Location>> {
1860        self.signals
1861            .iter()
1862            .filter(|s| {
1863                if let Some((s_start, s_end)) = s.location.text_offsets() {
1864                    s_start >= start && s_end <= end
1865                } else {
1866                    false
1867                }
1868            })
1869            .collect()
1870    }
1871
1872    /// Get signals that are negated.
1873    #[must_use]
1874    pub fn negated_signals(&self) -> Vec<&Signal<Location>> {
1875        self.signals.iter().filter(|s| s.negated).collect()
1876    }
1877
1878    /// Get signals with a specific quantifier.
1879    #[must_use]
1880    pub fn quantified_signals(&self, quantifier: Quantifier) -> Vec<&Signal<Location>> {
1881        self.signals
1882            .iter()
1883            .filter(|s| s.quantifier == Some(quantifier))
1884            .collect()
1885    }
1886
1887    // -------------------------------------------------------------------------
1888    // Validation
1889    // -------------------------------------------------------------------------
1890
1891    /// Validate all signals against the document text.
1892    ///
1893    /// Returns a list of validation errors. Empty means all valid.
1894    ///
1895    /// # Example
1896    ///
1897    /// ```rust
1898    /// use anno_core::{GroundedDocument, Signal, Location};
1899    ///
1900    /// let mut doc = GroundedDocument::new("test", "Marie Curie was a physicist.");
1901    /// doc.add_signal(Signal::new(0, Location::text(0, 11), "Marie Curie", "PER", 0.9));
1902    /// assert!(doc.validate().is_empty());
1903    ///
1904    /// // Bad signal: wrong text at offset
1905    /// doc.add_signal(Signal::new(0, Location::text(0, 5), "WRONG", "PER", 0.9));
1906    /// assert!(!doc.validate().is_empty());
1907    /// ```
1908    #[must_use]
1909    pub fn validate(&self) -> Vec<SignalValidationError> {
1910        self.signals
1911            .iter()
1912            .filter_map(|s| s.validate_against(&self.text))
1913            .collect()
1914    }
1915
1916    /// Validate structural invariants of the document.
1917    ///
1918    /// Returns a list of invariant violations. An empty list means the document
1919    /// is structurally consistent.
1920    ///
1921    /// This checks:
1922    /// 1. Signal ID uniqueness
1923    /// 2. Track signal references point to existing signals
1924    /// 3. `signal_to_track` index consistency
1925    /// 4. `track_to_identity` index consistency
1926    /// 5. Track identity references point to existing identities
1927    ///
1928    /// Use this after any direct field manipulation to ensure consistency.
1929    ///
1930    /// # Example
1931    ///
1932    /// ```rust
1933    /// use anno_core::{GroundedDocument, Signal, Location};
1934    ///
1935    /// let mut doc = GroundedDocument::new("test", "Marie Curie was a physicist.");
1936    /// doc.add_signal(Signal::new(0, Location::text(0, 11), "Marie Curie", "PER", 0.9));
1937    /// assert!(doc.validate_invariants().is_empty());
1938    /// ```
1939    #[must_use]
1940    pub fn validate_invariants(&self) -> Vec<String> {
1941        let mut errors = Vec::new();
1942
1943        // 1. Signal ID uniqueness
1944        let mut seen_ids = std::collections::HashSet::new();
1945        for signal in &self.signals {
1946            if !seen_ids.insert(signal.id) {
1947                errors.push(format!("Duplicate signal ID: {}", signal.id));
1948            }
1949        }
1950
1951        // Build signal ID set for reference checks
1952        let signal_ids: std::collections::HashSet<_> = self.signals.iter().map(|s| s.id).collect();
1953
1954        // 2. Track signal references point to existing signals
1955        for (track_id, track) in &self.tracks {
1956            for signal_ref in &track.signals {
1957                if !signal_ids.contains(&signal_ref.signal_id) {
1958                    errors.push(format!(
1959                        "Track {} references non-existent signal {}",
1960                        track_id, signal_ref.signal_id
1961                    ));
1962                }
1963            }
1964        }
1965
1966        // 3. signal_to_track consistency
1967        for (signal_id, track_id) in &self.signal_to_track {
1968            // Check track exists
1969            if let Some(track) = self.tracks.get(track_id) {
1970                // Check track contains the signal reference
1971                if !track.signals.iter().any(|r| r.signal_id == *signal_id) {
1972                    errors.push(format!(
1973                        "signal_to_track[{}] = {} but track doesn't contain signal",
1974                        signal_id, track_id
1975                    ));
1976                }
1977            } else {
1978                errors.push(format!(
1979                    "signal_to_track[{}] = {} but track doesn't exist",
1980                    signal_id, track_id
1981                ));
1982            }
1983        }
1984
1985        // 4. track_to_identity consistency
1986        for (track_id, identity_id) in &self.track_to_identity {
1987            // Check track exists and has matching identity_id
1988            if let Some(track) = self.tracks.get(track_id) {
1989                if track.identity_id != Some(*identity_id) {
1990                    errors.push(format!(
1991                        "track_to_identity[{}] = {} but track.identity_id = {:?}",
1992                        track_id, identity_id, track.identity_id
1993                    ));
1994                }
1995            } else {
1996                errors.push(format!(
1997                    "track_to_identity[{}] = {} but track doesn't exist",
1998                    track_id, identity_id
1999                ));
2000            }
2001
2002            // Check identity exists
2003            if !self.identities.contains_key(identity_id) {
2004                errors.push(format!(
2005                    "track_to_identity[{}] = {} but identity doesn't exist",
2006                    track_id, identity_id
2007                ));
2008            }
2009        }
2010
2011        // 5. Track identity references point to existing identities
2012        for (track_id, track) in &self.tracks {
2013            if let Some(identity_id) = track.identity_id {
2014                if !self.identities.contains_key(&identity_id) {
2015                    errors.push(format!(
2016                        "Track {} references non-existent identity {}",
2017                        track_id, identity_id
2018                    ));
2019                }
2020            }
2021        }
2022
2023        errors
2024    }
2025
2026    /// Check if all structural invariants hold.
2027    #[must_use]
2028    pub fn invariants_hold(&self) -> bool {
2029        self.validate_invariants().is_empty()
2030    }
2031
2032    /// Check if all signals are valid against document text.
2033    #[must_use]
2034    pub fn is_valid(&self) -> bool {
2035        self.signals.iter().all(|s| s.is_valid(&self.text))
2036    }
2037
2038    /// Add a signal, validating it first.
2039    ///
2040    /// Returns `Err` if the signal's offsets don't match the document text.
2041    pub fn add_signal_validated(
2042        &mut self,
2043        signal: Signal<Location>,
2044    ) -> Result<SignalId, SignalValidationError> {
2045        if let Some(err) = signal.validate_against(&self.text) {
2046            return Err(err);
2047        }
2048        Ok(self.add_signal(signal))
2049    }
2050
2051    /// Add a signal by finding text in document (safe construction).
2052    ///
2053    /// Returns the signal ID, or `None` if text not found.
2054    ///
2055    /// # Example
2056    ///
2057    /// ```rust
2058    /// use anno_core::GroundedDocument;
2059    ///
2060    /// let mut doc = GroundedDocument::new("test", "Marie Curie was a physicist.");
2061    /// let id = doc.add_signal_from_text("Marie Curie", "PER", 0.95);
2062    /// assert!(id.is_some());
2063    /// ```
2064    pub fn add_signal_from_text(
2065        &mut self,
2066        surface: &str,
2067        label: impl Into<super::types::TypeLabel>,
2068        confidence: f32,
2069    ) -> Option<SignalId> {
2070        let signal = Signal::from_text(&self.text, surface, label, confidence)?;
2071        Some(self.add_signal(signal))
2072    }
2073
2074    /// Add a signal by finding the nth occurrence of text.
2075    pub fn add_signal_from_text_nth(
2076        &mut self,
2077        surface: &str,
2078        label: impl Into<super::types::TypeLabel>,
2079        confidence: f32,
2080        occurrence: usize,
2081    ) -> Option<SignalId> {
2082        let signal = Signal::from_text_nth(&self.text, surface, label, confidence, occurrence)?;
2083        Some(self.add_signal(signal))
2084    }
2085
2086    // -------------------------------------------------------------------------
2087    // Statistics
2088    // -------------------------------------------------------------------------
2089
2090    /// Get statistics about the document.
2091    #[must_use]
2092    pub fn stats(&self) -> DocumentStats {
2093        let signal_count = self.signals.len();
2094        let track_count = self.tracks.len();
2095        let identity_count = self.identities.len();
2096
2097        let linked_track_count = self
2098            .tracks
2099            .values()
2100            .filter(|t| t.identity_id.is_some())
2101            .count();
2102        let untracked_count = self.untracked_signal_count();
2103
2104        let avg_track_size = if track_count > 0 {
2105            self.tracks.values().map(|t| t.len()).sum::<usize>() as f32 / track_count as f32
2106        } else {
2107            0.0
2108        };
2109
2110        let singleton_count = self.tracks.values().filter(|t| t.is_singleton()).count();
2111
2112        let avg_confidence = Confidence::new(if signal_count > 0 {
2113            self.signals
2114                .iter()
2115                .map(|s| s.confidence.value())
2116                .sum::<f64>()
2117                / signal_count as f64
2118        } else {
2119            0.0
2120        });
2121
2122        let negated_count = self.signals.iter().filter(|s| s.negated).count();
2123
2124        // Count by modality
2125        let symbolic_count = self
2126            .signals
2127            .iter()
2128            .filter(|s| s.modality == Modality::Symbolic)
2129            .count();
2130        let iconic_count = self
2131            .signals
2132            .iter()
2133            .filter(|s| s.modality == Modality::Iconic)
2134            .count();
2135        let hybrid_count = self
2136            .signals
2137            .iter()
2138            .filter(|s| s.modality == Modality::Hybrid)
2139            .count();
2140
2141        DocumentStats {
2142            signal_count,
2143            track_count,
2144            identity_count,
2145            linked_track_count,
2146            untracked_count,
2147            avg_track_size,
2148            singleton_count,
2149            avg_confidence,
2150            negated_count,
2151            symbolic_count,
2152            iconic_count,
2153            hybrid_count,
2154        }
2155    }
2156
2157    // -------------------------------------------------------------------------
2158    // Batch Operations
2159    // -------------------------------------------------------------------------
2160
2161    /// Add multiple signals at once.
2162    ///
2163    /// Returns the IDs of all added signals.
2164    pub fn add_signals(
2165        &mut self,
2166        signals: impl IntoIterator<Item = Signal<Location>>,
2167    ) -> Vec<SignalId> {
2168        signals.into_iter().map(|s| self.add_signal(s)).collect()
2169    }
2170
2171    /// Create a track from a list of signal IDs.
2172    ///
2173    /// Automatically sets positions based on order.
2174    pub fn create_track_from_signals(
2175        &mut self,
2176        canonical: impl Into<String>,
2177        signal_ids: &[SignalId],
2178    ) -> Option<TrackId> {
2179        if signal_ids.is_empty() {
2180            return None;
2181        }
2182
2183        let mut track = Track::new(TrackId::ZERO, canonical);
2184        for (pos, &id) in signal_ids.iter().enumerate() {
2185            track.add_signal(id, pos as u32);
2186        }
2187        Some(self.add_track(track))
2188    }
2189
2190    /// Merge multiple tracks into one.
2191    ///
2192    /// The resulting track has all signals from the input tracks.
2193    /// The canonical surface comes from the first track.
2194    pub fn merge_tracks(&mut self, track_ids: &[TrackId]) -> Option<TrackId> {
2195        if track_ids.is_empty() {
2196            return None;
2197        }
2198
2199        // Collect all signals from tracks to merge
2200        let mut all_signals: Vec<SignalRef> = Vec::new();
2201        let mut canonical = String::new();
2202        let mut entity_type = None;
2203
2204        for &track_id in track_ids {
2205            if let Some(track) = self.tracks.get(&track_id) {
2206                if canonical.is_empty() {
2207                    canonical = track.canonical_surface.clone();
2208                    entity_type = track.entity_type.clone();
2209                }
2210                all_signals.extend(track.signals.iter().cloned());
2211            }
2212        }
2213
2214        if all_signals.is_empty() {
2215            return None;
2216        }
2217
2218        // Sort by position
2219        all_signals.sort_by_key(|s| s.position);
2220
2221        // Remove old tracks
2222        for &track_id in track_ids {
2223            self.tracks.remove(&track_id);
2224        }
2225
2226        // Create new merged track
2227        let mut new_track = Track::new(TrackId::ZERO, canonical);
2228        new_track.entity_type = entity_type;
2229        for (pos, signal_ref) in all_signals.iter().enumerate() {
2230            new_track.add_signal(signal_ref.signal_id, pos as u32);
2231        }
2232
2233        Some(self.add_track(new_track))
2234    }
2235
2236    /// Find all pairs of overlapping signals (potential duplicates or nested entities).
2237    #[must_use]
2238    pub fn find_overlapping_signal_pairs(&self) -> Vec<(SignalId, SignalId)> {
2239        let mut pairs = Vec::new();
2240        let signals: Vec<_> = self.signals.iter().collect();
2241
2242        for i in 0..signals.len() {
2243            for j in (i + 1)..signals.len() {
2244                if signals[i].location.overlaps(&signals[j].location) {
2245                    pairs.push((signals[i].id, signals[j].id));
2246                }
2247            }
2248        }
2249
2250        pairs
2251    }
2252}
2253
2254/// Statistics about a grounded document.
2255#[derive(Debug, Clone, Copy, Default)]
2256pub struct DocumentStats {
2257    /// Total number of signals
2258    pub signal_count: usize,
2259    /// Total number of tracks
2260    pub track_count: usize,
2261    /// Total number of identities
2262    pub identity_count: usize,
2263    /// Number of tracks linked to identities
2264    pub linked_track_count: usize,
2265    /// Number of signals not in any track
2266    pub untracked_count: usize,
2267    /// Average signals per track
2268    pub avg_track_size: f32,
2269    /// Number of singleton tracks (single mention)
2270    pub singleton_count: usize,
2271    /// Average signal confidence
2272    pub avg_confidence: Confidence,
2273    /// Number of negated signals
2274    pub negated_count: usize,
2275    /// Number of symbolic (text) signals
2276    pub symbolic_count: usize,
2277    /// Number of iconic (visual) signals
2278    pub iconic_count: usize,
2279    /// Number of hybrid signals
2280    pub hybrid_count: usize,
2281}
2282
2283impl std::fmt::Display for DocumentStats {
2284    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2285        writeln!(f, "Document Statistics:")?;
2286        writeln!(
2287            f,
2288            "  Signals: {} (avg confidence: {:.2})",
2289            self.signal_count,
2290            self.avg_confidence.value()
2291        )?;
2292        writeln!(
2293            f,
2294            "  Tracks: {} (avg size: {:.1}, singletons: {})",
2295            self.track_count, self.avg_track_size, self.singleton_count
2296        )?;
2297        writeln!(
2298            f,
2299            "  Identities: {} ({} tracks linked)",
2300            self.identity_count, self.linked_track_count
2301        )?;
2302        writeln!(f, "  Untracked signals: {}", self.untracked_count)?;
2303        writeln!(
2304            f,
2305            "  Modalities: {} symbolic, {} iconic, {} hybrid",
2306            self.symbolic_count, self.iconic_count, self.hybrid_count
2307        )?;
2308        if self.negated_count > 0 {
2309            writeln!(f, "  Negated: {}", self.negated_count)?;
2310        }
2311        Ok(())
2312    }
2313}
2314
2315// =============================================================================
2316// Spatial Index for Efficient Range Queries
2317// =============================================================================
2318
2319/// A simple interval tree node for text span indexing.
2320///
2321/// This provides O(log n + k) lookup for signals within a text range,
2322/// where k is the number of results. Much faster than O(n) linear scan
2323/// for documents with many signals.
2324#[derive(Debug, Clone)]
2325struct IntervalNode {
2326    /// Signal ID
2327    signal_id: SignalId,
2328    /// Start offset (inclusive)
2329    start: usize,
2330    /// End offset (exclusive)
2331    end: usize,
2332    /// Maximum end in this subtree (for efficient pruning)
2333    max_end: usize,
2334    /// Left child
2335    left: Option<Box<IntervalNode>>,
2336    /// Right child
2337    right: Option<Box<IntervalNode>>,
2338}
2339
2340impl IntervalNode {
2341    fn new(signal_id: SignalId, start: usize, end: usize) -> Self {
2342        Self {
2343            signal_id,
2344            start,
2345            end,
2346            max_end: end,
2347            left: None,
2348            right: None,
2349        }
2350    }
2351
2352    fn insert(&mut self, signal_id: SignalId, start: usize, end: usize) {
2353        self.max_end = self.max_end.max(end);
2354
2355        if start < self.start {
2356            if let Some(ref mut left) = self.left {
2357                left.insert(signal_id, start, end);
2358            } else {
2359                self.left = Some(Box::new(IntervalNode::new(signal_id, start, end)));
2360            }
2361        } else if let Some(ref mut right) = self.right {
2362            right.insert(signal_id, start, end);
2363        } else {
2364            self.right = Some(Box::new(IntervalNode::new(signal_id, start, end)));
2365        }
2366    }
2367
2368    fn query_overlap(&self, query_start: usize, query_end: usize, results: &mut Vec<SignalId>) {
2369        // Check if this interval overlaps with query
2370        if self.start < query_end && query_start < self.end {
2371            results.push(self.signal_id);
2372        }
2373
2374        // Check left subtree if it could contain overlapping intervals
2375        if let Some(ref left) = self.left {
2376            if left.max_end > query_start {
2377                left.query_overlap(query_start, query_end, results);
2378            }
2379        }
2380
2381        // Check right subtree if query could overlap
2382        if let Some(ref right) = self.right {
2383            if self.start < query_end {
2384                right.query_overlap(query_start, query_end, results);
2385            }
2386        }
2387    }
2388
2389    fn query_containing(&self, query_start: usize, query_end: usize, results: &mut Vec<SignalId>) {
2390        // Check if this interval fully contains the query
2391        if self.start <= query_start && self.end >= query_end {
2392            results.push(self.signal_id);
2393        }
2394
2395        // Check left subtree if it could contain the range
2396        if let Some(ref left) = self.left {
2397            if left.max_end >= query_end {
2398                left.query_containing(query_start, query_end, results);
2399            }
2400        }
2401
2402        // Check right subtree
2403        if let Some(ref right) = self.right {
2404            if self.start <= query_start {
2405                right.query_containing(query_start, query_end, results);
2406            }
2407        }
2408    }
2409
2410    fn query_contained_in(
2411        &self,
2412        range_start: usize,
2413        range_end: usize,
2414        results: &mut Vec<SignalId>,
2415    ) {
2416        // Check if this interval is fully contained in range
2417        if self.start >= range_start && self.end <= range_end {
2418            results.push(self.signal_id);
2419        }
2420
2421        // Check left subtree
2422        if let Some(ref left) = self.left {
2423            left.query_contained_in(range_start, range_end, results);
2424        }
2425
2426        // Check right subtree if it could have contained intervals
2427        if let Some(ref right) = self.right {
2428            if self.start < range_end {
2429                right.query_contained_in(range_start, range_end, results);
2430            }
2431        }
2432    }
2433}
2434
2435/// Spatial index for text signals using an interval tree.
2436///
2437/// Enables efficient queries:
2438/// - `query_overlap(start, end)`: Find signals that overlap with range
2439/// - `query_containing(start, end)`: Find signals that fully contain range
2440/// - `query_contained_in(start, end)`: Find signals fully within range
2441///
2442/// # Performance
2443///
2444/// - Build: O(n log n)
2445/// - Query: O(log n + k) where k is result count
2446/// - Space: O(n)
2447///
2448/// For documents with >100 signals, this provides significant speedup
2449/// over linear scan for range queries.
2450#[derive(Debug, Clone, Default)]
2451pub struct TextSpatialIndex {
2452    root: Option<IntervalNode>,
2453    size: usize,
2454}
2455
2456impl TextSpatialIndex {
2457    /// Create a new empty index.
2458    #[must_use]
2459    pub fn new() -> Self {
2460        Self::default()
2461    }
2462
2463    /// Build index from signals in a document.
2464    #[must_use]
2465    pub fn from_signals(signals: &[Signal<Location>]) -> Self {
2466        let mut index = Self::new();
2467        for signal in signals {
2468            if let Some((start, end)) = signal.location.text_offsets() {
2469                index.insert(signal.id, start, end);
2470            }
2471        }
2472        index
2473    }
2474
2475    /// Insert a text span into the index.
2476    pub fn insert(&mut self, signal_id: SignalId, start: usize, end: usize) {
2477        if let Some(ref mut root) = self.root {
2478            root.insert(signal_id, start, end);
2479        } else {
2480            self.root = Some(IntervalNode::new(signal_id, start, end));
2481        }
2482        self.size += 1;
2483    }
2484
2485    /// Find signals that overlap with the given range.
2486    #[must_use]
2487    pub fn query_overlap(&self, start: usize, end: usize) -> Vec<SignalId> {
2488        let mut results = Vec::new();
2489        if let Some(ref root) = self.root {
2490            root.query_overlap(start, end, &mut results);
2491        }
2492        results
2493    }
2494
2495    /// Find signals that fully contain the given range.
2496    #[must_use]
2497    pub fn query_containing(&self, start: usize, end: usize) -> Vec<SignalId> {
2498        let mut results = Vec::new();
2499        if let Some(ref root) = self.root {
2500            root.query_containing(start, end, &mut results);
2501        }
2502        results
2503    }
2504
2505    /// Find signals fully contained within the given range.
2506    #[must_use]
2507    pub fn query_contained_in(&self, start: usize, end: usize) -> Vec<SignalId> {
2508        let mut results = Vec::new();
2509        if let Some(ref root) = self.root {
2510            root.query_contained_in(start, end, &mut results);
2511        }
2512        results
2513    }
2514
2515    /// Number of entries in the index.
2516    #[must_use]
2517    pub fn len(&self) -> usize {
2518        self.size
2519    }
2520
2521    /// Check if the index is empty.
2522    #[must_use]
2523    pub fn is_empty(&self) -> bool {
2524        self.size == 0
2525    }
2526}
2527
2528impl GroundedDocument {
2529    /// Build a spatial index for efficient text range queries.
2530    ///
2531    /// This is useful for documents with many signals where you need
2532    /// to frequently query by text position.
2533    ///
2534    /// # Example
2535    ///
2536    /// ```rust
2537    /// use anno_core::{GroundedDocument, Signal, Location};
2538    ///
2539    /// let mut doc = GroundedDocument::new("doc", "Some text with entities.");
2540    /// doc.add_signal(Signal::new(0, Location::text(0, 4), "Some", "T", 0.9));
2541    /// doc.add_signal(Signal::new(0, Location::text(10, 14), "with", "T", 0.9));
2542    ///
2543    /// let index = doc.build_text_index();
2544    /// let in_range = index.query_contained_in(0, 20);
2545    /// assert_eq!(in_range.len(), 2);
2546    /// ```
2547    #[must_use]
2548    pub fn build_text_index(&self) -> TextSpatialIndex {
2549        TextSpatialIndex::from_signals(&self.signals)
2550    }
2551
2552    /// Query signals using the spatial index (builds index if needed).
2553    ///
2554    /// For repeated queries, build the index once with `build_text_index()`
2555    /// and reuse it.
2556    #[must_use]
2557    pub fn query_signals_in_range_indexed(
2558        &self,
2559        start: usize,
2560        end: usize,
2561    ) -> Vec<&Signal<Location>> {
2562        let index = self.build_text_index();
2563        let ids = index.query_contained_in(start, end);
2564        ids.iter().filter_map(|&id| self.get_signal(id)).collect()
2565    }
2566
2567    /// Query overlapping signals using spatial index.
2568    #[must_use]
2569    pub fn query_overlapping_signals_indexed(
2570        &self,
2571        start: usize,
2572        end: usize,
2573    ) -> Vec<&Signal<Location>> {
2574        let index = self.build_text_index();
2575        let ids = index.query_overlap(start, end);
2576        ids.iter().filter_map(|&id| self.get_signal(id)).collect()
2577    }
2578
2579    /// Convert this grounded document into a coreference document for evaluation.
2580    ///
2581    /// This is a lightweight bridge between the production pipeline types
2582    /// (Signal/Track/Identity) and the evaluation-oriented coreference types
2583    /// (`CorefDocument`, `CorefChain`, `Mention`).
2584    ///
2585    /// - Each [`Track`] becomes a [`super::coref::CorefChain`]
2586    /// - Each track mention is derived from the track's signal locations
2587    /// - Non-text signals (iconic-only locations) are skipped
2588    ///
2589    /// Note: Mention typing (proper/nominal/pronominal) is left unset; callers
2590    /// doing mention-type evaluation should compute that separately.
2591    #[must_use]
2592    pub fn to_coref_document(&self) -> super::coref::CorefDocument {
2593        use super::coref::{CorefChain, CorefDocument, Mention};
2594        use std::collections::HashMap;
2595
2596        // Build a fast index for signal lookup.
2597        let signal_by_id: HashMap<SignalId, &Signal<Location>> =
2598            self.signals.iter().map(|s| (s.id, s)).collect();
2599
2600        let mut chains: Vec<CorefChain> = Vec::new();
2601
2602        for track in self.tracks.values() {
2603            let mut mentions: Vec<Mention> = Vec::new();
2604
2605            for sref in &track.signals {
2606                let Some(signal) = signal_by_id.get(&sref.signal_id) else {
2607                    continue;
2608                };
2609
2610                let Some((start, end)) = signal.location.text_offsets() else {
2611                    continue;
2612                };
2613
2614                let mut m = Mention::new(signal.surface.clone(), start, end);
2615                m.entity_type = Some(signal.label.to_string());
2616                mentions.push(m);
2617            }
2618
2619            if mentions.is_empty() {
2620                continue;
2621            }
2622
2623            let mut chain = CorefChain::new(mentions);
2624            chain.entity_type = track.entity_type.as_ref().map(|t| t.to_string());
2625            chains.push(chain);
2626        }
2627
2628        // Deterministic ordering: sort by earliest mention.
2629        chains.sort_by_key(|c| c.mentions.first().map(|m| m.start).unwrap_or(usize::MAX));
2630
2631        CorefDocument::with_id(&self.text, &self.id, chains)
2632    }
2633}
2634
2635// =============================================================================
2636// HTML Visualization (Brutalist/Functional Style)
2637// =============================================================================
2638
2639/// Generate an HTML visualization of a grounded document.
2640///
2641/// Brutalist design: monospace, dense tables, no decoration, raw data.
2642pub fn render_document_html(doc: &GroundedDocument) -> String {
2643    let mut html = String::new();
2644    let stats = doc.stats();
2645
2646    html.push_str(r#"<!DOCTYPE html>
2647<html>
2648<head>
2649<meta charset="UTF-8">
2650<meta name="color-scheme" content="dark light">
2651<title>grounded::GroundedDocument</title>
2652<style>
2653:root{
2654  /* Allow UA widgets (inputs/scrollbars) to match the theme */
2655  color-scheme: light dark;
2656  /* Dark (default) */
2657  --bg:#0a0a0a;
2658  --panel-bg:#0d0d0d;
2659  --text:#b0b0b0;
2660  --text-strong:#fff;
2661  --muted:#666;
2662  --border:#222;
2663  --border-strong:#333;
2664  --hover:#111;
2665  --input-bg:#080808;
2666  --active:#fff;
2667  --track-strong:rgba(255,255,255,0.35);
2668  --track-soft:rgba(255,255,255,0.18);
2669  /* Entity colors (dark) */
2670  --per-bg:#1a1a2e; --per-br:#4a4a8a; --per-tx:#8888cc;
2671  --org-bg:#1a2e1a; --org-br:#4a8a4a; --org-tx:#88cc88;
2672  --loc-bg:#2e2e1a; --loc-br:#8a8a4a; --loc-tx:#cccc88;
2673  --mis-bg:#1a1a1a; --mis-br:#4a4a4a; --mis-tx:#999;
2674  --dat-bg:#2e1a1a; --dat-br:#8a4a4a; --dat-tx:#cc8888;
2675  --badge-y-bg:#1a2e1a; --badge-y-tx:#4a8a4a; --badge-y-br:#2a4a2a;
2676  --badge-n-bg:#2e2e1a; --badge-n-tx:#8a8a4a; --badge-n-br:#4a4a2a;
2677}
2678@media (prefers-color-scheme: light){
2679  :root{
2680    --bg:#ffffff;
2681    --panel-bg:#f7f7f7;
2682    --text:#222;
2683    --text-strong:#000;
2684    --muted:#555;
2685    --border:#d6d6d6;
2686    --border-strong:#c6c6c6;
2687    --hover:#f0f0f0;
2688    --input-bg:#ffffff;
2689    --active:#000;
2690    --track-strong:rgba(0,0,0,0.25);
2691    --track-soft:rgba(0,0,0,0.12);
2692    /* Entity colors (light) */
2693    --per-bg:#e9e9ff; --per-br:#6c6cff; --per-tx:#2b2b7a;
2694    --org-bg:#e9f7e9; --org-br:#2f8a2f; --org-tx:#1f5a1f;
2695    --loc-bg:#fff7db; --loc-br:#8a7a2f; --loc-tx:#5a4d12;
2696    --mis-bg:#f2f2f2; --mis-br:#8a8a8a; --mis-tx:#333;
2697    --dat-bg:#ffe9e9; --dat-br:#8a2f2f; --dat-tx:#5a1f1f;
2698    --badge-y-bg:#e9f7e9; --badge-y-tx:#1f5a1f; --badge-y-br:#9ad19a;
2699    --badge-n-bg:#fff7db; --badge-n-tx:#5a4d12; --badge-n-br:#e2d39a;
2700  }
2701}
2702html[data-theme='dark']{
2703  --bg:#0a0a0a; --panel-bg:#0d0d0d; --text:#b0b0b0; --text-strong:#fff;
2704  --muted:#666; --border:#222; --border-strong:#333; --hover:#111;
2705  --input-bg:#080808; --active:#fff;
2706  --track-strong:rgba(255,255,255,0.35); --track-soft:rgba(255,255,255,0.18);
2707  --per-bg:#1a1a2e; --per-br:#4a4a8a; --per-tx:#8888cc;
2708  --org-bg:#1a2e1a; --org-br:#4a8a4a; --org-tx:#88cc88;
2709  --loc-bg:#2e2e1a; --loc-br:#8a8a4a; --loc-tx:#cccc88;
2710  --mis-bg:#1a1a1a; --mis-br:#4a4a4a; --mis-tx:#999;
2711  --dat-bg:#2e1a1a; --dat-br:#8a4a4a; --dat-tx:#cc8888;
2712  --badge-y-bg:#1a2e1a; --badge-y-tx:#4a8a4a; --badge-y-br:#2a4a2a;
2713  --badge-n-bg:#2e2e1a; --badge-n-tx:#8a8a4a; --badge-n-br:#4a4a2a;
2714}
2715html[data-theme='light']{
2716  --bg:#ffffff; --panel-bg:#f7f7f7; --text:#222; --text-strong:#000;
2717  --muted:#555; --border:#d6d6d6; --border-strong:#c6c6c6; --hover:#f0f0f0;
2718  --input-bg:#ffffff; --active:#000;
2719  --track-strong:rgba(0,0,0,0.25); --track-soft:rgba(0,0,0,0.12);
2720  --per-bg:#e9e9ff; --per-br:#6c6cff; --per-tx:#2b2b7a;
2721  --org-bg:#e9f7e9; --org-br:#2f8a2f; --org-tx:#1f5a1f;
2722  --loc-bg:#fff7db; --loc-br:#8a7a2f; --loc-tx:#5a4d12;
2723  --mis-bg:#f2f2f2; --mis-br:#8a8a8a; --mis-tx:#333;
2724  --dat-bg:#ffe9e9; --dat-br:#8a2f2f; --dat-tx:#5a1f1f;
2725  --badge-y-bg:#e9f7e9; --badge-y-tx:#1f5a1f; --badge-y-br:#9ad19a;
2726  --badge-n-bg:#fff7db; --badge-n-tx:#5a4d12; --badge-n-br:#e2d39a;
2727}
2728
2729*{box-sizing:border-box;margin:0;padding:0}
2730body{font:12px/1.4 monospace;background:var(--bg);color:var(--text);padding:8px}
2731h1,h2,h3{color:var(--text-strong);font-weight:normal;border-bottom:1px solid var(--border-strong);padding:4px 0;margin:16px 0 8px}
2732h1{font-size:14px}h2{font-size:12px}h3{font-size:11px;color:var(--muted)}
2733 a{color:inherit}
2734 a:hover{text-decoration:underline}
2735table{width:100%;border-collapse:collapse;font-size:11px;margin:4px 0}
2736th,td{padding:4px 8px;text-align:left;border:1px solid var(--border)}
2737th{background:var(--hover);color:var(--muted);font-weight:normal;text-transform:uppercase;font-size:10px}
2738tr:hover{background:var(--hover)}
2739.grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(300px,1fr));gap:8px}
2740.panel{border:1px solid var(--border);background:var(--panel-bg);padding:8px}
2741.panel-h{display:flex;align-items:center;gap:8px}
2742.toggle{cursor:pointer;user-select:none;color:var(--muted);border:1px solid var(--border);background:var(--bg);padding:2px 6px;font-size:10px}
2743.panel-collapsed table,.panel-collapsed .panel-body{display:none}
2744.toolbar{display:flex;gap:8px;align-items:center;margin:8px 0 0}
2745.toolbar input{width:100%;max-width:520px;background:var(--input-bg);border:1px solid var(--border);color:var(--text);padding:6px 8px;font:12px monospace}
2746.muted{color:var(--muted)}
2747.panel-body{white-space:pre-wrap;word-break:break-word}
2748.text-box{background:var(--input-bg);border:1px solid var(--border);padding:8px;white-space:pre-wrap;word-break:break-word;line-height:1.6}
2749.e{padding:1px 2px;border-bottom:1px solid}
2750.seg{cursor:pointer}
2751.e-per{background:var(--per-bg);border-color:var(--per-br);color:var(--per-tx)}
2752.e-org{background:var(--org-bg);border-color:var(--org-br);color:var(--org-tx)}
2753.e-loc{background:var(--loc-bg);border-color:var(--loc-br);color:var(--loc-tx)}
2754.e-misc{background:var(--mis-bg);border-color:var(--mis-br);color:var(--mis-tx)}
2755.e-date{background:var(--dat-bg);border-color:var(--dat-br);color:var(--dat-tx)}
2756.e-track{box-shadow:inset 0 0 0 1px var(--track-strong)}
2757.e-track-hover{box-shadow:inset 0 0 0 1px var(--track-soft)}
2758.e-active{outline:2px solid var(--active);outline-offset:1px}
2759.conf{color:var(--muted);font-size:10px}
2760.badge{display:inline-block;padding:1px 4px;font-size:9px;text-transform:uppercase}
2761.badge-y{background:var(--badge-y-bg);color:var(--badge-y-tx);border:1px solid var(--badge-y-br)}
2762.badge-n{background:var(--badge-n-bg);color:var(--badge-n-tx);border:1px solid var(--badge-n-br)}
2763.stats{display:flex;gap:16px;padding:8px 0;border-bottom:1px solid var(--border);margin-bottom:8px}
2764.stat{text-align:center}.stat-v{font-size:18px;color:var(--text-strong)}.stat-l{font-size:9px;color:var(--muted);text-transform:uppercase}
2765.id{color:var(--muted);font-size:9px}
2766.kb{color:var(--muted)}
2767.arrow{color:var(--muted)}
2768</style>
2769</head>
2770<body>
2771"#);
2772
2773    // Header with stats
2774    html.push_str(&format!(
2775        r#"<div class="panel-h" style="justify-content:space-between"><h1>doc_id="{}" len={}</h1><span class="toggle" id="theme-toggle" title="toggle theme (auto → dark → light)">theme: auto</span></div>"#,
2776        html_escape(&doc.id),
2777        doc.text.len()
2778    ));
2779
2780    html.push_str(r#"<div class="stats">"#);
2781    html.push_str(&format!(
2782        r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">signals</div></div>"#,
2783        stats.signal_count
2784    ));
2785    html.push_str(&format!(
2786        r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">tracks</div></div>"#,
2787        stats.track_count
2788    ));
2789    html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">identities</div></div>"#, stats.identity_count));
2790    html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{:.2}</div><div class="stat-l">avg_conf</div></div>"#, stats.avg_confidence));
2791    html.push_str(&format!(
2792        r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">linked</div></div>"#,
2793        stats.linked_track_count
2794    ));
2795    html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}</div><div class="stat-l">untracked</div></div>"#, stats.untracked_count));
2796    if stats.iconic_count > 0 || stats.hybrid_count > 0 {
2797        html.push_str(&format!(r#"<div class="stat"><div class="stat-v">{}/{}/{}</div><div class="stat-l">sym/ico/hyb</div></div>"#,
2798            stats.symbolic_count, stats.iconic_count, stats.hybrid_count));
2799    }
2800    html.push_str(r#"</div>"#);
2801
2802    // Annotated text
2803    html.push_str(r#"<h2>text</h2>"#);
2804    html.push_str(r#"<div class="text-box">"#);
2805    html.push_str(&annotate_text_html(
2806        &doc.text,
2807        doc.signals(),
2808        &doc.signal_to_track,
2809    ));
2810    html.push_str(r#"</div>"#);
2811
2812    // Selection panel (filled by JS)
2813    html.push_str(
2814        r#"<h2>selection</h2><div class="panel" id="selection-panel" role="region" aria-label="selection"><div class="panel-h"><h3>selection</h3><span class="muted" id="selection-hint" role="status" aria-live="polite">click a mention / row to see coref track details</span></div><pre class="panel-body" id="selection-body" role="textbox" aria-readonly="true" aria-label="selection details">—</pre></div>"#,
2815    );
2816
2817    // Grid layout for three levels
2818    html.push_str(r#"<div class="grid">"#);
2819
2820    // Level 1: Signals table
2821    html.push_str(r#"<div class="panel" id="panel-signals"><div class="panel-h"><h3>signals (level 1)</h3><span class="toggle" data-toggle="panel-signals">toggle</span></div><div class="toolbar"><input id="signal-filter" type="text" placeholder="filter signals: id / label / surface (e.g. 'PER', 'S12', 'Paris')" /><span class="muted" id="signal-filter-count"></span></div><table id="signals-table">"#);
2822    html.push_str(r#"<tr><th>id</th><th>span</th><th>surface</th><th>label</th><th>conf</th><th>track</th></tr>"#);
2823    for signal in doc.signals() {
2824        let (span, start_opt, end_opt) = if let Some((s, e)) = signal.location.text_offsets() {
2825            (format!("[{},{})", s, e), Some(s), Some(e))
2826        } else {
2827            ("bbox".to_string(), None, None)
2828        };
2829        let track_id_num = doc.signal_to_track.get(&signal.id).copied();
2830        let track_id = track_id_num
2831            .map(|t| format!("T{}", t))
2832            .unwrap_or_else(|| "-".to_string());
2833        let track_attr = track_id_num
2834            .map(|t| format!(r#" data-track="{}""#, t))
2835            .unwrap_or_default();
2836        let offs_attr = match (start_opt, end_opt) {
2837            (Some(s), Some(e)) => format!(r#" data-start="{}" data-end="{}""#, s, e),
2838            _ => String::new(),
2839        };
2840        let neg = if signal.negated { " NEG" } else { "" };
2841        html.push_str(&format!(
2842            r#"<tr data-sid="S{sid}" data-label="{label}" data-surface="{surface}"{track_attr}{offs_attr} data-conf="{conf:.2}"><td class="id"><a href='#S{sid}'>S{sid}</a></td><td>{span}</td><td>{surface}</td><td>{label}{neg}</td><td class="conf">{conf:.2}</td><td class="id">{track}</td></tr>"#,
2843            sid = signal.id,
2844            span = span,
2845            surface = html_escape(&signal.surface),
2846            label = html_escape(signal.label.as_str()),
2847            neg = neg,
2848            conf = signal.confidence.value(),
2849            track = track_id,
2850            track_attr = track_attr,
2851            offs_attr = offs_attr
2852        ));
2853    }
2854    html.push_str(r#"</table></div>"#);
2855
2856    // Level 2: Tracks table
2857    html.push_str(r#"<div class="panel" id="panel-tracks"><div class="panel-h"><h3>tracks (level 2)</h3><span class="toggle" data-toggle="panel-tracks">toggle</span></div><table id="tracks-table">"#);
2858    html.push_str(r#"<tr><th>id</th><th>canonical</th><th>type</th><th>|S|</th><th>signals</th><th>identity</th></tr>"#);
2859    for track in doc.tracks() {
2860        let entity_type = track
2861            .entity_type
2862            .as_ref()
2863            .map(|t| t.as_str())
2864            .unwrap_or("-");
2865        let signals: Vec<String> = track
2866            .signals
2867            .iter()
2868            .map(|s| format!("S{}", s.signal_id))
2869            .collect();
2870        let identity = doc
2871            .identity_for_track(track.id)
2872            .map(|i| format!("I{}", i.id))
2873            .unwrap_or_else(|| "-".to_string());
2874        let linked_badge = if track.identity_id.is_some() {
2875            r#"<span class="badge badge-y">y</span>"#
2876        } else {
2877            r#"<span class="badge badge-n">n</span>"#
2878        };
2879        html.push_str(&format!(
2880            r#"<tr data-tid="{tid}"><td class="id">T{tid}</td><td>{canonical_surface}</td><td>{etype}</td><td>{n}</td><td class="id">{sigs}</td><td class="id">{ident} {badge}</td></tr>"#,
2881            tid = track.id,
2882            canonical_surface = html_escape(&track.canonical_surface),
2883            etype = html_escape(entity_type),
2884            n = track.len(),
2885            sigs = html_escape(&signals.join(" ")),
2886            ident = identity,
2887            badge = linked_badge
2888        ));
2889    }
2890    html.push_str(r#"</table></div>"#);
2891
2892    // Level 3: Identities table
2893    html.push_str(r#"<div class="panel" id="panel-identities"><div class="panel-h"><h3>identities (level 3)</h3><span class="toggle" data-toggle="panel-identities">toggle</span></div><table>"#);
2894    html.push_str(r#"<tr><th>id</th><th>name</th><th>type</th><th>kb</th><th>kb_id</th><th>aliases</th></tr>"#);
2895    for identity in doc.identities() {
2896        let kb = identity.kb_name.as_deref().unwrap_or("-");
2897        let kb_id = identity.kb_id.as_deref().unwrap_or("-");
2898        let entity_type = identity
2899            .entity_type
2900            .as_ref()
2901            .map(|t| t.as_str())
2902            .unwrap_or("-");
2903        let aliases = if identity.aliases.is_empty() {
2904            "-".to_string()
2905        } else {
2906            identity.aliases.join(", ")
2907        };
2908        html.push_str(&format!(
2909            r#"<tr><td class="id">I{}</td><td>{}</td><td>{}</td><td class="kb">{}</td><td class="kb">{}</td><td>{}</td></tr>"#,
2910            identity.id, html_escape(&identity.canonical_name), entity_type, kb, kb_id, html_escape(&aliases)
2911        ));
2912    }
2913    html.push_str(r#"</table></div>"#);
2914
2915    html.push_str(r#"</div>"#); // end grid
2916
2917    // Signal-Track-Identity mapping (compact view)
2918    html.push_str(r#"<h2>hierarchy trace</h2><div class="panel"><table>"#);
2919    html.push_str(r#"<tr><th>signal</th><th></th><th>track</th><th></th><th>identity</th><th>kb_id</th></tr>"#);
2920    for signal in doc.signals() {
2921        let track = doc.track_for_signal(signal.id);
2922        let identity = doc.identity_for_signal(signal.id);
2923
2924        let track_str = track
2925            .map(|t| format!("T{} \"{}\"", t.id, html_escape(&t.canonical_surface)))
2926            .unwrap_or_else(|| "-".to_string());
2927        let identity_str = identity
2928            .map(|i| format!("I{} \"{}\"", i.id, html_escape(&i.canonical_name)))
2929            .unwrap_or_else(|| "-".to_string());
2930        let kb_str = identity
2931            .and_then(|i| i.kb_id.as_ref())
2932            .map(|s| s.as_str())
2933            .unwrap_or("-");
2934
2935        html.push_str(&format!(
2936            r#"<tr><td>S{} "{}"</td><td class="arrow">→</td><td>{}</td><td class="arrow">→</td><td>{}</td><td class="kb">{}</td></tr>"#,
2937            signal.id, html_escape(&signal.surface), track_str, identity_str, kb_str
2938        ));
2939    }
2940    html.push_str(r#"</table></div>"#);
2941
2942    // Minimal JS: click a signal row → highlight that mention in the text box.
2943    // Also support filtering signals by substring match.
2944    html.push_str(r#"<script>
2945(() => {
2946  // Index signal metadata from the signals table, and map signal/track → text elements.
2947  const signalMeta = new Map();
2948  document.querySelectorAll('#signals-table tr[data-sid]').forEach((row) => {
2949    const sid = row.getAttribute('data-sid');
2950    if (!sid) return;
2951    signalMeta.set(sid, {
2952      sid,
2953      label: row.getAttribute('data-label') || '',
2954      surface: row.getAttribute('data-surface') || '',
2955      conf: row.getAttribute('data-conf') || '',
2956      start: row.getAttribute('data-start'),
2957      end: row.getAttribute('data-end'),
2958      track: row.getAttribute('data-track'),
2959    });
2960  });
2961
2962  const signalEls = new Map();
2963  const addSignalEl = (sid, el) => {
2964    if (!sid || !el) return;
2965    const arr = signalEls.get(sid) || [];
2966    arr.push(el);
2967    signalEls.set(sid, arr);
2968  };
2969  // Old-style inline spans (non-overlapping renderer).
2970  document.querySelectorAll('span.e[data-sid]').forEach((el) => {
2971    addSignalEl(el.getAttribute('data-sid'), el);
2972  });
2973  // Segmented spans (overlap/discontinuous-safe renderer).
2974  document.querySelectorAll('span.seg[data-sids]').forEach((el) => {
2975    const raw = (el.getAttribute('data-sids') || '').trim();
2976    if (!raw) return;
2977    raw.split(/\s+/).filter(Boolean).forEach((sid) => addSignalEl(sid, el));
2978  });
2979
2980  const trackEls = new Map();
2981  for (const [sid, els] of signalEls.entries()) {
2982    const meta = signalMeta.get(sid);
2983    const tid = meta ? meta.track : null;
2984    if (!tid) continue;
2985    const arr = trackEls.get(tid) || [];
2986    els.forEach((el) => arr.push(el));
2987    trackEls.set(tid, arr);
2988  }
2989
2990  const selectionBody = document.getElementById('selection-body');
2991  const selectionHint = document.getElementById('selection-hint');
2992  const defaultHint = selectionHint ? (selectionHint.textContent || '') : '';
2993  const setSelection = (text) => {
2994    if (!selectionBody) return;
2995    selectionBody.textContent = text;
2996  };
2997  const setHint = (text) => {
2998    if (!selectionHint) return;
2999    selectionHint.textContent = text || defaultHint;
3000  };
3001
3002  // Theme toggle: auto (prefers-color-scheme) → dark → light.
3003  const themeBtn = document.getElementById('theme-toggle');
3004  const themeKey = 'anno-theme';
3005  const applyTheme = (theme) => {
3006    const t = theme || 'auto';
3007    if (t === 'auto') {
3008      delete document.documentElement.dataset.theme;
3009    } else {
3010      document.documentElement.dataset.theme = t;
3011    }
3012    if (themeBtn) themeBtn.textContent = `theme: ${t}`;
3013  };
3014  const readTheme = () => {
3015    try { return localStorage.getItem(themeKey) || 'auto'; } catch (_) { return 'auto'; }
3016  };
3017  const writeTheme = (t) => {
3018    try { localStorage.setItem(themeKey, t); } catch (_) { /* ignore */ }
3019  };
3020  applyTheme(readTheme());
3021  if (themeBtn) {
3022    themeBtn.addEventListener('click', () => {
3023      const cur = readTheme();
3024      const next = cur === 'auto' ? 'dark' : (cur === 'dark' ? 'light' : 'auto');
3025      writeTheme(next);
3026      applyTheme(next);
3027    });
3028  }
3029
3030  let activeSignalEls = [];
3031  let activeSignalRow = null;
3032  const clearActive = () => {
3033    if (activeSignalEls && activeSignalEls.length) {
3034      activeSignalEls.forEach((el) => el.classList.remove('e-active'));
3035    }
3036    if (activeSignalRow) activeSignalRow.classList.remove('e-active');
3037    activeSignalEls = [];
3038    activeSignalRow = null;
3039  };
3040
3041  let activeTrack = null;
3042  let hoverTrack = null;
3043
3044  const removeTrackClass = (tid, cls) => {
3045    if (!tid) return;
3046    const els = trackEls.get(tid);
3047    if (!els) return;
3048    els.forEach((el) => el.classList.remove(cls));
3049  };
3050
3051  const addTrackClass = (tid, cls) => {
3052    if (!tid) return;
3053    const els = trackEls.get(tid);
3054    if (!els) return;
3055    els.forEach((el) => el.classList.add(cls));
3056  };
3057
3058  const trackSize = (tid) => {
3059    const els = tid ? trackEls.get(tid) : null;
3060    return els ? els.length : 0;
3061  };
3062
3063  const getTrackSelectionText = (tid) => {
3064    if (!tid) return 'track: - (untracked)';
3065    const row = document.querySelector(`#tracks-table tr[data-tid='${tid}']`);
3066    if (!row) return `track T${tid}`;
3067    const cells = row.querySelectorAll('td');
3068    const canonical = (cells[1]?.textContent || '').trim();
3069    const etype = (cells[2]?.textContent || '').trim();
3070    const count = (cells[3]?.textContent || '').trim();
3071    const sigs = (cells[4]?.textContent || '').trim();
3072    const lines = [];
3073    lines.push(`track T${tid} canonical="${canonical}" type="${etype}" mentions=${count}`);
3074    if (sigs) lines.push(`track signals: ${sigs}`);
3075    return lines.join('\n');
3076  };
3077
3078  const renderTrackSelection = (tid) => setSelection(getTrackSelectionText(tid));
3079
3080  const renderSignalSelectionBySid = (sid) => {
3081    const meta = signalMeta.get(sid);
3082    const label = meta ? (meta.label || '') : '';
3083    const conf = meta ? (meta.conf || '') : '';
3084    const start = meta ? meta.start : null;
3085    const end = meta ? meta.end : null;
3086    const tid = meta ? meta.track : null;
3087    const lines = [];
3088    if (start !== null && end !== null) {
3089      lines.push(`signal ${sid} label=${label} conf=${conf} span=[${start},${end})`);
3090    } else {
3091      lines.push(`signal ${sid} label=${label} conf=${conf}`);
3092    }
3093    if (meta && meta.surface) lines.push(`surface: ${meta.surface}`);
3094    lines.push('');
3095    lines.push(getTrackSelectionText(tid));
3096    setSelection(lines.join('\n'));
3097  };
3098
3099  const setActiveTrack = (tid) => {
3100    const next = tid || null;
3101    if (activeTrack === next) return;
3102    removeTrackClass(activeTrack, 'e-track');
3103    activeTrack = next;
3104    if (activeTrack) addTrackClass(activeTrack, 'e-track');
3105    if (hoverTrack && activeTrack && hoverTrack === activeTrack) {
3106      removeTrackClass(hoverTrack, 'e-track-hover');
3107    }
3108  };
3109
3110  const setHoverTrack = (tid) => {
3111    const next = tid || null;
3112    if (hoverTrack === next) return;
3113    removeTrackClass(hoverTrack, 'e-track-hover');
3114    hoverTrack = next;
3115    if (!hoverTrack) {
3116      setHint('');
3117      return;
3118    }
3119    if (activeTrack && hoverTrack === activeTrack) {
3120      setHint(`selected track T${hoverTrack} (${trackSize(hoverTrack)} mentions)`);
3121      return;
3122    }
3123    addTrackClass(hoverTrack, 'e-track-hover');
3124    setHint(`hover track T${hoverTrack} (${trackSize(hoverTrack)} mentions)`);
3125  };
3126
3127  const emitToParentSpan = (start, end) => {
3128    try {
3129      if (!window.parent || window.parent === window) return;
3130      if (start === null || end === null) return;
3131      window.parent.postMessage({ type: 'anno:activate-span', start: Number(start), end: Number(end) }, '*');
3132    } catch (_) {
3133      // ignore: best-effort bridge for iframe containers
3134    }
3135  };
3136
3137  const activateBySpan = (start, end, emit) => {
3138    if (start === null || end === null || start === undefined || end === undefined) return;
3139    // Prefer an exact signal span if present; otherwise fall back to the table row metadata.
3140    const el = document.querySelector(`span.e[data-sid][data-start='${start}'][data-end='${end}']`);
3141    if (el) {
3142      const sid = el.getAttribute('data-sid');
3143      if (sid) activateSignal(sid, emit);
3144      return;
3145    }
3146    const row = document.querySelector(`#signals-table tr[data-start='${start}'][data-end='${end}']`);
3147    if (!row) return;
3148    const sid = row.getAttribute('data-sid');
3149    if (!sid) return;
3150    activateSignal(sid, emit);
3151  };
3152
3153  const activateSignal = (sid, emit) => {
3154    clearActive();
3155    const els = signalEls.get(sid) || [];
3156    if (!els.length) return;
3157    els.forEach((el) => el.classList.add('e-active'));
3158    activeSignalEls = els;
3159    const row = document.querySelector(`#signals-table tr[data-sid='${sid}']`);
3160    if (row) {
3161      row.classList.add('e-active');
3162      activeSignalRow = row;
3163    }
3164    const primaryEl = els[0];
3165    primaryEl.scrollIntoView({ block: 'center', behavior: 'smooth' });
3166    const meta = signalMeta.get(sid);
3167    const tid = meta ? meta.track : primaryEl.getAttribute('data-track');
3168    setActiveTrack(tid);
3169    renderSignalSelectionBySid(sid);
3170    if (emit && meta && meta.start !== null && meta.end !== null) {
3171      emitToParentSpan(meta.start, meta.end);
3172    }
3173  };
3174
3175  // Table click
3176  const signalsTable = document.getElementById('signals-table');
3177  if (signalsTable) {
3178    signalsTable.addEventListener('click', (ev) => {
3179      const a = ev.target && ev.target.closest ? ev.target.closest("a[href^='#S']") : null;
3180      const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-sid]') : null;
3181      const sid = (a && a.getAttribute('href') ? a.getAttribute('href').slice(1) : null) || (row ? row.getAttribute('data-sid') : null);
3182      if (!sid) return;
3183      ev.preventDefault();
3184      activateSignal(sid, true);
3185      history.replaceState(null, '', '#' + sid);
3186    });
3187
3188    // Hover a signals row → preview track highlight
3189    signalsTable.addEventListener('mouseover', (ev) => {
3190      const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-sid]') : null;
3191      if (!row) return;
3192      const tid = row.getAttribute('data-track');
3193      setHoverTrack(tid);
3194    });
3195    signalsTable.addEventListener('mouseout', (ev) => {
3196      const to = ev.relatedTarget;
3197      if (to && signalsTable.contains(to)) return;
3198      setHoverTrack(null);
3199    });
3200  }
3201
3202  // Clicking an inline entity should also toggle active highlight.
3203  const pickPrimarySid = (el) => {
3204    if (!el) return null;
3205    const p = el.getAttribute('data-primary');
3206    if (p) return p;
3207    const raw = (el.getAttribute('data-sids') || '').trim();
3208    if (!raw) return null;
3209    const sids = raw.split(/\s+/).filter(Boolean);
3210    if (!sids.length) return null;
3211    // Prefer the shortest mention span from metadata.
3212    let best = sids[0];
3213    let bestLen = null;
3214    for (const sid of sids) {
3215      const meta = signalMeta.get(sid);
3216      const s = meta && meta.start !== null ? Number(meta.start) : null;
3217      const e = meta && meta.end !== null ? Number(meta.end) : null;
3218      const len = (s !== null && e !== null) ? (e - s) : null;
3219      if (len === null) continue;
3220      if (bestLen === null || len < bestLen) {
3221        best = sid;
3222        bestLen = len;
3223      }
3224    }
3225    return best;
3226  };
3227
3228  document.addEventListener('click', (ev) => {
3229    const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3230    if (span) {
3231      activateSignal(span.getAttribute('data-sid'), true);
3232      return;
3233    }
3234    const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3235    if (!seg) return;
3236    activateSignal(pickPrimarySid(seg), true);
3237  });
3238
3239  // Hover an inline entity → preview highlight its track
3240  document.addEventListener('mouseover', (ev) => {
3241    const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3242    if (span) {
3243      setHoverTrack(span.getAttribute('data-track'));
3244      return;
3245    }
3246    const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3247    if (!seg) return;
3248    const sid = pickPrimarySid(seg);
3249    const meta = sid ? signalMeta.get(sid) : null;
3250    setHoverTrack(meta ? meta.track : null);
3251  });
3252  document.addEventListener('mouseout', (ev) => {
3253    const span = ev.target && ev.target.closest ? ev.target.closest('span.e[data-sid]') : null;
3254    const seg = ev.target && ev.target.closest ? ev.target.closest('span.seg[data-sids]') : null;
3255    if (!span && !seg) return;
3256    const to = ev.relatedTarget;
3257    if (to && to.closest && (to.closest('span.e[data-sid]') || to.closest('span.seg[data-sids]'))) return;
3258    setHoverTrack(null);
3259  });
3260
3261  // Clicking a track row → select track (highlight + details)
3262  const tracksTable = document.getElementById('tracks-table');
3263  if (tracksTable) {
3264    tracksTable.addEventListener('click', (ev) => {
3265      const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-tid]') : null;
3266      if (!row) return;
3267      const tid = row.getAttribute('data-tid');
3268      setActiveTrack(tid);
3269      renderTrackSelection(tid);
3270    });
3271    tracksTable.addEventListener('mouseover', (ev) => {
3272      const row = ev.target && ev.target.closest ? ev.target.closest('tr[data-tid]') : null;
3273      if (!row) return;
3274      setHoverTrack(row.getAttribute('data-tid'));
3275    });
3276    tracksTable.addEventListener('mouseout', (ev) => {
3277      const to = ev.relatedTarget;
3278      if (to && tracksTable.contains(to)) return;
3279      setHoverTrack(null);
3280    });
3281  }
3282
3283  // Filter
3284  const input = document.getElementById('signal-filter');
3285  const countEl = document.getElementById('signal-filter-count');
3286  if (input && signalsTable) {
3287    const update = () => {
3288      const q = (input.value || '').trim().toLowerCase();
3289      let shown = 0;
3290      const rows = signalsTable.querySelectorAll('tr[data-sid]');
3291      rows.forEach(row => {
3292        const sid = (row.getAttribute('data-sid') || '').toLowerCase();
3293        const label = (row.getAttribute('data-label') || '').toLowerCase();
3294        const surface = (row.getAttribute('data-surface') || '').toLowerCase();
3295        const ok = !q || sid.includes(q) || label.includes(q) || surface.includes(q);
3296        row.style.display = ok ? '' : 'none';
3297        if (ok) shown += 1;
3298      });
3299      if (countEl) countEl.textContent = shown + ' shown';
3300    };
3301    input.addEventListener('input', update);
3302    update();
3303  }
3304
3305  // Panel toggles
3306  document.querySelectorAll('[data-toggle]').forEach(btn => {
3307    btn.addEventListener('click', () => {
3308      const id = btn.getAttribute('data-toggle');
3309      const panel = id ? document.getElementById(id) : null;
3310      if (!panel) return;
3311      panel.classList.toggle('panel-collapsed');
3312    });
3313  });
3314
3315  // If URL hash is #S123, focus it.
3316  const hash = (location.hash || '').slice(1);
3317  if (hash && hash.startsWith('S')) activateSignal(hash, false);
3318
3319  // Optional: allow parent pages (e.g., dataset explorers) to sync selection across iframes.
3320  window.addEventListener('message', (ev) => {
3321    const data = ev && ev.data ? ev.data : null;
3322    if (!data || data.type !== 'anno:activate-span') return;
3323    if (typeof data.start !== 'number' || typeof data.end !== 'number') return;
3324    activateBySpan(data.start, data.end, false);
3325  });
3326})();
3327</script>"#);
3328
3329    html.push_str(r#"</body></html>"#);
3330    html
3331}
3332
/// Escape the characters `&`, `<`, `>` and `"` so `s` can be embedded in HTML
/// text content or inside a double-quoted attribute value.
///
/// All other characters (including `'`) are copied through unchanged.
fn html_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
3339
3340fn annotate_text_html(
3341    text: &str,
3342    signals: &[Signal<Location>],
3343    signal_to_track: &std::collections::HashMap<SignalId, TrackId>,
3344) -> String {
3345    let char_count = text.chars().count();
3346    if char_count == 0 {
3347        return String::new();
3348    }
3349
3350    #[derive(Debug, Clone)]
3351    struct SigMeta {
3352        sid: String,
3353        label: String,
3354        conf: f64,
3355        track_id: Option<TrackId>,
3356        covered_len: usize,
3357    }
3358
3359    #[derive(Debug, Clone)]
3360    struct Event {
3361        pos: usize,
3362        meta_idx: usize,
3363        delta: i32, // -1 end, +1 start
3364    }
3365
3366    // Collect text segments for each signal (supports discontinuous spans).
3367    let mut metas: Vec<SigMeta> = Vec::new();
3368    let mut events: Vec<Event> = Vec::new();
3369    let mut boundaries: Vec<usize> = vec![0, char_count];
3370
3371    for s in signals {
3372        let raw_segments: Vec<(usize, usize)> = match &s.location {
3373            Location::Text { start, end } => vec![(*start, *end)],
3374            Location::Discontinuous { segments } => segments.clone(),
3375        };
3376        if raw_segments.is_empty() {
3377            continue;
3378        }
3379
3380        let mut cleaned: Vec<(usize, usize)> = Vec::new();
3381        let mut covered_len = 0usize;
3382        for (start, end) in raw_segments {
3383            let start = start.min(char_count);
3384            let end = end.min(char_count);
3385            if start >= end {
3386                continue;
3387            }
3388            covered_len = covered_len.saturating_add(end - start);
3389            cleaned.push((start, end));
3390        }
3391        if cleaned.is_empty() {
3392            continue;
3393        }
3394
3395        let meta_idx = metas.len();
3396        let track_id = signal_to_track.get(&s.id).copied();
3397        metas.push(SigMeta {
3398            sid: format!("S{}", s.id),
3399            label: s.label.to_string(),
3400            conf: s.confidence.value(),
3401            track_id,
3402            covered_len,
3403        });
3404
3405        for (start, end) in cleaned {
3406            boundaries.push(start);
3407            boundaries.push(end);
3408            events.push(Event {
3409                pos: start,
3410                meta_idx,
3411                delta: 1,
3412            });
3413            events.push(Event {
3414                pos: end,
3415                meta_idx,
3416                delta: -1,
3417            });
3418        }
3419    }
3420
3421    if metas.is_empty() {
3422        return html_escape(text);
3423    }
3424
3425    boundaries.sort_unstable();
3426    boundaries.dedup();
3427    events.sort_by(|a, b| a.pos.cmp(&b.pos).then_with(|| a.delta.cmp(&b.delta)));
3428
3429    let mut active_counts: Vec<u32> = vec![0; metas.len()];
3430    let mut active: Vec<usize> = Vec::new();
3431    let mut ev_idx = 0usize;
3432
3433    let mut result = String::new();
3434
3435    for bi in 0..boundaries.len().saturating_sub(1) {
3436        let pos = boundaries[bi];
3437        // Apply all events at this boundary.
3438        while ev_idx < events.len() && events[ev_idx].pos == pos {
3439            let e = &events[ev_idx];
3440            let idx = e.meta_idx;
3441            if e.delta < 0 {
3442                if active_counts[idx] > 0 {
3443                    active_counts[idx] -= 1;
3444                    if active_counts[idx] == 0 {
3445                        active.retain(|&x| x != idx);
3446                    }
3447                }
3448            } else {
3449                active_counts[idx] += 1;
3450                if active_counts[idx] == 1 {
3451                    active.push(idx);
3452                }
3453            }
3454            ev_idx += 1;
3455        }
3456
3457        let next = boundaries[bi + 1];
3458        if next <= pos {
3459            continue;
3460        }
3461
3462        let seg_text: String = text.chars().skip(pos).take(next - pos).collect();
3463        if active.is_empty() {
3464            result.push_str(&html_escape(&seg_text));
3465            continue;
3466        }
3467
3468        // Determine primary (for coloring + click default): shortest covered len, then highest conf.
3469        let primary_idx = active
3470            .iter()
3471            .copied()
3472            .min_by(|a, b| {
3473                metas[*a]
3474                    .covered_len
3475                    .cmp(&metas[*b].covered_len)
3476                    .then_with(|| {
3477                        metas[*b]
3478                            .conf
3479                            .partial_cmp(&metas[*a].conf)
3480                            .unwrap_or(std::cmp::Ordering::Equal)
3481                    })
3482            })
3483            .unwrap_or(active[0]);
3484        let primary = &metas[primary_idx];
3485
3486        let class = match primary.label.to_uppercase().as_str() {
3487            "PER" | "PERSON" => "e-per",
3488            "ORG" | "ORGANIZATION" | "COMPANY" => "e-org",
3489            "LOC" | "LOCATION" | "GPE" => "e-loc",
3490            "DATE" | "TIME" => "e-date",
3491            _ => "e-misc",
3492        };
3493
3494        let mut sids: Vec<&str> = active.iter().map(|i| metas[*i].sid.as_str()).collect();
3495        sids.sort_unstable();
3496        let data_sids = sids.join(" ");
3497
3498        let mut title = format!(
3499            "sids=[{}] primary={} [{}..{})",
3500            data_sids, primary.sid, pos, next
3501        );
3502        if let Some(t) = primary.track_id {
3503            title.push_str(&format!(" track=T{}", t));
3504        }
3505
3506        result.push_str(&format!(
3507            r#"<span class="e seg {class}" data-sids="{sids}" data-start="{start}" data-end="{end}" data-primary="{primary}" title="{title}">{text}</span>"#,
3508            class = class,
3509            sids = html_escape(&data_sids),
3510            start = pos,
3511            end = next,
3512            primary = html_escape(&primary.sid),
3513            title = html_escape(&title),
3514            text = html_escape(&seg_text),
3515        ));
3516    }
3517
3518    result
3519}
3520
3521// =============================================================================
3522// Eval Comparison HTML Rendering
3523// =============================================================================
3524
/// Comparison between gold (ground truth) and predicted entities.
///
/// Bundles the document text, both signal sets, and the per-signal match results
/// that the count and precision/recall/F1 methods read.
/// [`EvalComparison::compare`] builds one from raw gold and predicted signals.
#[derive(Debug, Clone)]
pub struct EvalComparison {
    /// Document text
    pub text: String,
    /// Gold/ground truth signals
    pub gold: Vec<Signal<Location>>,
    /// Predicted signals
    pub predicted: Vec<Signal<Location>>,
    /// Match results. As produced by `compare`: one entry per gold/predicted pairing,
    /// followed by one entry for each unpaired prediction and each unpaired gold signal.
    pub matches: Vec<EvalMatch>,
}
3537
/// Result of matching a gold or predicted signal.
///
/// The paired variants reference one gold and one predicted signal by id; `Spurious`
/// and `Missed` reference the single signal that was left unpaired.
#[derive(Debug, Clone)]
pub enum EvalMatch {
    /// Exact match: gold and predicted align perfectly
    /// (`EvalComparison::compare` requires equal text offsets and equal labels).
    Correct {
        /// Gold signal ID
        gold_id: SignalId,
        /// Predicted signal ID
        pred_id: SignalId,
    },
    /// Type mismatch: same span, different label.
    TypeMismatch {
        /// Gold signal ID
        gold_id: SignalId,
        /// Predicted signal ID
        pred_id: SignalId,
        /// Gold label
        gold_label: String,
        /// Predicted label
        pred_label: String,
    },
    /// Boundary error: overlapping but not exact span.
    BoundaryError {
        /// Gold signal ID
        gold_id: SignalId,
        /// Predicted signal ID
        pred_id: SignalId,
        /// Intersection over Union of the two locations
        /// (`EvalComparison::compare` stores `0.0` when the IoU lookup yields `None`).
        iou: f64,
    },
    /// False positive: predicted with no gold match.
    Spurious {
        /// Predicted signal ID
        pred_id: SignalId,
    },
    /// False negative: gold with no prediction.
    Missed {
        /// Gold signal ID
        gold_id: SignalId,
    },
}
3579
3580impl EvalComparison {
3581    /// Create a comparison from gold and predicted entities.
3582    ///
3583    /// # Example
3584    ///
3585    /// ```rust
3586    /// use anno_core::core::grounded::{EvalComparison};
3587    /// use anno_core::{Signal, Location};
3588    ///
3589    /// let text = "Marie Curie won the Nobel Prize.";
3590    /// let gold = vec![
3591    ///     Signal::new(0, Location::text(0, 11), "Marie Curie", "PER", 1.0),
3592    ///     Signal::new(1, Location::text(20, 31), "Nobel Prize", "AWARD", 1.0),
3593    /// ];
3594    /// let pred = vec![
3595    ///     Signal::new(0, Location::text(0, 11), "Marie Curie", "PER", 0.95),
3596    /// ];
3597    /// let cmp = EvalComparison::compare(text, gold, pred);
3598    /// assert_eq!(cmp.matches.len(), 2); // 1 correct, 1 missed
3599    /// ```
3600    #[must_use]
3601    pub fn compare(
3602        text: &str,
3603        gold: Vec<Signal<Location>>,
3604        predicted: Vec<Signal<Location>>,
3605    ) -> Self {
3606        let mut matches = Vec::new();
3607        let mut gold_matched = vec![false; gold.len()];
3608        let mut pred_matched = vec![false; predicted.len()];
3609
3610        // First pass: find exact matches and type mismatches
3611        for (pi, pred) in predicted.iter().enumerate() {
3612            let pred_offsets = match pred.location.text_offsets() {
3613                Some(o) => o,
3614                None => continue,
3615            };
3616
3617            for (gi, g) in gold.iter().enumerate() {
3618                if gold_matched[gi] {
3619                    continue;
3620                }
3621                let gold_offsets = match g.location.text_offsets() {
3622                    Some(o) => o,
3623                    None => continue,
3624                };
3625
3626                // Exact span match
3627                if pred_offsets == gold_offsets {
3628                    if pred.label == g.label {
3629                        matches.push(EvalMatch::Correct {
3630                            gold_id: g.id,
3631                            pred_id: pred.id,
3632                        });
3633                    } else {
3634                        matches.push(EvalMatch::TypeMismatch {
3635                            gold_id: g.id,
3636                            pred_id: pred.id,
3637                            gold_label: g.label.to_string(),
3638                            pred_label: pred.label.to_string(),
3639                        });
3640                    }
3641                    gold_matched[gi] = true;
3642                    pred_matched[pi] = true;
3643                    break;
3644                }
3645            }
3646        }
3647
3648        // Second pass: find boundary errors (overlapping but not exact)
3649        for (pi, pred) in predicted.iter().enumerate() {
3650            if pred_matched[pi] {
3651                continue;
3652            }
3653            let pred_offsets = match pred.location.text_offsets() {
3654                Some(o) => o,
3655                None => continue,
3656            };
3657
3658            for (gi, g) in gold.iter().enumerate() {
3659                if gold_matched[gi] {
3660                    continue;
3661                }
3662                let gold_offsets = match g.location.text_offsets() {
3663                    Some(o) => o,
3664                    None => continue,
3665                };
3666
3667                // Check overlap
3668                if pred_offsets.0 < gold_offsets.1 && pred_offsets.1 > gold_offsets.0 {
3669                    let iou = pred.location.iou(&g.location).unwrap_or(0.0);
3670                    matches.push(EvalMatch::BoundaryError {
3671                        gold_id: g.id,
3672                        pred_id: pred.id,
3673                        iou,
3674                    });
3675                    gold_matched[gi] = true;
3676                    pred_matched[pi] = true;
3677                    break;
3678                }
3679            }
3680        }
3681
3682        // Remaining unmatched predictions are spurious
3683        for (pi, pred) in predicted.iter().enumerate() {
3684            if !pred_matched[pi] {
3685                matches.push(EvalMatch::Spurious { pred_id: pred.id });
3686            }
3687        }
3688
3689        // Remaining unmatched gold are missed
3690        for (gi, g) in gold.iter().enumerate() {
3691            if !gold_matched[gi] {
3692                matches.push(EvalMatch::Missed { gold_id: g.id });
3693            }
3694        }
3695
3696        Self {
3697            text: text.to_string(),
3698            gold,
3699            predicted,
3700            matches,
3701        }
3702    }
3703
3704    /// Count correct matches.
3705    #[must_use]
3706    pub fn correct_count(&self) -> usize {
3707        self.matches
3708            .iter()
3709            .filter(|m| matches!(m, EvalMatch::Correct { .. }))
3710            .count()
3711    }
3712
3713    /// Count errors (type mismatch + boundary + spurious + missed).
3714    #[must_use]
3715    pub fn error_count(&self) -> usize {
3716        self.matches.len() - self.correct_count()
3717    }
3718
3719    /// Calculate precision.
3720    #[must_use]
3721    pub fn precision(&self) -> f64 {
3722        if self.predicted.is_empty() {
3723            0.0
3724        } else {
3725            self.correct_count() as f64 / self.predicted.len() as f64
3726        }
3727    }
3728
3729    /// Calculate recall.
3730    #[must_use]
3731    pub fn recall(&self) -> f64 {
3732        if self.gold.is_empty() {
3733            0.0
3734        } else {
3735            self.correct_count() as f64 / self.gold.len() as f64
3736        }
3737    }
3738
3739    /// Calculate F1.
3740    #[must_use]
3741    pub fn f1(&self) -> f64 {
3742        let p = self.precision();
3743        let r = self.recall();
3744        if p + r > 0.0 {
3745            2.0 * p * r / (p + r)
3746        } else {
3747            0.0
3748        }
3749    }
3750}
3751
3752/// Render an eval comparison as HTML.
3753///
3754/// Shows gold vs predicted side by side with error highlighting.
3755pub fn render_eval_html(cmp: &EvalComparison) -> String {
3756    render_eval_html_with_title(cmp, "eval comparison")
3757}
3758
3759/// Render an eval comparison as HTML, with a custom title.
3760///
3761/// The title is used for both the page `<title>` and the top `<h1>`.
3762#[must_use]
3763pub fn render_eval_html_with_title(cmp: &EvalComparison, title: &str) -> String {
3764    let mut html = String::new();
3765    let title = html_escape(title);
3766
3767    html.push_str(
3768        r#"<!DOCTYPE html>
3769<html>
3770<head>
3771<meta charset="UTF-8">
3772<meta name="color-scheme" content="dark light">
3773"#,
3774    );
3775    html.push_str(&format!("<title>{}</title>", title));
3776    html.push_str(r#"
3777:root{
3778  color-scheme: light dark;
3779  --bg:#0a0a0a;
3780  --panel-bg:#0d0d0d;
3781  --text:#b0b0b0;
3782  --text-strong:#fff;
3783  --muted:#666;
3784  --border:#222;
3785  --border-strong:#333;
3786  --hover:#111;
3787  --input-bg:#080808;
3788  --active:#ddd;
3789  /* Eval entity colors (dark) */
3790  --gold-bg:#1a2e1a; --gold-br:#4a8a4a; --gold-tx:#88cc88;
3791  --pred-bg:#1a1a2e; --pred-br:#4a4a8a; --pred-tx:#8888cc;
3792  /* Match row borders */
3793  --m-ok:#4a8a4a;
3794  --m-type:#8a8a4a;
3795  --m-bound:#4a8a8a;
3796  --m-fp:#8a4a4a;
3797  --m-fn:#8a4a8a;
3798}
3799@media (prefers-color-scheme: light){
3800  :root{
3801    --bg:#ffffff;
3802    --panel-bg:#f7f7f7;
3803    --text:#222;
3804    --text-strong:#000;
3805    --muted:#555;
3806    --border:#d6d6d6;
3807    --border-strong:#c6c6c6;
3808    --hover:#f0f0f0;
3809    --input-bg:#ffffff;
3810    --active:#000;
3811    --gold-bg:#e9f7e9; --gold-br:#2f8a2f; --gold-tx:#1f5a1f;
3812    --pred-bg:#e9e9ff; --pred-br:#6c6cff; --pred-tx:#2b2b7a;
3813    --m-ok:#2f8a2f;
3814    --m-type:#8a7a2f;
3815    --m-bound:#2f7a8a;
3816    --m-fp:#8a2f2f;
3817    --m-fn:#6a2f8a;
3818  }
3819}
3820html[data-theme='dark']{
3821  --bg:#0a0a0a; --panel-bg:#0d0d0d; --text:#b0b0b0; --text-strong:#fff;
3822  --muted:#666; --border:#222; --border-strong:#333; --hover:#111; --input-bg:#080808; --active:#ddd;
3823  --gold-bg:#1a2e1a; --gold-br:#4a8a4a; --gold-tx:#88cc88;
3824  --pred-bg:#1a1a2e; --pred-br:#4a4a8a; --pred-tx:#8888cc;
3825  --m-ok:#4a8a4a; --m-type:#8a8a4a; --m-bound:#4a8a8a; --m-fp:#8a4a4a; --m-fn:#8a4a8a;
3826}
3827html[data-theme='light']{
3828  --bg:#ffffff; --panel-bg:#f7f7f7; --text:#222; --text-strong:#000;
3829  --muted:#555; --border:#d6d6d6; --border-strong:#c6c6c6; --hover:#f0f0f0; --input-bg:#ffffff; --active:#000;
3830  --gold-bg:#e9f7e9; --gold-br:#2f8a2f; --gold-tx:#1f5a1f;
3831  --pred-bg:#e9e9ff; --pred-br:#6c6cff; --pred-tx:#2b2b7a;
3832  --m-ok:#2f8a2f; --m-type:#8a7a2f; --m-bound:#2f7a8a; --m-fp:#8a2f2f; --m-fn:#6a2f8a;
3833}
3834
3835<style>
3836*{box-sizing:border-box;margin:0;padding:0}
3837body{font:12px/1.4 monospace;background:var(--bg);color:var(--text);padding:8px}
3838h1,h2{color:var(--text-strong);font-weight:normal;border-bottom:1px solid var(--border-strong);padding:4px 0;margin:16px 0 8px}
3839h1{font-size:14px}h2{font-size:12px}
3840table{width:100%;border-collapse:collapse;font-size:11px;margin:4px 0}
3841th,td{padding:4px 8px;text-align:left;border:1px solid var(--border)}
3842th{background:var(--hover);color:var(--muted);font-weight:normal;text-transform:uppercase;font-size:10px}
3843tr:hover{background:var(--hover)}
3844.grid{display:grid;grid-template-columns:1fr 1fr;gap:8px}
3845.panel{border:1px solid var(--border);background:var(--panel-bg);padding:8px}
3846.text-box{background:var(--input-bg);border:1px solid var(--border);padding:8px;white-space:pre-wrap;word-break:break-word;line-height:1.6}
3847.stats{display:flex;gap:24px;padding:8px 0;border-bottom:1px solid var(--border);margin-bottom:8px}
3848.stat{text-align:center}.stat-v{font-size:18px;color:var(--text-strong)}.stat-l{font-size:9px;color:var(--muted);text-transform:uppercase}
3849/* Entities */
3850.e{padding:1px 2px;border-bottom:2px solid}
3851.seg{cursor:pointer}
3852.e-gold{background:var(--gold-bg);border-color:var(--gold-br);color:var(--gold-tx)}
3853.e-pred{background:var(--pred-bg);border-color:var(--pred-br);color:var(--pred-tx)}
3854.e-active{outline:1px solid var(--active);outline-offset:1px}
3855/* Match types */
3856.correct{background:#1a2e1a;border-color:#4a8a4a}
3857.type-err{background:#2e2e1a;border-color:#8a8a4a}
3858.boundary{background:#1a2e2e;border-color:#4a8a8a}
3859.spurious{background:#2e1a1a;border-color:#8a4a4a}
3860.missed{background:#2e1a2e;border-color:#8a4a8a}
3861.match-row.correct{border-left:3px solid var(--m-ok)}
3862.match-row.type-err{border-left:3px solid var(--m-type)}
3863.match-row.boundary{border-left:3px solid var(--m-bound)}
3864.match-row.spurious{border-left:3px solid var(--m-fp)}
3865.match-row.missed{border-left:3px solid var(--m-fn)}
3866.match-row.active{outline:1px solid var(--muted)}
3867.sel{color:var(--muted);margin:6px 0 12px}
3868.metric{font-size:14px;color:var(--muted)}.metric b{color:var(--text-strong)}
3869</style>
3870</head>
3871<body>
3872"#);
3873
3874    // Header (with theme toggle)
3875    html.push_str(&format!(
3876        "<div class=\"panel-h\" style=\"justify-content:space-between\"><h1>{}</h1><span class=\"toggle\" id=\"theme-toggle\" title=\"toggle theme (auto → dark → light)\">theme: auto</span></div>",
3877        title
3878    ));
3879
3880    // Metrics bar
3881    html.push_str("<div class=\"stats\">");
3882    html.push_str(&format!(
3883        "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">gold</div></div>",
3884        cmp.gold.len()
3885    ));
3886    html.push_str(&format!(
3887        "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">predicted</div></div>",
3888        cmp.predicted.len()
3889    ));
3890    html.push_str(&format!(
3891        "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">correct</div></div>",
3892        cmp.correct_count()
3893    ));
3894    html.push_str(&format!(
3895        "<div class=\"stat\"><div class=\"stat-v\">{}</div><div class=\"stat-l\">errors</div></div>",
3896        cmp.error_count()
3897    ));
3898    html.push_str(&format!(
3899        "<div class=\"metric\">P=<b>{:.1}%</b> R=<b>{:.1}%</b> F1=<b>{:.1}%</b></div>",
3900        cmp.precision() * 100.0,
3901        cmp.recall() * 100.0,
3902        cmp.f1() * 100.0
3903    ));
3904    html.push_str("</div>");
3905
3906    // Simple selection readout (helps debugging + browser-based verification)
3907    html.push_str("<div id=\"selection\" class=\"sel\">click a match row to select spans</div>");
3908
3909    // Side-by-side text
3910    html.push_str("<div class=\"grid\">");
3911
3912    // Gold panel
3913    html.push_str("<div class=\"panel\"><h2>gold (ground truth)</h2><div class=\"text-box\">");
3914    let gold_spans: Vec<EvalHtmlSpan> = cmp
3915        .gold
3916        .iter()
3917        .map(|s| {
3918            let (start, end) = s.location.text_offsets().unwrap_or((0, 0));
3919            EvalHtmlSpan {
3920                start,
3921                end,
3922                label: s.label.to_string(),
3923                class: "e-gold",
3924                id: format!("G{}", s.id),
3925            }
3926        })
3927        .collect();
3928    html.push_str(&annotate_text_spans(&cmp.text, &gold_spans));
3929    html.push_str("</div></div>");
3930
3931    // Predicted panel
3932    html.push_str("<div class=\"panel\"><h2>predicted</h2><div class=\"text-box\">");
3933    let pred_spans: Vec<EvalHtmlSpan> = cmp
3934        .predicted
3935        .iter()
3936        .map(|s| {
3937            let (start, end) = s.location.text_offsets().unwrap_or((0, 0));
3938            EvalHtmlSpan {
3939                start,
3940                end,
3941                label: s.label.to_string(),
3942                class: "e-pred",
3943                id: format!("P{}", s.id),
3944            }
3945        })
3946        .collect();
3947    html.push_str(&annotate_text_spans(&cmp.text, &pred_spans));
3948    html.push_str("</div></div>");
3949
3950    html.push_str("</div>");
3951
3952    // Match table
3953    html.push_str("<h2>matches</h2><table>");
3954    html.push_str("<tr><th>type</th><th>gold</th><th>predicted</th><th>notes</th></tr>");
3955
3956    for (mi, m) in cmp.matches.iter().enumerate() {
3957        let (class, mtype, gold_text, pred_text, notes, gid, pid) = match m {
3958            EvalMatch::Correct { gold_id, pred_id } => {
3959                let g = cmp.gold.iter().find(|s| s.id == *gold_id);
3960                let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
3961                (
3962                    "correct",
3963                    "✓",
3964                    g.map(|s| format!("[{}] {}", s.label, s.surface()))
3965                        .unwrap_or_default(),
3966                    p.map(|s| format!("[{}] {}", s.label, s.surface()))
3967                        .unwrap_or_default(),
3968                    String::new(),
3969                    Some(format!("G{}", gold_id)),
3970                    Some(format!("P{}", pred_id)),
3971                )
3972            }
3973            EvalMatch::TypeMismatch {
3974                gold_id,
3975                pred_id,
3976                gold_label,
3977                pred_label,
3978            } => {
3979                let g = cmp.gold.iter().find(|s| s.id == *gold_id);
3980                let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
3981                (
3982                    "type-err",
3983                    "type",
3984                    g.map(|s| format!("[{}] {}", s.label, s.surface()))
3985                        .unwrap_or_default(),
3986                    p.map(|s| format!("[{}] {}", s.label, s.surface()))
3987                        .unwrap_or_default(),
3988                    format!("{} → {}", gold_label, pred_label),
3989                    Some(format!("G{}", gold_id)),
3990                    Some(format!("P{}", pred_id)),
3991                )
3992            }
3993            EvalMatch::BoundaryError {
3994                gold_id,
3995                pred_id,
3996                iou,
3997            } => {
3998                let g = cmp.gold.iter().find(|s| s.id == *gold_id);
3999                let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4000                (
4001                    "boundary",
4002                    "bound",
4003                    g.map(|s| format!("[{}] \"{}\"", s.label, s.surface()))
4004                        .unwrap_or_default(),
4005                    p.map(|s| format!("[{}] \"{}\"", s.label, s.surface()))
4006                        .unwrap_or_default(),
4007                    format!("IoU={:.2}", iou),
4008                    Some(format!("G{}", gold_id)),
4009                    Some(format!("P{}", pred_id)),
4010                )
4011            }
4012            EvalMatch::Spurious { pred_id } => {
4013                let p = cmp.predicted.iter().find(|s| s.id == *pred_id);
4014                (
4015                    "spurious",
4016                    "FP",
4017                    String::new(),
4018                    p.map(|s| format!("[{}] {}", s.label, s.surface()))
4019                        .unwrap_or_default(),
4020                    "false positive".to_string(),
4021                    None,
4022                    Some(format!("P{}", pred_id)),
4023                )
4024            }
4025            EvalMatch::Missed { gold_id } => {
4026                let g = cmp.gold.iter().find(|s| s.id == *gold_id);
4027                (
4028                    "missed",
4029                    "FN",
4030                    g.map(|s| format!("[{}] {}", s.label, s.surface()))
4031                        .unwrap_or_default(),
4032                    String::new(),
4033                    "false negative".to_string(),
4034                    Some(format!("G{}", gold_id)),
4035                    None,
4036                )
4037            }
4038        };
4039
4040        let mut data_attrs = String::new();
4041        if let Some(gid) = gid.as_deref() {
4042            data_attrs.push_str(&format!(" data-gid=\"{}\"", html_escape(gid)));
4043        }
4044        if let Some(pid) = pid.as_deref() {
4045            data_attrs.push_str(&format!(" data-pid=\"{}\"", html_escape(pid)));
4046        }
4047
4048        html.push_str(&format!(
4049            "<tr id=\"M{mid}\" class=\"match-row {class}\"{attrs}><td><a class=\"match-link\" href=\"#M{mid}\">{mtype}</a></td><td>{gold}</td><td>{pred}</td><td>{notes}</td></tr>",
4050            mid = mi,
4051            class = class,
4052            attrs = data_attrs,
4053            mtype = html_escape(mtype),
4054            gold = html_escape(&gold_text),
4055            pred = html_escape(&pred_text),
4056            notes = html_escape(&notes)
4057        ));
4058    }
4059    html.push_str("</table>");
4060
4061    html.push_str(
4062        r#"<script>
4063(() => {
4064  // Theme toggle: auto (prefers-color-scheme) → dark → light.
4065  const themeBtn = document.getElementById('theme-toggle');
4066  const themeKey = 'anno-theme';
4067  const applyTheme = (theme) => {
4068    const t = theme || 'auto';
4069    if (t === 'auto') {
4070      delete document.documentElement.dataset.theme;
4071    } else {
4072      document.documentElement.dataset.theme = t;
4073    }
4074    if (themeBtn) themeBtn.textContent = `theme: ${t}`;
4075  };
4076  const readTheme = () => {
4077    try { return localStorage.getItem(themeKey) || 'auto'; } catch (_) { return 'auto'; }
4078  };
4079  const writeTheme = (t) => {
4080    try { localStorage.setItem(themeKey, t); } catch (_) { /* ignore */ }
4081  };
4082  applyTheme(readTheme());
4083  if (themeBtn) {
4084    themeBtn.addEventListener('click', () => {
4085      const cur = readTheme();
4086      const next = cur === 'auto' ? 'dark' : (cur === 'dark' ? 'light' : 'auto');
4087      writeTheme(next);
4088      applyTheme(next);
4089    });
4090  }
4091
4092  function clearActive() {
4093    document.querySelectorAll(".e-active").forEach((el) => el.classList.remove("e-active"));
4094    document.querySelectorAll("tr.match-row.active").forEach((el) => el.classList.remove("active"));
4095  }
4096
4097  function findSpanEls(eid) {
4098    if (!eid) return [];
4099    // New segmented renderer: one span can be split across multiple elements.
4100    const els = Array.from(document.querySelectorAll(`span.e[data-eids~='${eid}']`));
4101    if (els.length) return els;
4102    // Back-compat: older HTML used a single element id.
4103    const single = document.getElementById(eid);
4104    return single ? [single] : [];
4105  }
4106
4107  function activate(gid, pid, row) {
4108    clearActive();
4109    const gEls = findSpanEls(gid);
4110    const pEls = findSpanEls(pid);
4111    const sel = document.getElementById("selection");
4112    gEls.forEach((el) => el.classList.add("e-active"));
4113    pEls.forEach((el) => el.classList.add("e-active"));
4114    if (row) row.classList.add("active");
4115    if (sel) {
4116      const parts = [];
4117      if (gEls.length) {
4118        const lbl = gEls[0].dataset && gEls[0].dataset.label ? ` [${gEls[0].dataset.label}]` : "";
4119        parts.push(`gold ${gid}${lbl}`);
4120      }
4121      if (pEls.length) {
4122        const lbl = pEls[0].dataset && pEls[0].dataset.label ? ` [${pEls[0].dataset.label}]` : "";
4123        parts.push(`pred ${pid}${lbl}`);
4124      }
4125      sel.textContent = parts.length ? parts.join("  |  ") : "no selection";
4126    }
4127    if (row && row.id) {
4128      // Keep deep links stable without triggering navigation jump.
4129      // NOTE: single quotes avoid the Rust raw-string delimiter issue with quote+hash.
4130      history.replaceState(null, "", '#' + row.id);
4131    }
4132    const target = gEls[0] || pEls[0];
4133    if (target) target.scrollIntoView({ behavior: "smooth", block: "center" });
4134  }
4135
4136  document.querySelectorAll("tr.match-row[data-gid], tr.match-row[data-pid]").forEach((tr) => {
4137    tr.addEventListener("click", () => activate(tr.dataset.gid, tr.dataset.pid, tr));
4138  });
4139
4140  document.querySelectorAll("a.match-link").forEach((a) => {
4141    a.addEventListener("click", (ev) => {
4142      ev.preventDefault();
4143      const tr = a.closest("tr.match-row");
4144      if (!tr) return;
4145      activate(tr.dataset.gid, tr.dataset.pid, tr);
4146    });
4147  });
4148
4149  // Auto-select a match row if the URL has a deep link (e.g. #M12).
4150  const hash = (location.hash || "").slice(1);
4151  if (hash && hash.startsWith("M")) {
4152    const tr = document.getElementById(hash);
4153    if (tr && tr.classList && tr.classList.contains("match-row")) {
4154      activate(tr.dataset.gid, tr.dataset.pid, tr);
4155    }
4156  }
4157})();
4158</script>"#,
4159    );
4160
4161    html.push_str("</body></html>");
4162    html
4163}
4164
4165/// Annotate text with multiple labeled spans.
4166#[derive(Debug, Clone)]
4167struct EvalHtmlSpan {
4168    start: usize,
4169    end: usize,
4170    label: String,
4171    class: &'static str,
4172    id: String,
4173}
4174
4175fn annotate_text_spans(text: &str, spans: &[EvalHtmlSpan]) -> String {
4176    let char_count = text.chars().count();
4177    if char_count == 0 || spans.is_empty() {
4178        return html_escape(text);
4179    }
4180
4181    #[derive(Debug, Clone)]
4182    struct Meta {
4183        id: String,
4184        label: String,
4185        class: &'static str,
4186        len: usize,
4187    }
4188    #[derive(Debug, Clone)]
4189    struct Event {
4190        pos: usize,
4191        meta_idx: usize,
4192        delta: i32,
4193    }
4194
4195    let mut metas: Vec<Meta> = Vec::with_capacity(spans.len());
4196    let mut events: Vec<Event> = Vec::new();
4197    let mut boundaries: Vec<usize> = vec![0, char_count];
4198
4199    for s in spans {
4200        let start = s.start.min(char_count);
4201        let end = s.end.min(char_count);
4202        if start >= end {
4203            continue;
4204        }
4205        let meta_idx = metas.len();
4206        metas.push(Meta {
4207            id: s.id.clone(),
4208            label: s.label.to_string(),
4209            class: s.class,
4210            len: end - start,
4211        });
4212        boundaries.push(start);
4213        boundaries.push(end);
4214        events.push(Event {
4215            pos: start,
4216            meta_idx,
4217            delta: 1,
4218        });
4219        events.push(Event {
4220            pos: end,
4221            meta_idx,
4222            delta: -1,
4223        });
4224    }
4225
4226    if metas.is_empty() {
4227        return html_escape(text);
4228    }
4229
4230    boundaries.sort_unstable();
4231    boundaries.dedup();
4232    events.sort_by(|a, b| a.pos.cmp(&b.pos).then_with(|| a.delta.cmp(&b.delta)));
4233
4234    let mut active_counts: Vec<u32> = vec![0; metas.len()];
4235    let mut active: Vec<usize> = Vec::new();
4236    let mut ev_idx = 0usize;
4237    let mut result = String::new();
4238
4239    for bi in 0..boundaries.len().saturating_sub(1) {
4240        let pos = boundaries[bi];
4241        while ev_idx < events.len() && events[ev_idx].pos == pos {
4242            let e = &events[ev_idx];
4243            let idx = e.meta_idx;
4244            if e.delta < 0 {
4245                if active_counts[idx] > 0 {
4246                    active_counts[idx] -= 1;
4247                    if active_counts[idx] == 0 {
4248                        active.retain(|&x| x != idx);
4249                    }
4250                }
4251            } else {
4252                active_counts[idx] += 1;
4253                if active_counts[idx] == 1 {
4254                    active.push(idx);
4255                }
4256            }
4257            ev_idx += 1;
4258        }
4259
4260        let next = boundaries[bi + 1];
4261        if next <= pos {
4262            continue;
4263        }
4264
4265        let seg_text: String = text.chars().skip(pos).take(next - pos).collect();
4266        if active.is_empty() {
4267            result.push_str(&html_escape(&seg_text));
4268            continue;
4269        }
4270
4271        let primary_idx = active
4272            .iter()
4273            .copied()
4274            .min_by_key(|i| metas[*i].len)
4275            .unwrap_or(active[0]);
4276        let primary = &metas[primary_idx];
4277        let mut eids: Vec<&str> = active.iter().map(|i| metas[*i].id.as_str()).collect();
4278        eids.sort_unstable();
4279        let data_eids = eids.join(" ");
4280
4281        let title = format!(
4282            "eids=[{}] primary={} [{}..{})",
4283            data_eids, primary.id, pos, next
4284        );
4285        result.push_str(&format!(
4286            "<span class=\"e seg {class}\" data-eids=\"{eids}\" data-label=\"{label}\" data-start=\"{start}\" data-end=\"{end}\" title=\"{title}\">{text}</span>",
4287            class = primary.class,
4288            eids = html_escape(&data_eids),
4289            label = html_escape(&primary.label),
4290            start = pos,
4291            end = next,
4292            title = html_escape(&title),
4293            text = html_escape(&seg_text)
4294        ));
4295    }
4296
4297    result
4298}
4299
4300// =============================================================================
4301// URL/Text Input Processing
4302// =============================================================================
4303
4304/// Options for processing arbitrary input.
4305#[derive(Debug, Clone, Default)]
4306pub struct ProcessOptions {
4307    /// Labels to extract (empty = all)
4308    pub labels: Vec<String>,
4309    /// Confidence threshold
4310    pub threshold: Confidence,
4311}
4312
4313/// Result of processing input.
4314#[derive(Debug)]
4315pub struct ProcessResult {
4316    /// The document with signals
4317    pub document: GroundedDocument,
4318    /// Whether validation passed
4319    pub valid: bool,
4320    /// Any validation errors
4321    pub errors: Vec<SignalValidationError>,
4322}
4323
4324impl ProcessResult {
4325    /// Render as HTML.
4326    #[must_use]
4327    pub fn to_html(&self) -> String {
4328        render_document_html(&self.document)
4329    }
4330}
4331
4332// =============================================================================
4333// Corpus: Multi-Document Operations
4334// =============================================================================
4335
4336/// A corpus of grounded documents for cross-document operations.
4337///
4338/// Enables inter-document coreference resolution and entity linking
4339/// across multiple documents.
4340#[derive(Debug, Clone)]
4341pub struct Corpus {
4342    documents: std::collections::HashMap<String, GroundedDocument>,
4343    identities: std::collections::HashMap<IdentityId, Identity>,
4344    next_identity_id: IdentityId,
4345}
4346
4347impl Corpus {
4348    /// Create a new empty corpus.
4349    #[must_use]
4350    pub fn new() -> Self {
4351        Self {
4352            documents: std::collections::HashMap::new(),
4353            identities: std::collections::HashMap::new(),
4354            next_identity_id: IdentityId::ZERO,
4355        }
4356    }
4357
4358    /// Get all identities in the corpus.
4359    #[must_use]
4360    pub fn identities(&self) -> &std::collections::HashMap<IdentityId, Identity> {
4361        &self.identities
4362    }
4363
4364    /// Get an identity by ID.
4365    #[must_use]
4366    pub fn get_identity(&self, id: IdentityId) -> Option<&Identity> {
4367        self.identities.get(&id)
4368    }
4369
4370    /// Add an identity to the corpus and return its ID.
4371    ///
4372    /// This method assigns the next available identity ID and inserts the identity.
4373    /// Used by coalescing operations to create cross-document identities.
4374    pub fn add_identity(&mut self, mut identity: Identity) -> IdentityId {
4375        let id = self.next_identity_id;
4376        identity.id = id;
4377        self.identities.insert(id, identity);
4378        self.next_identity_id += 1;
4379        id
4380    }
4381
4382    /// Get the next identity ID that would be assigned.
4383    ///
4384    /// This is used by coalescing operations to reserve identity IDs.
4385    #[must_use]
4386    pub fn next_identity_id(&self) -> IdentityId {
4387        self.next_identity_id
4388    }
4389
4390    /// Get all documents in the corpus.
4391    ///
4392    /// Returns an iterator over all documents.
4393    pub fn documents(&self) -> impl Iterator<Item = &GroundedDocument> {
4394        self.documents.values()
4395    }
4396
4397    /// Get a document by ID.
4398    ///
4399    /// Returns `None` if the document doesn't exist.
4400    #[must_use]
4401    pub fn get_document(&self, doc_id: &str) -> Option<&GroundedDocument> {
4402        self.documents.get(doc_id)
4403    }
4404
4405    /// Get a mutable reference to a document by ID.
4406    ///
4407    /// Returns `None` if the document doesn't exist.
4408    pub fn get_document_mut(&mut self, doc_id: &str) -> Option<&mut GroundedDocument> {
4409        self.documents.get_mut(doc_id)
4410    }
4411
4412    /// Add a document to the corpus.
4413    ///
4414    /// If a document with the same ID already exists, it will be replaced.
4415    /// Returns the document ID.
4416    pub fn add_document(&mut self, document: GroundedDocument) -> String {
4417        let doc_id = document.id.clone();
4418        self.documents.insert(doc_id.clone(), document);
4419        doc_id
4420    }
4421
4422    /// Link a track to a knowledge base entity.
4423    ///
4424    /// This is the entity linking (NED) operation. It creates or updates
4425    /// an identity with KB information.
4426    ///
4427    /// # Parameters
4428    ///
4429    /// * `track_ref` - Reference to the track to link
4430    /// * `kb_name` - Knowledge base name (e.g., "wikidata")
4431    /// * `kb_id` - Knowledge base entity ID (e.g., "Q7186")
4432    /// * `canonical_name` - Canonical name from KB
4433    ///
4434    /// # Returns
4435    ///
4436    /// The identity ID (new or existing), or an error if the track reference is invalid.
4437    ///
4438    /// # Errors
4439    ///
4440    /// Returns `Error::TrackRef` if:
4441    /// - The document ID doesn't exist in the corpus
4442    /// - The track ID doesn't exist in the document
4443    pub fn link_track_to_kb(
4444        &mut self,
4445        track_ref: &TrackRef,
4446        kb_name: impl Into<String>,
4447        kb_id: impl Into<String>,
4448        canonical_name: impl Into<String>,
4449    ) -> super::Result<IdentityId> {
4450        use super::error::Error;
4451
4452        let doc = self.documents.get_mut(&track_ref.doc_id).ok_or_else(|| {
4453            Error::track_ref(format!(
4454                "Document '{}' not found in corpus",
4455                track_ref.doc_id
4456            ))
4457        })?;
4458        let track = doc.get_track(track_ref.track_id).ok_or_else(|| {
4459            Error::track_ref(format!(
4460                "Track {} not found in document '{}'",
4461                track_ref.track_id, track_ref.doc_id
4462            ))
4463        })?;
4464
4465        let kb_name_str = kb_name.into();
4466        let kb_id_str = kb_id.into();
4467        let canonical_name_str = canonical_name.into();
4468
4469        // Check if track already has an identity
4470        let identity_id = if let Some(existing_id) = track.identity_id {
4471            // Update existing identity with KB info if it exists in corpus
4472            if let Some(identity) = self.identities.get_mut(&existing_id) {
4473                identity.kb_id = Some(kb_id_str.clone());
4474                identity.kb_name = Some(kb_name_str.clone());
4475                identity.canonical_name = canonical_name_str.clone();
4476
4477                // Update source
4478                identity.source = Some(match identity.source.take() {
4479                    Some(IdentitySource::CrossDocCoref { track_refs }) => IdentitySource::Hybrid {
4480                        track_refs,
4481                        kb_name: kb_name_str.clone(),
4482                        kb_id: kb_id_str.clone(),
4483                    },
4484                    _ => IdentitySource::KnowledgeBase {
4485                        kb_name: kb_name_str.clone(),
4486                        kb_id: kb_id_str.clone(),
4487                    },
4488                });
4489
4490                existing_id
4491            } else {
4492                // Identity ID exists in document but not in corpus - this is inconsistent.
4493                // This can happen if:
4494                // 1. Document was added to corpus with pre-existing identities
4495                // 2. Identity was removed from corpus but document still references it
4496                //
4497                // Fix: Create new identity and update ALL references in the document
4498                // to ensure consistency between document and corpus state.
4499                let new_id = self.next_identity_id;
4500                self.next_identity_id += 1;
4501
4502                let identity = Identity {
4503                    id: new_id,
4504                    canonical_name: canonical_name_str,
4505                    entity_type: track.entity_type.clone(),
4506                    kb_id: Some(kb_id_str.clone()),
4507                    kb_name: Some(kb_name_str.clone()),
4508                    description: None,
4509                    embedding: track.embedding.clone(),
4510                    aliases: Vec::new(),
4511                    confidence: track.cluster_confidence,
4512                    source: Some(IdentitySource::KnowledgeBase {
4513                        kb_name: kb_name_str,
4514                        kb_id: kb_id_str,
4515                    }),
4516                };
4517
4518                self.identities.insert(new_id, identity);
4519                // Update the track's identity reference to point to the new identity
4520                // This ensures document and corpus are consistent
4521                doc.link_track_to_identity(track_ref.track_id, new_id);
4522                new_id
4523            }
4524        } else {
4525            // Create new identity
4526            let new_id = self.next_identity_id;
4527            self.next_identity_id += 1;
4528
4529            let identity = Identity {
4530                id: new_id,
4531                canonical_name: canonical_name_str,
4532                entity_type: track.entity_type.clone(),
4533                kb_id: Some(kb_id_str.clone()),
4534                kb_name: Some(kb_name_str.clone()),
4535                description: None,
4536                embedding: track.embedding.clone(),
4537                aliases: Vec::new(),
4538                confidence: track.cluster_confidence,
4539                source: Some(IdentitySource::KnowledgeBase {
4540                    kb_name: kb_name_str,
4541                    kb_id: kb_id_str,
4542                }),
4543            };
4544
4545            self.identities.insert(new_id, identity);
4546            doc.link_track_to_identity(track_ref.track_id, new_id);
4547            new_id
4548        };
4549
4550        Ok(identity_id)
4551    }
4552}
4553
4554impl Default for Corpus {
4555    fn default() -> Self {
4556        Self::new()
4557    }
4558}
4559
4560#[cfg(test)]
4561mod tests {
4562    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in test code
4563    use super::*;
4564    use crate::EntityCategory;
4565
4566    #[test]
4567    fn test_render_eval_html_has_interactive_hooks_and_is_unicode_safe() {
4568        // CJK example (multi-byte, no spaces)
4569        let text = "習近平在北京會見了普京。";
4570
4571        let gold: Vec<Signal<Location>> = vec![
4572            Signal::new(SignalId::new(0), Location::text(0, 3), "習近平", "PER", 1.0),
4573            Signal::new(SignalId::new(1), Location::text(4, 6), "北京", "LOC", 1.0),
4574        ];
4575
4576        // Intentionally introduce a type mismatch on 北京 to ensure a non-correct row exists.
4577        let predicted: Vec<Signal<Location>> = vec![
4578            Signal::new(SignalId::new(0), Location::text(0, 3), "習近平", "PER", 0.9),
4579            Signal::new(SignalId::new(1), Location::text(4, 6), "北京", "PER", 0.7),
4580        ];
4581
4582        let cmp = EvalComparison::compare(text, gold, predicted);
4583        let html = render_eval_html_with_title(&cmp, "test");
4584
4585        // Selection readout (useful for humans + enables browser-based verification)
4586        assert!(html.contains("id=\"selection\""));
4587
4588        // Span IDs must be stable and distinct between gold/pred (segmented renderer uses data-eids)
4589        assert!(html.contains("data-eids=\"G0\""));
4590        assert!(html.contains("data-eids=\"P0\""));
4591
4592        // Match rows must carry cross-links and be clickable
4593        assert!(html.contains("class=\"match-link\""));
4594        assert!(html.contains("href=\"#M0\""));
4595        assert!(html.contains("data-gid=\"G0\""));
4596        assert!(html.contains("data-pid=\"P0\""));
4597
4598        // Ensure we didn't break Unicode rendering
4599        assert!(html.contains("北京"));
4600    }
4601
4602    fn find_char_span(text: &str, needle: &str) -> Option<(usize, usize)> {
4603        let hay: Vec<char> = text.chars().collect();
4604        let pat: Vec<char> = needle.chars().collect();
4605        if pat.is_empty() || hay.len() < pat.len() {
4606            return None;
4607        }
4608        for i in 0..=(hay.len() - pat.len()) {
4609            if hay[i..(i + pat.len())] == pat[..] {
4610                return Some((i, i + pat.len()));
4611            }
4612        }
4613        None
4614    }
4615
4616    #[test]
4617    fn test_annotate_text_html_supports_overlaps_discontinuous_and_unicode() {
4618        // Intentionally include multiple scripts and an overlap + discontinuous mention.
4619        let text = "Marie Curie met Cher in Paris. 習近平在北京會見了普京。 \
4620التقى محمد بن سلمان في الرياض. Путин встретился с Си Цзиньпином в Москве. \
4621प्रधान मंत्री शर्मा दिल्ली में मिले। severe pain ... in abdomen.";
4622
4623        // Overlap: "Marie Curie" contains "Curie"
4624        let (m0s, m0e) = find_char_span(text, "Marie Curie").unwrap();
4625        let (m1s, m1e) = find_char_span(text, "Curie").unwrap();
4626
4627        // Discontinuous: "pain" + "abdomen"
4628        let pain = find_char_span(text, "pain").unwrap();
4629        let abdomen = find_char_span(text, "abdomen").unwrap();
4630
4631        let signals: Vec<Signal<Location>> = vec![
4632            Signal::new(
4633                SignalId::new(0),
4634                Location::text(m0s, m0e),
4635                "Marie Curie",
4636                "PER",
4637                0.9,
4638            ),
4639            Signal::new(
4640                SignalId::new(1),
4641                Location::text(m1s, m1e),
4642                "Curie",
4643                "PER",
4644                0.8,
4645            ),
4646            Signal::new(
4647                SignalId::new(2),
4648                Location::Discontinuous {
4649                    segments: vec![pain, abdomen],
4650                },
4651                "pain … abdomen",
4652                "SYMPTOM",
4653                0.7,
4654            ),
4655        ];
4656
4657        let html = annotate_text_html(text, &signals, &std::collections::HashMap::new());
4658
4659        // Overlap must be representable (segment(s) covered by both S0 and S1).
4660        assert!(html.contains("data-sids=\"S0 S1\"") || html.contains("data-sids=\"S1 S0\""));
4661
4662        // Discontinuous mention should be present in two places (at least one segment contains S2).
4663        assert!(html.contains("data-sids=\"S2\""));
4664
4665        // Unicode safety: the original text snippets should still appear.
4666        assert!(html.contains("北京"));
4667        assert!(html.contains("Москве"));
4668        assert!(html.contains("शर्मा"));
4669        assert!(html.contains("محمد"));
4670    }
4671
4672    #[test]
4673    fn test_location_text_iou() {
4674        let l1 = Location::text(0, 10);
4675        let l2 = Location::text(5, 15);
4676        let iou = l1.iou(&l2).unwrap();
4677        // Intersection: [5, 10) = 5 chars
4678        // Union: [0, 15) = 15 chars
4679        // IoU = 5/15 = 0.333...
4680        assert!((iou - 0.333).abs() < 0.01);
4681    }
4682
4683    #[test]
4684    fn test_signal_creation() {
4685        let signal: Signal<Location> =
4686            Signal::new(0, Location::text(0, 11), "Marie Curie", "Person", 0.95);
4687        assert_eq!(signal.surface, "Marie Curie");
4688        assert_eq!(signal.label, "Person".into());
4689        assert!((signal.confidence.value() - 0.95).abs() < 0.001);
4690        assert!(!signal.negated);
4691    }
4692
4693    #[test]
4694    fn test_signal_with_linguistic_features() {
4695        let signal: Signal<Location> =
4696            Signal::new(0, Location::text(0, 10), "not a doctor", "Occupation", 0.8)
4697                .negated()
4698                .with_quantifier(Quantifier::Existential)
4699                .with_modality(Modality::Symbolic);
4700
4701        assert!(signal.negated);
4702        assert_eq!(signal.quantifier, Some(Quantifier::Existential));
4703        assert_eq!(signal.modality, Modality::Symbolic);
4704    }
4705
4706    #[test]
4707    fn test_track_formation() {
4708        let mut track = Track::new(0, "Marie Curie");
4709        track.add_signal(0, 0);
4710        track.add_signal(1, 1);
4711        track.add_signal(2, 2);
4712
4713        assert_eq!(track.len(), 3);
4714        assert!(!track.is_singleton());
4715        assert!(!track.is_empty());
4716    }
4717
4718    #[test]
4719    fn test_identity_creation() {
4720        let identity = Identity::from_kb(0, "Marie Curie", "wikidata", "Q7186")
4721            .with_type("Person")
4722            .with_embedding(vec![0.1, 0.2, 0.3]);
4723
4724        assert_eq!(identity.canonical_name, "Marie Curie");
4725        assert_eq!(identity.kb_id, Some("Q7186".to_string()));
4726        assert_eq!(identity.kb_name, Some("wikidata".to_string()));
4727        assert!(identity.embedding.is_some());
4728    }
4729
4730    #[test]
4731    fn test_grounded_document_hierarchy() {
4732        let mut doc = GroundedDocument::new(
4733            "doc1",
4734            "Marie Curie won the Nobel Prize. She was a physicist.",
4735        );
4736
4737        // Add signals (Level 1)
4738        let s1 = doc.add_signal(Signal::new(
4739            0,
4740            Location::text(0, 12),
4741            "Marie Curie",
4742            "Person",
4743            0.95,
4744        ));
4745        let s2 = doc.add_signal(Signal::new(
4746            1,
4747            Location::text(38, 41),
4748            "She",
4749            "Person",
4750            0.88,
4751        ));
4752        let s3 = doc.add_signal(Signal::new(
4753            2,
4754            Location::text(17, 29),
4755            "Nobel Prize",
4756            "Award",
4757            0.92,
4758        ));
4759
4760        // Form tracks (Level 2)
4761        let mut track1 = Track::new(0, "Marie Curie");
4762        track1.add_signal(s1, 0);
4763        track1.add_signal(s2, 1);
4764        let track1_id = doc.add_track(track1);
4765
4766        let mut track2 = Track::new(1, "Nobel Prize");
4767        track2.add_signal(s3, 0);
4768        doc.add_track(track2);
4769
4770        // Add identity (Level 3)
4771        let identity = Identity::from_kb(0, "Marie Curie", "wikidata", "Q7186");
4772        let identity_id = doc.add_identity(identity);
4773        doc.link_track_to_identity(track1_id, identity_id);
4774
4775        // Verify hierarchy traversal
4776        assert_eq!(doc.signals().len(), 3);
4777        assert_eq!(doc.tracks().count(), 2);
4778        assert_eq!(doc.identities().count(), 1);
4779
4780        // Signal → Track
4781        let track = doc.track_for_signal(s1).unwrap();
4782        assert_eq!(track.canonical_surface, "Marie Curie");
4783        assert_eq!(track.len(), 2);
4784
4785        // Track → Identity
4786        let identity = doc.identity_for_track(track1_id).unwrap();
4787        assert_eq!(identity.kb_id, Some("Q7186".to_string()));
4788
4789        // Signal → Identity (transitive)
4790        let identity = doc.identity_for_signal(s1).unwrap();
4791        assert_eq!(identity.canonical_name, "Marie Curie");
4792    }
4793
4794    #[test]
4795    fn test_modality_variants() {
4796        assert_eq!(Modality::default(), Modality::Symbolic);
4797        assert_eq!(Location::text(0, 10).modality(), Modality::Symbolic);
4798    }
4799
4800    #[test]
4801    fn test_location_from_span() {
4802        let span = Span::Text { start: 0, end: 10 };
4803        let location = Location::from(&span);
4804        assert_eq!(location.text_offsets(), Some((0, 10)));
4805    }
4806
4807    #[test]
4808    fn test_entity_roundtrip() {
4809        use super::EntityType;
4810
4811        let entities = vec![
4812            Entity::new("Marie Curie", EntityType::Person, 0, 12, 0.95),
4813            Entity::new(
4814                "Nobel Prize",
4815                EntityType::custom("Award", EntityCategory::Creative),
4816                17,
4817                29,
4818                0.92,
4819            ),
4820        ];
4821
4822        let doc =
4823            GroundedDocument::from_entities("doc1", "Marie Curie won the Nobel Prize.", &entities);
4824        let converted = doc.to_entities();
4825
4826        assert_eq!(converted.len(), 2);
4827        assert_eq!(converted[0].text, "Marie Curie");
4828        assert_eq!(converted[1].text, "Nobel Prize");
4829    }
4830
4831    #[test]
4832    fn test_signal_confidence_threshold() {
4833        let signal: Signal<Location> = Signal::new(0, Location::text(0, 10), "test", "Type", 0.75);
4834        assert!(signal.is_confident(Confidence::new(0.5)));
4835        assert!(signal.is_confident(Confidence::new(0.75)));
4836        assert!(!signal.is_confident(Confidence::new(0.8)));
4837    }
4838
4839    #[test]
4840    fn test_document_filtering() {
4841        let mut doc = GroundedDocument::new("doc1", "Test text");
4842
4843        // Add signals with different confidences and labels
4844        doc.add_signal(Signal::new(0, Location::text(0, 4), "high", "Person", 0.95));
4845        doc.add_signal(Signal::new(1, Location::text(5, 8), "low", "Person", 0.3));
4846        doc.add_signal(Signal::new(
4847            2,
4848            Location::text(9, 12),
4849            "org",
4850            "Organization",
4851            0.8,
4852        ));
4853
4854        // Filter by confidence
4855        let confident = doc.confident_signals(Confidence::new(0.5));
4856        assert_eq!(confident.len(), 2);
4857
4858        // Filter by label
4859        let persons = doc.signals_with_label("Person");
4860        assert_eq!(persons.len(), 2);
4861
4862        let orgs = doc.signals_with_label("Organization");
4863        assert_eq!(orgs.len(), 1);
4864    }
4865
4866    #[test]
4867    fn test_untracked_signals() {
4868        let mut doc = GroundedDocument::new("doc1", "Test");
4869
4870        let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "a", "T", 0.9));
4871        let s2 = doc.add_signal(Signal::new(1, Location::text(5, 8), "b", "T", 0.9));
4872        let _s3 = doc.add_signal(Signal::new(2, Location::text(9, 12), "c", "T", 0.9));
4873
4874        // Only track s1 and s2
4875        let mut track = Track::new(0, "a");
4876        track.add_signal(s1, 0);
4877        track.add_signal(s2, 1);
4878        doc.add_track(track);
4879
4880        // s3 should be untracked
4881        assert_eq!(doc.untracked_signal_count(), 1);
4882        let untracked = doc.untracked_signals();
4883        assert_eq!(untracked.len(), 1);
4884        assert_eq!(untracked[0].surface, "c");
4885    }
4886
4887    #[test]
4888    fn test_linked_unlinked_tracks() {
4889        let mut doc = GroundedDocument::new("doc1", "Test");
4890
4891        let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "a", "T", 0.9));
4892        let s2 = doc.add_signal(Signal::new(1, Location::text(5, 8), "b", "T", 0.9));
4893
4894        let mut track1 = Track::new(0, "a");
4895        track1.add_signal(s1, 0);
4896        let track1_id = doc.add_track(track1);
4897
4898        let mut track2 = Track::new(1, "b");
4899        track2.add_signal(s2, 0);
4900        doc.add_track(track2);
4901
4902        // Link only track1 to an identity
4903        let identity = Identity::new(0, "Entity A");
4904        let identity_id = doc.add_identity(identity);
4905        doc.link_track_to_identity(track1_id, identity_id);
4906
4907        assert_eq!(doc.linked_tracks().count(), 1);
4908        assert_eq!(doc.unlinked_tracks().count(), 1);
4909    }
4910
4911    #[test]
4912    fn test_iou_edge_cases() {
4913        // No overlap
4914        let l1 = Location::text(0, 5);
4915        let l2 = Location::text(10, 15);
4916        assert_eq!(l1.iou(&l2), Some(0.0));
4917
4918        // Complete overlap (identical)
4919        let l3 = Location::text(0, 10);
4920        let l4 = Location::text(0, 10);
4921        assert_eq!(l3.iou(&l4), Some(1.0));
4922
4923        // One contains the other
4924        let l5 = Location::text(0, 20);
4925        let l6 = Location::text(5, 15);
4926        let iou = l5.iou(&l6).unwrap();
4927        // Intersection: 10, Union: 20
4928        assert!((iou - 0.5).abs() < 0.001);
4929    }
4930
4931    // Note: Tests that depend on anno::eval::coref types have been moved to anno crate
4932    // (test_coref_chain_conversion, test_from_coref_document, test_coref_roundtrip)
4933
4934    #[test]
4935    fn test_document_stats() {
4936        let mut doc = GroundedDocument::new("doc1", "Test document with entities.");
4937
4938        // Add signals with varying properties
4939        let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9));
4940        let mut negated = Signal::new(0, Location::text(5, 13), "document", "Type", 0.8);
4941        negated.negated = true;
4942        let s2 = doc.add_signal(negated);
4943        let _s3 = doc.add_signal(Signal::new(
4944            0,
4945            Location::text(19, 27),
4946            "entities",
4947            "Type",
4948            0.7,
4949        ));
4950
4951        // Create one track with 2 signals
4952        let mut track = Track::new(0, "Test");
4953        track.add_signal(s1, 0);
4954        track.add_signal(s2, 1);
4955        doc.add_track(track);
4956
4957        // Add identity for the track
4958        let identity = Identity::new(0, "Test Entity");
4959        let identity_id = doc.add_identity(identity);
4960        doc.link_track_to_identity(0, identity_id);
4961
4962        let stats = doc.stats();
4963
4964        assert_eq!(stats.signal_count, 3);
4965        assert_eq!(stats.track_count, 1);
4966        assert_eq!(stats.identity_count, 1);
4967        assert_eq!(stats.linked_track_count, 1);
4968        assert_eq!(stats.untracked_count, 1); // s3 is untracked
4969        assert_eq!(stats.negated_count, 1);
4970        assert!((stats.avg_confidence - 0.8).abs() < 0.01); // (0.9 + 0.8 + 0.7) / 3
4971        assert!((stats.avg_track_size - 2.0).abs() < 0.01);
4972    }
4973
4974    #[test]
4975    fn test_batch_operations() {
4976        let mut doc = GroundedDocument::new("doc1", "Test document.");
4977
4978        // Batch add signals
4979        let signals = vec![
4980            Signal::new(0, Location::text(0, 4), "Test", "Type", 0.9),
4981            Signal::new(0, Location::text(5, 13), "document", "Type", 0.8),
4982        ];
4983        let ids = doc.add_signals(signals);
4984
4985        assert_eq!(ids.len(), 2);
4986        assert_eq!(doc.signals().len(), 2);
4987
4988        // Create track from signal IDs
4989        let track_id = doc.create_track_from_signals("Test", &ids);
4990        assert!(track_id.is_some());
4991
4992        let track = doc.get_track(track_id.unwrap()).unwrap();
4993        assert_eq!(track.len(), 2);
4994        assert_eq!(track.canonical_surface, "Test");
4995    }
4996
4997    #[test]
4998    fn test_merge_tracks() {
4999        let mut doc = GroundedDocument::new("doc1", "John Smith works at Acme. He is great.");
5000
5001        // Add signals
5002        let s1 = doc.add_signal(Signal::new(
5003            0,
5004            Location::text(0, 10),
5005            "John Smith",
5006            "Person",
5007            0.9,
5008        ));
5009        let s2 = doc.add_signal(Signal::new(0, Location::text(26, 28), "He", "Person", 0.8));
5010
5011        // Create two separate tracks
5012        let mut track1 = Track::new(0, "John Smith");
5013        track1.add_signal(s1, 0);
5014        let track1_id = doc.add_track(track1);
5015
5016        let mut track2 = Track::new(0, "He");
5017        track2.add_signal(s2, 0);
5018        let track2_id = doc.add_track(track2);
5019
5020        assert_eq!(doc.tracks().count(), 2);
5021
5022        // Merge tracks
5023        let merged_id = doc.merge_tracks(&[track1_id, track2_id]);
5024        assert!(merged_id.is_some());
5025
5026        // Should now have only 1 track with 2 signals
5027        assert_eq!(doc.tracks().count(), 1);
5028        let merged = doc.get_track(merged_id.unwrap()).unwrap();
5029        assert_eq!(merged.len(), 2);
5030        assert_eq!(merged.canonical_surface, "John Smith"); // From first track
5031    }
5032
5033    #[test]
5034    fn test_find_overlapping_pairs() {
5035        let mut doc = GroundedDocument::new("doc1", "New York City is great.");
5036
5037        // Add overlapping signals (nested entity)
5038        doc.add_signal(Signal::new(
5039            0,
5040            Location::text(0, 13),
5041            "New York City",
5042            "Location",
5043            0.9,
5044        ));
5045        doc.add_signal(Signal::new(
5046            0,
5047            Location::text(0, 8),
5048            "New York",
5049            "Location",
5050            0.85,
5051        ));
5052        doc.add_signal(Signal::new(0, Location::text(17, 22), "great", "Adj", 0.7)); // Not overlapping
5053
5054        let pairs = doc.find_overlapping_signal_pairs();
5055
5056        // Should find one overlapping pair (New York City & New York)
5057        assert_eq!(pairs.len(), 1);
5058    }
5059
5060    #[test]
5061    fn test_signals_in_range() {
5062        let mut doc = GroundedDocument::new("doc1", "John went to Paris and Berlin last year.");
5063
5064        doc.add_signal(Signal::new(0, Location::text(0, 4), "John", "Person", 0.9));
5065        doc.add_signal(Signal::new(
5066            0,
5067            Location::text(13, 18),
5068            "Paris",
5069            "Location",
5070            0.9,
5071        ));
5072        doc.add_signal(Signal::new(
5073            0,
5074            Location::text(23, 29),
5075            "Berlin",
5076            "Location",
5077            0.9,
5078        ));
5079        doc.add_signal(Signal::new(
5080            0,
5081            Location::text(30, 39),
5082            "last year",
5083            "Date",
5084            0.8,
5085        ));
5086
5087        // Find signals in the "Paris and Berlin" section
5088        let in_range = doc.signals_in_range(10, 30);
5089        assert_eq!(in_range.len(), 2); // Paris and Berlin
5090
5091        let surfaces: Vec<_> = in_range.iter().map(|s| &s.surface).collect();
5092        assert!(surfaces.contains(&&"Paris".to_string()));
5093        assert!(surfaces.contains(&&"Berlin".to_string()));
5094    }
5095
5096    #[test]
5097    fn test_quantifier_variants() {
5098        // Ensure all quantifier variants work
5099        let quantifiers = [
5100            Quantifier::Universal,
5101            Quantifier::Existential,
5102            Quantifier::None,
5103            Quantifier::Definite,
5104            Quantifier::Bare,
5105            Quantifier::Approximate,
5106            Quantifier::MinBound,
5107            Quantifier::MaxBound,
5108        ];
5109
5110        for q in quantifiers {
5111            let signal: Signal<Location> =
5112                Signal::new(0, Location::text(0, 5), "test", "Type", 0.9).with_quantifier(q);
5113
5114            assert_eq!(signal.quantifier, Some(q));
5115        }
5116    }
5117
5118    #[test]
5119    fn test_location_modality_derivation() {
5120        assert_eq!(Location::text(0, 10).modality(), Modality::Symbolic);
5121        assert_eq!(
5122            Location::Discontinuous {
5123                segments: vec![(0, 5), (10, 15)]
5124            }
5125            .modality(),
5126            Modality::Symbolic
5127        );
5128    }
5129
5130    // Note: CrossDocCluster conversion test moved to anno crate
5131    // since CrossDocCluster is defined in anno/src/eval/cdcr.rs
5132}
5133
5134// =============================================================================
5135// Property-Based Tests
5136// =============================================================================
5137//
5138// These tests verify invariants that should hold for ALL valid inputs,
5139// not just specific examples. They catch edge cases that unit tests miss.
5140
5141#[cfg(test)]
5142mod proptests {
5143    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in property tests
5144    use super::*;
5145    use proptest::prelude::*;
5146
5147    // -------------------------------------------------------------------------
5148    // Strategies for generating test data
5149    // -------------------------------------------------------------------------
5150
5151    /// Generate valid confidence values in [0, 1].
5152    fn confidence_strategy() -> impl Strategy<Value = f32> {
5153        0.0f32..=1.0
5154    }
5155
5156    /// Generate signal labels.
5157    fn label_strategy() -> impl Strategy<Value = String> {
5158        prop_oneof![
5159            Just("Person".to_string()),
5160            Just("Organization".to_string()),
5161            Just("Location".to_string()),
5162            Just("Date".to_string()),
5163            "[A-Z][a-z]{2,10}".prop_map(|s| s),
5164        ]
5165    }
5166
5167    /// Generate surface forms (entity text).
5168    fn surface_strategy() -> impl Strategy<Value = String> {
5169        "[A-Za-z ]{1,50}".prop_map(|s| s.trim().to_string())
5170    }
5171
5172    // -------------------------------------------------------------------------
5173    // IoU Properties (Intersection over Union)
5174    // -------------------------------------------------------------------------
5175
5176    proptest! {
5177        /// IoU is symmetric: iou(a, b) == iou(b, a)
5178        #[test]
5179        fn iou_symmetric(
5180            start1 in 0usize..1000,
5181            len1 in 1usize..500,
5182            start2 in 0usize..1000,
5183            len2 in 1usize..500,
5184        ) {
5185            let a = Location::text(start1, start1 + len1);
5186            let b = Location::text(start2, start2 + len2);
5187
5188            let iou_ab = a.iou(&b);
5189            let iou_ba = b.iou(&a);
5190
5191            prop_assert_eq!(iou_ab, iou_ba, "IoU must be symmetric");
5192        }
5193
5194        /// IoU is bounded: 0 <= iou <= 1
5195        #[test]
5196        fn iou_bounded(
5197            start1 in 0usize..1000,
5198            len1 in 1usize..500,
5199            start2 in 0usize..1000,
5200            len2 in 1usize..500,
5201        ) {
5202            let a = Location::text(start1, start1 + len1);
5203            let b = Location::text(start2, start2 + len2);
5204
5205            if let Some(iou) = a.iou(&b) {
5206                prop_assert!(iou >= 0.0, "IoU must be non-negative: got {}", iou);
5207                prop_assert!(iou <= 1.0, "IoU must be at most 1: got {}", iou);
5208            }
5209        }
5210
5211        /// Self-IoU is 1: iou(a, a) == 1
5212        #[test]
5213        fn iou_self_identity(start in 0usize..1000, len in 1usize..500) {
5214            let loc = Location::text(start, start + len);
5215            let iou = loc.iou(&loc).unwrap();
5216            prop_assert!(
5217                (iou - 1.0).abs() < 1e-6,
5218                "Self-IoU must be 1.0, got {}",
5219                iou
5220            );
5221        }
5222
5223        /// Non-overlapping locations have IoU = 0
5224        #[test]
5225        fn iou_non_overlapping_zero(
5226            start1 in 0usize..500,
5227            len1 in 1usize..100,
5228        ) {
5229            let end1 = start1 + len1;
5230            let start2 = end1 + 100; // Guaranteed gap
5231            let len2 = 50;
5232
5233            let a = Location::text(start1, end1);
5234            let b = Location::text(start2, start2 + len2);
5235
5236            let iou = a.iou(&b).expect("bbox iou should be defined");
5237            prop_assert!(
5238                iou.abs() < 1e-6,
5239                "Non-overlapping IoU must be 0, got {}",
5240                iou
5241            );
5242        }
5243
5244
5245    }
5246
5247    // -------------------------------------------------------------------------
5248    // Signal Properties
5249    // -------------------------------------------------------------------------
5250
5251    proptest! {
5252        /// Confidence is always clamped to [0, 1]
5253        #[test]
5254        fn signal_confidence_clamped(raw_conf in -10.0f32..10.0) {
5255            let signal: Signal<Location> = Signal::new(
5256                0,
5257                Location::text(0, 10),
5258                "test",
5259                "Type",
5260                raw_conf,
5261            );
5262
5263            prop_assert!(signal.confidence.value() >= 0.0, "Confidence below 0: {}", signal.confidence);
5264            prop_assert!(signal.confidence.value() <= 1.0, "Confidence above 1: {}", signal.confidence);
5265        }
5266
5267        /// Signal with valid inputs preserves surface and label
5268        #[test]
5269        fn signal_preserves_data(
5270            surface in surface_strategy(),
5271            label in label_strategy(),
5272            conf in confidence_strategy(),
5273            start in 0usize..1000,
5274            len in 1usize..100,
5275        ) {
5276            let signal: Signal<Location> = Signal::new(
5277                0,
5278                Location::text(start, start + len),
5279                &surface,
5280                label.as_str(),
5281                conf,
5282            );
5283
5284            prop_assert_eq!(&signal.surface, &surface);
5285            let want = crate::TypeLabel::from(label.as_str());
5286            prop_assert_eq!(signal.label, want);
5287        }
5288
5289        /// Negation is idempotent: negated().negated() still has negated=true
5290        /// (Note: our API doesn't have an "un-negate", so calling negated() twice
5291        /// just keeps it negated - this tests that it doesn't toggle)
5292        #[test]
5293        fn signal_negation_stable(conf in confidence_strategy()) {
5294            let signal: Signal<Location> = Signal::new(
5295                0,
5296                Location::text(0, 10),
5297                "test",
5298                "Type",
5299                conf,
5300            )
5301            .negated();
5302
5303            prop_assert!(signal.negated, "Signal should be negated after .negated()");
5304        }
5305
5306        /// Text locations have Symbolic modality
5307        #[test]
5308        fn text_location_is_symbolic(
5309            start in 0usize..1000,
5310            len in 1usize..100,
5311        ) {
5312            let loc = Location::text(start, start + len);
5313            prop_assert_eq!(
5314                loc.modality(),
5315                Modality::Symbolic,
5316                "Text locations must be Symbolic"
5317            );
5318        }
5319    }
5320
5321    // -------------------------------------------------------------------------
5322    // Track Properties
5323    // -------------------------------------------------------------------------
5324
5325    proptest! {
5326        /// Track length increases with each added signal
5327        #[test]
5328        fn track_length_monotonic(signal_count in 1usize..20) {
5329            let mut track = Track::new(0, "test");
5330
5331            for i in 0..signal_count {
5332                track.add_signal(i, i as u32);
5333                prop_assert_eq!(
5334                    track.len(),
5335                    i + 1,
5336                    "Track length should be {} after adding {} signals",
5337                    i + 1,
5338                    i + 1
5339                );
5340            }
5341        }
5342
5343        /// Track is never empty after adding a signal
5344        #[test]
5345        fn track_not_empty_after_add(canonical in surface_strategy()) {
5346            let mut track = Track::new(0, &canonical);
5347            prop_assert!(track.is_empty(), "New track should be empty");
5348
5349            track.add_signal(0, 0);
5350            prop_assert!(!track.is_empty(), "Track should not be empty after add");
5351        }
5352
5353        /// Track positions are stored correctly
5354        #[test]
5355        fn track_positions_stored(signal_count in 1usize..10) {
5356            let mut track = Track::new(0, "test");
5357
5358            for i in 0..signal_count {
5359                track.add_signal(i, i as u32);
5360            }
5361
5362            for (idx, signal_ref) in track.signals.iter().enumerate() {
5363                prop_assert_eq!(
5364                    signal_ref.position as usize,
5365                    idx,
5366                    "Signal position mismatch at index {}",
5367                    idx
5368                );
5369            }
5370        }
5371    }
5372
5373    // -------------------------------------------------------------------------
5374    // GroundedDocument Properties
5375    // -------------------------------------------------------------------------
5376
5377    proptest! {
5378        /// Signal IDs are unique and monotonically increasing
5379        #[test]
5380        fn document_signal_ids_monotonic(signal_count in 1usize..20) {
5381            let mut doc = GroundedDocument::new("test", "test text");
5382
5383            let mut prev_id: Option<SignalId> = None;
5384            for i in 0..signal_count {
5385                let id = doc.add_signal(Signal::new(
5386                    999, // Should be overwritten
5387                    Location::text(i * 10, i * 10 + 5),
5388                    format!("entity_{}", i),
5389                    "Type",
5390                    0.9,
5391                ));
5392
5393                if let Some(prev) = prev_id {
5394                    prop_assert!(id > prev, "Signal IDs should be monotonically increasing");
5395                }
5396                prev_id = Some(id);
5397            }
5398        }
5399
5400        /// Track membership is consistent: if signal is in track, track_for_signal returns that track
5401        #[test]
5402        fn document_track_membership_consistent(signal_count in 1usize..5) {
5403            let mut doc = GroundedDocument::new("test", "test text");
5404
5405            // Add signals
5406            let mut signal_ids = Vec::new();
5407            for i in 0..signal_count {
5408                let id = doc.add_signal(Signal::new(
5409                    0,
5410                    Location::text(i * 10, i * 10 + 5),
5411                    format!("entity_{}", i),
5412                    "Type",
5413                    0.9,
5414                ));
5415                signal_ids.push(id);
5416            }
5417
5418            // Create track with all signals
5419            let mut track = Track::new(0, "canonical");
5420            for (pos, &id) in signal_ids.iter().enumerate() {
5421                track.add_signal(id, pos as u32);
5422            }
5423            let track_id = doc.add_track(track);
5424
5425            // Verify membership
5426            for &signal_id in &signal_ids {
5427                let found_track = doc.track_for_signal(signal_id);
5428                prop_assert!(found_track.is_some(), "Signal should be in a track");
5429                prop_assert_eq!(
5430                    found_track.unwrap().id,
5431                    track_id,
5432                    "Signal should be in the correct track"
5433                );
5434            }
5435        }
5436
5437        /// Identity linking is transitive: signal → track → identity
5438        #[test]
5439        fn document_identity_transitivity(signal_count in 1usize..3) {
5440            let mut doc = GroundedDocument::new("test", "test text");
5441
5442            // Add signals
5443            let mut signal_ids = Vec::new();
5444            for i in 0..signal_count {
5445                let id = doc.add_signal(Signal::new(
5446                    0,
5447                    Location::text(i * 10, i * 10 + 5),
5448                    format!("entity_{}", i),
5449                    "Type",
5450                    0.9,
5451                ));
5452                signal_ids.push(id);
5453            }
5454
5455            // Create track and identity
5456            let mut track = Track::new(0, "canonical");
5457            for (pos, &id) in signal_ids.iter().enumerate() {
5458                track.add_signal(id, pos as u32);
5459            }
5460            let track_id = doc.add_track(track);
5461
5462            let identity = Identity::from_kb(0, "Entity", "wikidata", "Q123");
5463            let identity_id = doc.add_identity(identity);
5464            doc.link_track_to_identity(track_id, identity_id);
5465
5466            // Verify transitivity
5467            for &signal_id in &signal_ids {
5468                let identity = doc.identity_for_signal(signal_id);
5469                prop_assert!(identity.is_some(), "Should find identity through signal");
5470                prop_assert_eq!(
5471                    identity.unwrap().id,
5472                    identity_id,
5473                    "Should find correct identity"
5474                );
5475            }
5476        }
5477
5478        /// Untracked signals are correctly identified
5479        #[test]
5480        fn document_untracked_signals(total in 2usize..10, tracked in 0usize..10) {
5481            let tracked = tracked.min(total - 1); // Ensure at least one untracked
5482            let mut doc = GroundedDocument::new("test", "test text");
5483
5484            // Add all signals
5485            let mut signal_ids = Vec::new();
5486            for i in 0..total {
5487                let id = doc.add_signal(Signal::new(
5488                    0,
5489                    Location::text(i * 10, i * 10 + 5),
5490                    format!("entity_{}", i),
5491                    "Type",
5492                    0.9,
5493                ));
5494                signal_ids.push(id);
5495            }
5496
5497            // Track only some signals
5498            let mut track = Track::new(0, "canonical");
5499            for (pos, &id) in signal_ids.iter().take(tracked).enumerate() {
5500                track.add_signal(id, pos as u32);
5501            }
5502            if tracked > 0 {
5503                doc.add_track(track);
5504            }
5505
5506            // Verify counts
5507            prop_assert_eq!(
5508                doc.untracked_signal_count(),
5509                total - tracked,
5510                "Wrong untracked count"
5511            );
5512        }
5513    }
5514
5515    // -------------------------------------------------------------------------
5516    // Roundtrip / Conversion Properties
5517    // -------------------------------------------------------------------------
5518
5519    proptest! {
5520        /// Entity → GroundedDocument → Entities preserves core data
5521        #[test]
5522        fn entity_roundtrip_preserves_text(
5523            text in surface_strategy(),
5524            start in 0usize..1000,
5525            len in 1usize..100,
5526            conf in 0.0f64..=1.0,
5527        ) {
5528            use super::EntityType;
5529
5530            let end = start + len;
5531            let entity = super::Entity::new(&text, EntityType::Person, start, end, conf);
5532
5533            let doc = GroundedDocument::from_entities("test", "x".repeat(end + 10), &[entity]);
5534            let converted = doc.to_entities();
5535
5536            prop_assert_eq!(converted.len(), 1, "Should have exactly one entity");
5537            prop_assert_eq!(&converted[0].text, &text, "Text should be preserved");
5538            prop_assert_eq!(converted[0].start(), start, "Start should be preserved");
5539            prop_assert_eq!(converted[0].end(), end, "End should be preserved");
5540        }
5541
5542        // Note: Property test that depends on anno::eval::coref types has been moved to anno crate
5543        // (coref_roundtrip_preserves_count)
5544    }
5545
5546    // -------------------------------------------------------------------------
5547    // Modality Invariants
5548    // -------------------------------------------------------------------------
5549
5550    // -------------------------------------------------------------------------
5551    // Location Overlap Properties
5552    // -------------------------------------------------------------------------
5553
5554    proptest! {
5555        /// Overlap is symmetric: overlaps(a, b) == overlaps(b, a)
5556        #[test]
5557        fn overlap_symmetric(
5558            start1 in 0usize..1000,
5559            len1 in 1usize..100,
5560            start2 in 0usize..1000,
5561            len2 in 1usize..100,
5562        ) {
5563            let a = Location::text(start1, start1 + len1);
5564            let b = Location::text(start2, start2 + len2);
5565
5566            prop_assert_eq!(
5567                a.overlaps(&b),
5568                b.overlaps(&a),
5569                "Overlap must be symmetric"
5570            );
5571        }
5572
5573        /// A location always overlaps with itself
5574        #[test]
5575        fn overlap_reflexive(start in 0usize..1000, len in 1usize..100) {
5576            let loc = Location::text(start, start + len);
5577            prop_assert!(loc.overlaps(&loc), "Location must overlap with itself");
5578        }
5579
5580        /// If IoU > 0, then overlaps is true
5581        #[test]
5582        fn iou_implies_overlap(
5583            start1 in 0usize..500,
5584            len1 in 1usize..100,
5585            start2 in 0usize..500,
5586            len2 in 1usize..100,
5587        ) {
5588            let a = Location::text(start1, start1 + len1);
5589            let b = Location::text(start2, start2 + len2);
5590
5591            if let Some(iou) = a.iou(&b) {
5592                if iou > 0.0 {
5593                    prop_assert!(
5594                        a.overlaps(&b),
5595                        "IoU > 0 should imply overlap"
5596                    );
5597                }
5598            }
5599        }
5600    }
5601
5602    // -------------------------------------------------------------------------
5603    // DocumentStats Properties
5604    // -------------------------------------------------------------------------
5605
5606    proptest! {
5607        /// Stats signal count matches actual count
5608        #[test]
5609        fn stats_signal_count_accurate(signal_count in 0usize..20) {
5610            let mut doc = GroundedDocument::new("test", "test");
5611            for i in 0..signal_count {
5612                doc.add_signal(Signal::new(
5613                    0,
5614                    Location::text(i * 10, i * 10 + 5),
5615                    "entity",
5616                    "Type",
5617                    0.9,
5618                ));
5619            }
5620
5621            let stats = doc.stats();
5622            prop_assert_eq!(stats.signal_count, signal_count);
5623        }
5624
5625        /// Stats track count matches actual count
5626        #[test]
5627        fn stats_track_count_accurate(track_count in 0usize..10) {
5628            let mut doc = GroundedDocument::new("test", "test");
5629            for i in 0..track_count {
5630                let id = doc.add_signal(Signal::new(
5631                    0,
5632                    Location::text(i * 10, i * 10 + 5),
5633                    "entity",
5634                    "Type",
5635                    0.9,
5636                ));
5637                let mut track = Track::new(0, format!("track_{}", i));
5638                track.add_signal(id, 0);
5639                doc.add_track(track);
5640            }
5641
5642            let stats = doc.stats();
5643            prop_assert_eq!(stats.track_count, track_count);
5644        }
5645
5646        /// Avg confidence is in [0, 1]
5647        #[test]
5648        fn stats_avg_confidence_bounded(
5649            confidences in proptest::collection::vec(0.0f32..=1.0, 1..10)
5650        ) {
5651            let mut doc = GroundedDocument::new("test", "test");
5652            for (i, conf) in confidences.iter().enumerate() {
5653                doc.add_signal(Signal::new(
5654                    0,
5655                    Location::text(i * 10, i * 10 + 5),
5656                    "entity",
5657                    "Type",
5658                    *conf,
5659                ));
5660            }
5661
5662            let stats = doc.stats();
5663            prop_assert!(stats.avg_confidence.value() >= 0.0);
5664            prop_assert!(stats.avg_confidence.value() <= 1.0);
5665        }
5666    }
5667
5668    // -------------------------------------------------------------------------
5669    // Batch Operations Properties
5670    // -------------------------------------------------------------------------
5671
5672    proptest! {
5673        /// add_signals returns correct number of IDs
5674        #[test]
5675        fn batch_add_returns_all_ids(count in 1usize..10) {
5676            let mut doc = GroundedDocument::new("test", "test");
5677            let signals: Vec<Signal<Location>> = (0..count)
5678                .map(|i| Signal::new(0, Location::text(i * 10, i * 10 + 5), "e", "T", 0.9))
5679                .collect();
5680
5681            let ids = doc.add_signals(signals);
5682            prop_assert_eq!(ids.len(), count);
5683            prop_assert_eq!(doc.signals().len(), count);
5684        }
5685
5686        /// create_track_from_signals creates valid track
5687        #[test]
5688        fn create_track_valid(signal_count in 1usize..5) {
5689            let mut doc = GroundedDocument::new("test", "test");
5690            let mut signal_ids = Vec::new();
5691            for i in 0..signal_count {
5692                let id = doc.add_signal(Signal::new(
5693                    0,
5694                    Location::text(i * 10, i * 10 + 5),
5695                    "entity",
5696                    "Type",
5697                    0.9,
5698                ));
5699                signal_ids.push(id);
5700            }
5701
5702            let track_id = doc.create_track_from_signals("canonical", &signal_ids);
5703            prop_assert!(track_id.is_some());
5704
5705            let track = doc.get_track(track_id.unwrap());
5706            prop_assert!(track.is_some());
5707            prop_assert_eq!(track.unwrap().len(), signal_count);
5708        }
5709
5710        /// Empty signal list returns None for track creation
5711        #[test]
5712        fn create_track_empty_returns_none(_dummy in 0..1) {
5713            let mut doc = GroundedDocument::new("test", "test");
5714            let track_id = doc.create_track_from_signals("canonical", &[]);
5715            prop_assert!(track_id.is_none());
5716        }
5717    }
5718
5719    // -------------------------------------------------------------------------
5720    // Filtering Properties
5721    // -------------------------------------------------------------------------
5722
5723    proptest! {
5724        /// signals_in_range returns only signals within range
5725        #[test]
5726        fn signals_in_range_within_bounds(
5727            range_start in 0usize..100,
5728            range_len in 10usize..50,
5729        ) {
5730            let range_end = range_start + range_len;
5731            let mut doc = GroundedDocument::new("test", "x".repeat(200));
5732
5733            // Add signals: some inside, some outside
5734            doc.add_signal(Signal::new(0, Location::text(range_start + 2, range_start + 5), "inside", "T", 0.9));
5735            doc.add_signal(Signal::new(0, Location::text(0, 5), "before", "T", 0.9));
5736            doc.add_signal(Signal::new(0, Location::text(190, 195), "after", "T", 0.9));
5737
5738            let in_range = doc.signals_in_range(range_start, range_end);
5739
5740            for signal in &in_range {
5741                if let Some((start, end)) = signal.location.text_offsets() {
5742                    prop_assert!(start >= range_start, "Signal start {} < range start {}", start, range_start);
5743                    prop_assert!(end <= range_end, "Signal end {} > range end {}", end, range_end);
5744                }
5745            }
5746        }
5747
5748        /// overlapping_signals is symmetric: if A overlaps B, then B's overlaps includes A's location
5749        #[test]
5750        fn overlapping_signals_symmetric(
5751            start1 in 10usize..50,
5752            len1 in 5usize..20,
5753            start2 in 10usize..50,
5754            len2 in 5usize..20,
5755        ) {
5756            let mut doc = GroundedDocument::new("test", "x".repeat(100));
5757
5758            let loc1 = Location::text(start1, start1 + len1);
5759            let loc2 = Location::text(start2, start2 + len2);
5760
5761            doc.add_signal(Signal::new(0, loc1.clone(), "A", "T", 0.9));
5762            doc.add_signal(Signal::new(0, loc2.clone(), "B", "T", 0.9));
5763
5764            let overlaps_loc1 = doc.overlapping_signals(&loc1);
5765            let overlaps_loc2 = doc.overlapping_signals(&loc2);
5766
5767            // If loc1 overlaps loc2, both should find each other
5768            if loc1.overlaps(&loc2) {
5769                prop_assert!(overlaps_loc1.len() >= 2, "Should find both when overlapping");
5770                prop_assert!(overlaps_loc2.len() >= 2, "Should find both when overlapping");
5771            }
5772        }
5773    }
5774
5775    // -------------------------------------------------------------------------
5776    // Invariant: Modality count consistency
5777    // -------------------------------------------------------------------------
5778
5779    proptest! {
5780        /// Sum of modality counts equals total signal count
5781        #[test]
5782        fn modality_counts_sum_to_total(
5783            symbolic_count in 0usize..5,
5784            iconic_count in 0usize..5,
5785        ) {
5786            let mut doc = GroundedDocument::new("test", "test");
5787
5788            // Add symbolic signals
5789            for i in 0..symbolic_count {
5790                let mut signal = Signal::new(
5791                    0,
5792                    Location::text(i * 10, i * 10 + 5),
5793                    "entity",
5794                    "Type",
5795                    0.9,
5796                );
5797                signal.modality = Modality::Symbolic;
5798                doc.add_signal(signal);
5799            }
5800
5801            // Add iconic-modality signals (modality overridden on text locations)
5802            for i in 0..iconic_count {
5803                let mut signal = Signal::new(
5804                    0,
5805                    Location::text(1000 + i * 10, 1000 + i * 10 + 5),
5806                    "entity",
5807                    "Type",
5808                    0.9,
5809                );
5810                signal.modality = Modality::Iconic;
5811                doc.add_signal(signal);
5812            }
5813
5814            let stats = doc.stats();
5815            prop_assert_eq!(
5816                stats.symbolic_count + stats.iconic_count + stats.hybrid_count,
5817                stats.signal_count,
5818                "Modality counts should sum to total"
5819            );
5820        }
5821    }
5822
5823    // -------------------------------------------------------------------------
5824    // Invariant: Signal-Text Offset Consistency
5825    // -------------------------------------------------------------------------
5826
5827    proptest! {
5828        /// Signals created via from_text are always valid
5829        #[test]
5830        fn from_text_always_valid(
5831            text in "[a-zA-Z ]{20,100}",
5832            surface_start in 0usize..15,
5833            surface_len in 1usize..8,
5834        ) {
5835            let text_char_len = text.chars().count();
5836            let surface_end = (surface_start + surface_len).min(text_char_len);
5837            let surface_start = surface_start.min(surface_end.saturating_sub(1));
5838
5839            if surface_start < surface_end && surface_end <= text_char_len {
5840                let surface: String = text.chars()
5841                    .skip(surface_start)
5842                    .take(surface_end - surface_start)
5843                    .collect();
5844
5845                if !surface.is_empty() {
5846                    // from_text should find the surface and create a valid signal
5847                    if let Some(signal) = Signal::<Location>::from_text(&text, &surface, "Test", 0.9) {
5848                        // The created signal MUST be valid
5849                        prop_assert!(
5850                            signal.validate_against(&text).is_none(),
5851                            "Signal created via from_text must be valid"
5852                        );
5853                    }
5854                }
5855            }
5856        }
5857
5858        /// Validated add never allows invalid signals
5859        #[test]
5860        fn validated_add_rejects_invalid(
5861            text in "[a-z]{10,50}",
5862            wrong_surface in "[A-Z]{3,10}",
5863        ) {
5864            let mut doc = GroundedDocument::new("test", &text);
5865
5866            // Create a signal with offsets pointing to different text than surface
5867            let signal = Signal::new(
5868                0,
5869                Location::text(0, wrong_surface.chars().count().min(text.chars().count())),
5870                wrong_surface.clone(),
5871                "Test",
5872                0.9,
5873            );
5874
5875            // If text doesn't actually contain wrong_surface at offset 0,
5876            // validated add should reject it
5877            let expected: String = text.chars().take(wrong_surface.chars().count()).collect();
5878            if expected != wrong_surface {
5879                let result = doc.add_signal_validated(signal);
5880                prop_assert!(result.is_err(), "Should reject signal with mismatched surface");
5881            }
5882        }
5883
5884        /// Round-trip: add_signal_from_text creates retrievable signals
5885        #[test]
5886        fn round_trip_signal_from_text(
5887            prefix in "[a-z]{5,20}",
5888            entity in "[A-Z][a-z]{3,10}",
5889            suffix in "[a-z]{5,20}",
5890        ) {
5891            let text = format!("{} {} {}", prefix, entity, suffix);
5892            let mut doc = GroundedDocument::new("test", &text);
5893
5894            let id = doc.add_signal_from_text(&entity, "Entity", 0.9);
5895            prop_assert!(id.is_some(), "Should find entity in text");
5896
5897            let signal = doc.signals().iter().find(|s| s.id == id.unwrap());
5898            prop_assert!(signal.is_some(), "Should retrieve added signal");
5899
5900            let signal = signal.unwrap();
5901            prop_assert_eq!(signal.surface(), entity.as_str(), "Surface should match");
5902
5903            // Validation MUST pass
5904            prop_assert!(
5905                doc.is_valid(),
5906                "Document should be valid after from_text add"
5907            );
5908        }
5909
5910        /// Multiple occurrences: nth variant finds correct occurrence
5911        #[test]
5912        fn nth_occurrence_finds_correct(
5913            entity in "[A-Z][a-z]{2,5}",
5914            sep in " [a-z]+ ",
5915        ) {
5916            // Create text with multiple occurrences
5917            let text = format!("{}{}{}{}{}", entity, sep, entity, sep, entity);
5918            let mut doc = GroundedDocument::new("test", &text);
5919
5920            // Find each occurrence
5921            for n in 0..3 {
5922                let id = doc.add_signal_from_text_nth(&entity, "Entity", 0.9, n);
5923                prop_assert!(id.is_some(), "Should find occurrence {}", n);
5924            }
5925
5926            // 4th occurrence shouldn't exist
5927            let id = doc.add_signal_from_text_nth(&entity, "Entity", 0.9, 3);
5928            prop_assert!(id.is_none(), "Should NOT find 4th occurrence");
5929
5930            // All signals should be valid
5931            prop_assert!(doc.is_valid(), "All signals should be valid");
5932
5933            // Check offsets are distinct
5934            let offsets: Vec<_> = doc.signals()
5935                .iter()
5936                .filter_map(|s| s.text_offsets())
5937                .collect();
5938            let unique: std::collections::HashSet<_> = offsets.iter().collect();
5939            prop_assert_eq!(offsets.len(), unique.len(), "Each occurrence should have distinct offset");
5940        }
5941    }
5942
5943    // =========================================================================
5944    // TrackStats Tests
5945    // =========================================================================
5946
5947    #[test]
5948    fn test_track_stats_basic() {
5949        let text = "John met Mary. He said hello. John left.";
5950        let mut doc = GroundedDocument::new("test", text);
5951        let text_len = text.chars().count();
5952
5953        // Add signals for "John" at positions 0 and 30
5954        let s1 = doc.add_signal(Signal::new(0, Location::text(0, 4), "John", "Person", 0.95));
5955        let s2 = doc.add_signal(Signal::new(
5956            0,
5957            Location::text(30, 34),
5958            "John",
5959            "Person",
5960            0.90,
5961        ));
5962
5963        // Create track linking both Johns
5964        let track_id = doc.add_track(Track::new(0, "John".to_string()));
5965        doc.add_signal_to_track(s1, track_id, 0);
5966        doc.add_signal_to_track(s2, track_id, 1);
5967
5968        // Get track and compute stats
5969        let track = doc.get_track(track_id).unwrap();
5970        let stats = track.compute_stats(&doc, text_len);
5971
5972        assert_eq!(stats.chain_length, 2, "Two mentions");
5973        assert_eq!(stats.variation_count, 1, "One unique surface form");
5974        assert!(stats.spread > 0, "Spread should be positive");
5975        assert!(stats.relative_spread > 0.0 && stats.relative_spread < 1.0);
5976        assert!((stats.min_confidence.value() - 0.90).abs() < 0.01);
5977        assert!((stats.max_confidence.value() - 0.95).abs() < 0.01);
5978        assert!((stats.mean_confidence.value() - 0.925).abs() < 0.01);
5979    }
5980
5981    #[test]
5982    fn test_track_stats_singleton() {
5983        let text = "Paris is beautiful.";
5984        let mut doc = GroundedDocument::new("test", text);
5985        let text_len = text.chars().count();
5986
5987        let s1 = doc.add_signal(Signal::new(
5988            0,
5989            Location::text(0, 5),
5990            "Paris",
5991            "Location",
5992            0.88,
5993        ));
5994        let track_id = doc.add_track(Track::new(0, "Paris".to_string()));
5995        doc.add_signal_to_track(s1, track_id, 0);
5996
5997        let track = doc.get_track(track_id).unwrap();
5998        let stats = track.compute_stats(&doc, text_len);
5999
6000        assert_eq!(stats.chain_length, 1);
6001        assert_eq!(stats.spread, 0, "Singleton has zero spread");
6002        assert_eq!(stats.first_position, stats.last_position);
6003        assert!((stats.min_confidence.value() - stats.max_confidence.value()).abs() < 0.001);
6004    }
6005}