Skip to main content

anno_core/core/
entity.rs

1//! Entity types and structures for NER.
2//!
3//! # Design Philosophy (Research-Aligned)
4//!
5//! This module implements entity types informed by modern NER research:
6//!
7//! - **GLiNER/Bi-Encoder**: Entity types are *labels to match against*, not fixed classes.
8//!   Relations ("CEO of") are entities too - they're just labels in the same latent space.
9//!
10//! - **TPLinker/Joint Extraction**: Entities and relations can be extracted in a single pass.
11//!   The type system supports relation triggers as first-class mentions.
12//!
13//! - **Knowledge Graphs**: Entities can link to external knowledge bases (`kb_id`) for
14//!   coreference resolution and GraphRAG applications.
15//!
16//! # Type Hierarchy
17//!
18//! ```text
19//! Mention
20//! ├── Entity (single span)
21//! │   ├── Named (ML): Person, Organization, Location
22//! │   ├── Temporal (Pattern): Date, Time
23//! │   ├── Numeric (Pattern): Money, Percent, Quantity, Cardinal, Ordinal
24//! │   └── Contact (Pattern): Email, Url, Phone
25//! │
26//! └── Relation (connects entities)
27//!     └── Trigger text: "CEO of", "located in", "born on"
28//! ```
29//!
30//! # Design Principles
31//!
32//! 1. **Bi-encoder compatible**: Types are semantic labels, not fixed enums
33//! 2. **Joint extraction**: Relations are mentions with trigger spans
34//! 3. **Knowledge linking**: `kb_id` for connecting to external KBs
35//! 4. **Hierarchical confidence**: Coarse (linkage) + fine (type) scores
36//! 5. **Multi-modal ready**: Spans can be text offsets or visual bboxes
37
38use serde::{Deserialize, Serialize};
39use std::borrow::Cow;
40
41// ============================================================================
42// Entity Category (OntoNotes-inspired)
43// ============================================================================
44
45/// Category of entity based on detection characteristics and semantics.
46///
47/// Based on OntoNotes 5.0 categories with extensions for:
48/// - Structured data (Contact, patterns)
49/// - Knowledge graphs (Relation, for TPLinker/GLiNER joint extraction)
50#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
51#[non_exhaustive]
52pub enum EntityCategory {
53    /// Named entities for people/groups (ML-required).
54    /// Types: Person, NORP (nationalities/religious/political groups)
55    Agent,
56    /// Named entities for organizations/facilities (ML-required).
57    /// Types: Organization, Facility
58    Organization,
59    /// Named entities for places (ML-required).
60    /// Types: GPE (geo-political), Location (geographic)
61    Place,
62    /// Named entities for creative/conceptual (ML-required).
63    /// Types: Event, Product, WorkOfArt, Law, Language
64    Creative,
65    /// Temporal entities (pattern-detectable).
66    /// Types: Date, Time
67    Temporal,
68    /// Numeric entities (pattern-detectable).
69    /// Types: Money, Percent, Quantity, Cardinal, Ordinal
70    Numeric,
71    /// Contact/identifier entities (pattern-detectable).
72    /// Types: Email, Url, Phone
73    Contact,
74    /// Relation triggers for knowledge graph construction (ML-required).
75    /// Examples: "CEO of", "located in", "founded by"
76    /// In GLiNER bi-encoder, relations are just another label to match.
77    Relation,
78    /// Miscellaneous/unknown category
79    Misc,
80}
81
82impl EntityCategory {
83    /// Returns true if this category requires ML for detection.
84    #[must_use]
85    pub const fn requires_ml(&self) -> bool {
86        matches!(
87            self,
88            EntityCategory::Agent
89                | EntityCategory::Organization
90                | EntityCategory::Place
91                | EntityCategory::Creative
92                | EntityCategory::Relation
93        )
94    }
95
96    /// Returns true if this category can be detected via patterns.
97    #[must_use]
98    pub const fn pattern_detectable(&self) -> bool {
99        matches!(
100            self,
101            EntityCategory::Temporal | EntityCategory::Numeric | EntityCategory::Contact
102        )
103    }
104
105    /// Returns true if this is a relation (for knowledge graph construction).
106    #[must_use]
107    pub const fn is_relation(&self) -> bool {
108        matches!(self, EntityCategory::Relation)
109    }
110
111    /// Returns OntoNotes-compatible category name.
112    #[must_use]
113    pub const fn as_str(&self) -> &'static str {
114        match self {
115            EntityCategory::Agent => "agent",
116            EntityCategory::Organization => "organization",
117            EntityCategory::Place => "place",
118            EntityCategory::Creative => "creative",
119            EntityCategory::Temporal => "temporal",
120            EntityCategory::Numeric => "numeric",
121            EntityCategory::Contact => "contact",
122            EntityCategory::Relation => "relation",
123            EntityCategory::Misc => "misc",
124        }
125    }
126}
127
128impl std::fmt::Display for EntityCategory {
129    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
130        write!(f, "{}", self.as_str())
131    }
132}
133
134// ============================================================================
135// Entity Viewport (Research: Entity Manifolds)
136// ============================================================================
137
138/// Viewport context for multi-faceted entity representation.
139///
140/// # Research Background
141///
142/// The concept of "Entity Viewports" comes from the observation that
143/// real-world entities are not monolithic - they present different
144/// facets depending on context:
145///
146/// - "Marie Curie" in an **Academic** context: physicist, Nobel laureate
147/// - "Marie Curie" in a **Technical** context: radioactivity researcher, X-ray pioneer
148/// - "Marie Curie" in a **Personal** context: mother, immigrant, educator
149/// - "Marie Curie" in a **Medical** context: founder of mobile X-ray units
150///
151/// Rather than collapsing all information into a single vector,
152/// the viewport model preserves these distinctions and enables
153/// "projection" at query time.
154///
155/// # Usage in RAG Systems
156///
157/// When answering "What were Curie's scientific contributions?", retrieve
158/// facts from the `Academic` viewport. When answering "What was Curie's
159/// personal life like?", retrieve from `Personal`.
160///
161/// # Example
162///
163/// ```rust,ignore
164/// use anno_core::{Entity, EntityType, EntityViewport};
165///
166/// let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
167/// entity.viewport = Some(EntityViewport::Academic);
168/// assert!(entity.viewport.as_ref().unwrap().is_professional());
169/// ```
170#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
171#[non_exhaustive]
172pub enum EntityViewport {
173    /// Business/financial context (CEO, revenue, market cap)
174    Business,
175    /// Legal context (lawsuits, settlements, compliance)
176    Legal,
177    /// Technical/engineering context (patents, inventions, code)
178    Technical,
179    /// Academic/research context (publications, citations, grants)
180    Academic,
181    /// Personal/biographical context (family, hobbies, background)
182    Personal,
183    /// Political context (lobbying, donations, policy positions)
184    Political,
185    /// Media/public relations context (interviews, statements, PR)
186    Media,
187    /// Historical context (past roles, timeline events)
188    Historical,
189    /// Generic/unspecified context
190    #[default]
191    General,
192    /// Custom viewport with a descriptive label
193    Custom(String),
194}
195
196impl EntityViewport {
197    /// Human-readable label for the viewport.
198    #[must_use]
199    pub fn as_str(&self) -> &str {
200        match self {
201            EntityViewport::Business => "business",
202            EntityViewport::Legal => "legal",
203            EntityViewport::Technical => "technical",
204            EntityViewport::Academic => "academic",
205            EntityViewport::Personal => "personal",
206            EntityViewport::Political => "political",
207            EntityViewport::Media => "media",
208            EntityViewport::Historical => "historical",
209            EntityViewport::General => "general",
210            EntityViewport::Custom(s) => s,
211        }
212    }
213
214    /// Is this a professional/work-related viewport?
215    #[must_use]
216    pub const fn is_professional(&self) -> bool {
217        matches!(
218            self,
219            EntityViewport::Business
220                | EntityViewport::Legal
221                | EntityViewport::Technical
222                | EntityViewport::Academic
223                | EntityViewport::Political
224        )
225    }
226}
227
228impl std::str::FromStr for EntityViewport {
229    type Err = std::convert::Infallible;
230
231    fn from_str(s: &str) -> Result<Self, Self::Err> {
232        Ok(match s.to_lowercase().as_str() {
233            "business" | "financial" | "corporate" => EntityViewport::Business,
234            "legal" | "law" | "compliance" => EntityViewport::Legal,
235            "technical" | "engineering" | "tech" => EntityViewport::Technical,
236            "academic" | "research" | "scholarly" => EntityViewport::Academic,
237            "personal" | "biographical" | "private" => EntityViewport::Personal,
238            "political" | "policy" | "government" => EntityViewport::Political,
239            "media" | "press" | "pr" | "public_relations" => EntityViewport::Media,
240            "historical" | "history" | "past" => EntityViewport::Historical,
241            "general" | "generic" | "" => EntityViewport::General,
242            other => EntityViewport::Custom(other.to_string()),
243        })
244    }
245}
246
247impl std::fmt::Display for EntityViewport {
248    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
249        write!(f, "{}", self.as_str())
250    }
251}
252
253// ============================================================================
254// Entity Type
255// ============================================================================
256
257/// Entity type classification.
258///
259/// Organized into categories:
260/// - **Named** (ML-required): Person, Organization, Location
261/// - **Temporal** (pattern): Date, Time
262/// - **Numeric** (pattern): Money, Percent, Quantity, Cardinal, Ordinal
263/// - **Contact** (pattern): Email, Url, Phone
264///
265/// # Examples
266///
267/// ```rust,ignore
268/// use anno_core::EntityType;
269///
270/// let ty = EntityType::Email;
271/// assert!(ty.category().pattern_detectable());
272/// assert!(!ty.category().requires_ml());
273///
274/// let ty = EntityType::Person;
275/// assert!(ty.category().requires_ml());
276/// ```
277#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
278#[non_exhaustive]
279pub enum EntityType {
280    // === Named Entities (ML-required) ===
281    /// Person name (PER) - requires ML/context
282    Person,
283    /// Organization name (ORG) - requires ML/context
284    Organization,
285    /// Location/Place (LOC/GPE) - requires ML/context
286    Location,
287
288    // === Temporal Entities (Pattern-detectable) ===
289    /// Date expression (DATE) - pattern-detectable
290    Date,
291    /// Time expression (TIME) - pattern-detectable
292    Time,
293
294    // === Numeric Entities (Pattern-detectable) ===
295    /// Monetary value (MONEY) - pattern-detectable
296    Money,
297    /// Percentage (PERCENT) - pattern-detectable
298    Percent,
299    /// Quantity with unit (QUANTITY) - pattern-detectable
300    Quantity,
301    /// Cardinal number (CARDINAL) - pattern-detectable
302    Cardinal,
303    /// Ordinal number (ORDINAL) - pattern-detectable
304    Ordinal,
305
306    // === Contact Entities (Pattern-detectable) ===
307    /// Email address - pattern-detectable
308    Email,
309    /// URL/URI - pattern-detectable
310    Url,
311    /// Phone number - pattern-detectable
312    Phone,
313
314    // === Extensibility ===
315    /// Domain-specific custom type with explicit category
316    Custom {
317        /// Type name (e.g., "DISEASE", "PRODUCT", "EVENT")
318        name: String,
319        /// Category for this custom type
320        category: EntityCategory,
321    },
322
323    /// Legacy catch-all for unknown types (prefer Custom for new code)
324    #[serde(rename = "Other")]
325    Other(String),
326}
327
328impl EntityType {
329    /// Get the category of this entity type.
330    #[must_use]
331    pub fn category(&self) -> EntityCategory {
332        match self {
333            // Agent entities (people/groups)
334            EntityType::Person => EntityCategory::Agent,
335            // Organization entities
336            EntityType::Organization => EntityCategory::Organization,
337            // Place entities (locations)
338            EntityType::Location => EntityCategory::Place,
339            // Temporal entities
340            EntityType::Date | EntityType::Time => EntityCategory::Temporal,
341            // Numeric entities
342            EntityType::Money
343            | EntityType::Percent
344            | EntityType::Quantity
345            | EntityType::Cardinal
346            | EntityType::Ordinal => EntityCategory::Numeric,
347            // Contact entities
348            EntityType::Email | EntityType::Url | EntityType::Phone => EntityCategory::Contact,
349            // Custom with explicit category
350            EntityType::Custom { category, .. } => *category,
351            // Legacy Other - assume misc
352            EntityType::Other(_) => EntityCategory::Misc,
353        }
354    }
355
356    /// Returns true if this entity type requires ML for detection.
357    #[must_use]
358    pub fn requires_ml(&self) -> bool {
359        self.category().requires_ml()
360    }
361
362    /// Returns true if this entity type can be detected via patterns.
363    #[must_use]
364    pub fn pattern_detectable(&self) -> bool {
365        self.category().pattern_detectable()
366    }
367
368    /// Convert to standard label string (CoNLL/OntoNotes format).
369    #[must_use]
370    pub fn as_label(&self) -> &str {
371        match self {
372            EntityType::Person => "PER",
373            EntityType::Organization => "ORG",
374            EntityType::Location => "LOC",
375            EntityType::Date => "DATE",
376            EntityType::Time => "TIME",
377            EntityType::Money => "MONEY",
378            EntityType::Percent => "PERCENT",
379            EntityType::Quantity => "QUANTITY",
380            EntityType::Cardinal => "CARDINAL",
381            EntityType::Ordinal => "ORDINAL",
382            EntityType::Email => "EMAIL",
383            EntityType::Url => "URL",
384            EntityType::Phone => "PHONE",
385            EntityType::Custom { name, .. } => name.as_str(),
386            EntityType::Other(s) => s.as_str(),
387        }
388    }
389
390    /// Parse from standard label string.
391    ///
392    /// Handles various formats: CoNLL (PER), OntoNotes (PERSON), BIO (B-PER).
393    #[must_use]
394    pub fn from_label(label: &str) -> Self {
395        // Strip BIO prefix if present
396        let label = label
397            .strip_prefix("B-")
398            .or_else(|| label.strip_prefix("I-"))
399            .or_else(|| label.strip_prefix("E-"))
400            .or_else(|| label.strip_prefix("S-"))
401            .unwrap_or(label);
402
403        match label.to_uppercase().as_str() {
404            // Named entities (multiple variations)
405            "PER" | "PERSON" => EntityType::Person,
406            "ORG" | "ORGANIZATION" | "COMPANY" | "CORPORATION" => EntityType::Organization,
407            "LOC" | "LOCATION" | "GPE" | "GEO-LOC" => EntityType::Location,
408            // WNUT / FewNERD specific types (common in social media / Wikipedia)
409            "FACILITY" | "FAC" | "BUILDING" => {
410                EntityType::custom("BUILDING", EntityCategory::Place)
411            }
412            "PRODUCT" | "PROD" => EntityType::custom("PRODUCT", EntityCategory::Misc),
413            "EVENT" => EntityType::custom("EVENT", EntityCategory::Creative),
414            "CREATIVE-WORK" | "WORK_OF_ART" | "ART" => {
415                EntityType::custom("CREATIVE_WORK", EntityCategory::Creative)
416            }
417            "GROUP" | "NORP" => EntityType::custom("GROUP", EntityCategory::Agent),
418            // Temporal
419            "DATE" => EntityType::Date,
420            "TIME" => EntityType::Time,
421            // Numeric
422            "MONEY" | "CURRENCY" => EntityType::Money,
423            "PERCENT" | "PERCENTAGE" => EntityType::Percent,
424            "QUANTITY" => EntityType::Quantity,
425            "CARDINAL" => EntityType::Cardinal,
426            "ORDINAL" => EntityType::Ordinal,
427            // Contact
428            "EMAIL" => EntityType::Email,
429            "URL" | "URI" => EntityType::Url,
430            "PHONE" | "TELEPHONE" => EntityType::Phone,
431            // MISC variations
432            "MISC" | "MISCELLANEOUS" | "OTHER" => EntityType::Other("MISC".to_string()),
433            // Biomedical types
434            "DISEASE" | "DISORDER" => EntityType::custom("DISEASE", EntityCategory::Misc),
435            "CHEMICAL" | "DRUG" => EntityType::custom("CHEMICAL", EntityCategory::Misc),
436            "GENE" => EntityType::custom("GENE", EntityCategory::Misc),
437            "PROTEIN" => EntityType::custom("PROTEIN", EntityCategory::Misc),
438            // Unknown -> Other
439            other => EntityType::Other(other.to_string()),
440        }
441    }
442
443    /// Create a custom domain-specific entity type.
444    ///
445    /// # Examples
446    ///
447    /// ```rust,ignore
448    /// use anno_core::{EntityType, EntityCategory};
449    ///
450    /// // Medical entity - custom domain-specific type
451    /// let disease = EntityType::custom("DISEASE", EntityCategory::Agent);
452    /// assert!(disease.requires_ml());
453    ///
454    /// // ID patterns - can be detected via patterns
455    /// let product_id = EntityType::custom("PRODUCT_ID", EntityCategory::Misc);
456    /// ```
457    #[must_use]
458    pub fn custom(name: impl Into<String>, category: EntityCategory) -> Self {
459        EntityType::Custom {
460            name: name.into(),
461            category,
462        }
463    }
464}
465
466impl std::fmt::Display for EntityType {
467    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
468        write!(f, "{}", self.as_label())
469    }
470}
471
472impl std::str::FromStr for EntityType {
473    type Err = std::convert::Infallible;
474
475    /// Parse from standard label string. Never fails - unknown labels become `Other`.
476    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
477        Ok(Self::from_label(s))
478    }
479}
480
481// =============================================================================
482// Type Mapping for Domain-Specific Datasets
483// =============================================================================
484
485/// Maps domain-specific entity types to standard NER types.
486///
487/// # Research Context (Familiarity paper, arXiv:2412.10121)
488///
489/// Type mapping creates "label overlap" between training and evaluation:
490/// - Mapping ACTOR → Person increases overlap
491/// - This can inflate zero-shot F1 scores
492///
493/// Use `LabelShift::from_type_sets()` to quantify how much overlap exists.
494/// High overlap (>80%) means the evaluation is NOT truly zero-shot.
495///
496/// # When to Use TypeMapper
497///
498/// - Cross-dataset comparison (normalize schemas for fair eval)
499/// - Domain adaptation (map new labels to known types)
500///
501/// # When NOT to Use TypeMapper
502///
503/// - True zero-shot evaluation (keep labels distinct)
504/// - Measuring generalization (overlap hides generalization failures)
505///
506/// # Example
507///
508/// ```rust,ignore
509/// use anno_core::{TypeMapper, EntityType, EntityCategory};
510///
511/// // MIT Movie dataset mapping
512/// let mut mapper = TypeMapper::new();
513/// mapper.add("ACTOR", EntityType::Person);
514/// mapper.add("DIRECTOR", EntityType::Person);
515/// mapper.add("TITLE", EntityType::custom("WORK_OF_ART", EntityCategory::Creative));
516///
517/// assert_eq!(mapper.map("ACTOR"), Some(&EntityType::Person));
518/// assert_eq!(mapper.normalize("DIRECTOR"), EntityType::Person);
519/// ```
520#[derive(Debug, Clone, Default)]
521pub struct TypeMapper {
522    mappings: std::collections::HashMap<String, EntityType>,
523}
524
525impl TypeMapper {
526    /// Create empty mapper.
527    #[must_use]
528    pub fn new() -> Self {
529        Self::default()
530    }
531
532    /// Create mapper for MIT Movie dataset.
533    #[must_use]
534    pub fn mit_movie() -> Self {
535        let mut mapper = Self::new();
536        // Map to standard types where possible
537        mapper.add("ACTOR", EntityType::Person);
538        mapper.add("DIRECTOR", EntityType::Person);
539        mapper.add("CHARACTER", EntityType::Person);
540        mapper.add(
541            "TITLE",
542            EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
543        );
544        mapper.add("GENRE", EntityType::custom("GENRE", EntityCategory::Misc));
545        mapper.add("YEAR", EntityType::Date);
546        mapper.add("RATING", EntityType::custom("RATING", EntityCategory::Misc));
547        mapper.add("PLOT", EntityType::custom("PLOT", EntityCategory::Misc));
548        mapper
549    }
550
551    /// Create mapper for MIT Restaurant dataset.
552    #[must_use]
553    pub fn mit_restaurant() -> Self {
554        let mut mapper = Self::new();
555        mapper.add("RESTAURANT_NAME", EntityType::Organization);
556        mapper.add("LOCATION", EntityType::Location);
557        mapper.add(
558            "CUISINE",
559            EntityType::custom("CUISINE", EntityCategory::Misc),
560        );
561        mapper.add("DISH", EntityType::custom("DISH", EntityCategory::Misc));
562        mapper.add("PRICE", EntityType::Money);
563        mapper.add(
564            "AMENITY",
565            EntityType::custom("AMENITY", EntityCategory::Misc),
566        );
567        mapper.add("HOURS", EntityType::Time);
568        mapper
569    }
570
571    /// Create mapper for biomedical datasets (BC5CDR, NCBI).
572    #[must_use]
573    pub fn biomedical() -> Self {
574        let mut mapper = Self::new();
575        mapper.add(
576            "DISEASE",
577            EntityType::custom("DISEASE", EntityCategory::Agent),
578        );
579        mapper.add(
580            "CHEMICAL",
581            EntityType::custom("CHEMICAL", EntityCategory::Misc),
582        );
583        mapper.add("DRUG", EntityType::custom("DRUG", EntityCategory::Misc));
584        mapper.add("GENE", EntityType::custom("GENE", EntityCategory::Misc));
585        mapper.add(
586            "PROTEIN",
587            EntityType::custom("PROTEIN", EntityCategory::Misc),
588        );
589        // GENIA types
590        mapper.add("DNA", EntityType::custom("DNA", EntityCategory::Misc));
591        mapper.add("RNA", EntityType::custom("RNA", EntityCategory::Misc));
592        mapper.add(
593            "cell_line",
594            EntityType::custom("CELL_LINE", EntityCategory::Misc),
595        );
596        mapper.add(
597            "cell_type",
598            EntityType::custom("CELL_TYPE", EntityCategory::Misc),
599        );
600        mapper
601    }
602
603    /// Create mapper for social media NER datasets (TweetNER7, etc.).
604    #[must_use]
605    pub fn social_media() -> Self {
606        let mut mapper = Self::new();
607        // TweetNER7 types
608        mapper.add("person", EntityType::Person);
609        mapper.add("corporation", EntityType::Organization);
610        mapper.add("location", EntityType::Location);
611        mapper.add("group", EntityType::Organization);
612        mapper.add(
613            "product",
614            EntityType::custom("PRODUCT", EntityCategory::Misc),
615        );
616        mapper.add(
617            "creative_work",
618            EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
619        );
620        mapper.add("event", EntityType::custom("EVENT", EntityCategory::Misc));
621        mapper
622    }
623
624    /// Create mapper for manufacturing domain datasets (FabNER, etc.).
625    #[must_use]
626    pub fn manufacturing() -> Self {
627        let mut mapper = Self::new();
628        // FabNER entity types
629        mapper.add("MATE", EntityType::custom("MATERIAL", EntityCategory::Misc));
630        mapper.add("MANP", EntityType::custom("PROCESS", EntityCategory::Misc));
631        mapper.add("MACEQ", EntityType::custom("MACHINE", EntityCategory::Misc));
632        mapper.add(
633            "APPL",
634            EntityType::custom("APPLICATION", EntityCategory::Misc),
635        );
636        mapper.add("FEAT", EntityType::custom("FEATURE", EntityCategory::Misc));
637        mapper.add(
638            "PARA",
639            EntityType::custom("PARAMETER", EntityCategory::Misc),
640        );
641        mapper.add("PRO", EntityType::custom("PROPERTY", EntityCategory::Misc));
642        mapper.add(
643            "CHAR",
644            EntityType::custom("CHARACTERISTIC", EntityCategory::Misc),
645        );
646        mapper.add(
647            "ENAT",
648            EntityType::custom("ENABLING_TECHNOLOGY", EntityCategory::Misc),
649        );
650        mapper.add(
651            "CONPRI",
652            EntityType::custom("CONCEPT_PRINCIPLE", EntityCategory::Misc),
653        );
654        mapper.add(
655            "BIOP",
656            EntityType::custom("BIO_PROCESS", EntityCategory::Misc),
657        );
658        mapper.add(
659            "MANS",
660            EntityType::custom("MAN_STANDARD", EntityCategory::Misc),
661        );
662        mapper
663    }
664
665    /// Add a mapping from source label to target type.
666    pub fn add(&mut self, source: impl Into<String>, target: EntityType) {
667        self.mappings.insert(source.into().to_uppercase(), target);
668    }
669
670    /// Get mapped type for a label (returns None if not mapped).
671    #[must_use]
672    pub fn map(&self, label: &str) -> Option<&EntityType> {
673        self.mappings.get(&label.to_uppercase())
674    }
675
676    /// Normalize a label to EntityType, using mapping if available.
677    ///
678    /// Falls back to `EntityType::from_label()` if no mapping exists.
679    #[must_use]
680    pub fn normalize(&self, label: &str) -> EntityType {
681        self.map(label)
682            .cloned()
683            .unwrap_or_else(|| EntityType::from_label(label))
684    }
685
686    /// Check if a label is mapped.
687    #[must_use]
688    pub fn contains(&self, label: &str) -> bool {
689        self.mappings.contains_key(&label.to_uppercase())
690    }
691
692    /// Get all source labels.
693    pub fn labels(&self) -> impl Iterator<Item = &String> {
694        self.mappings.keys()
695    }
696}
697
698/// Extraction method used to identify an entity.
699///
700/// # Research Context
701///
702/// Different extraction methods have different strengths:
703///
704/// | Method | Precision | Recall | Generalization | Use Case |
705/// |--------|-----------|--------|----------------|----------|
706/// | Pattern | Very High | Low | N/A (format-based) | Dates, emails, money |
707/// | Neural | High | High | Good | General NER |
708/// | Lexicon | Very High | Low | None | Closed-domain entities |
709/// | SoftLexicon | Medium | High | Good for rare types | Low-resource NER |
710/// | GatedEnsemble | Highest | Highest | Contextual | Short texts, domain shift |
711///
712/// See `docs/` for repo-local notes and entry points.
713#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
714#[non_exhaustive]
715pub enum ExtractionMethod {
716    /// Regex pattern matching (high precision for structured data like dates, money).
717    /// Does not generalize - only detects format-based entities.
718    Pattern,
719
720    /// Neural model inference (BERT, GLiNER, etc.).
721    /// The recommended default for general NER. Generalizes to unseen entities.
722    #[default]
723    Neural,
724
725    /// Exact lexicon/gazetteer lookup (deprecated approach).
726    /// High precision on known entities, zero recall on novel entities.
727    /// Only use for closed domains (stock tickers, medical codes).
728    #[deprecated(since = "0.2.0", note = "Use Neural or GatedEnsemble instead")]
729    Lexicon,
730
731    /// Embedding-based soft lexicon matching.
732    /// Useful for low-resource languages and rare entity types.
733    /// See: Rijhwani et al. (2020) "Soft Gazetteers for Low-Resource NER"
734    SoftLexicon,
735
736    /// Gated ensemble: neural + lexicon with learned weighting.
737    /// Model learns when to trust lexicon vs. context.
738    /// See: Nie et al. (2021) "GEMNET: Effective Gated Gazetteer Representations"
739    GatedEnsemble,
740
741    /// Multiple methods agreed on this entity (high confidence).
742    Consensus,
743
744    /// Heuristic-based extraction (capitalization, word shape, context).
745    /// Used by heuristic backends that don't use neural models.
746    Heuristic,
747
748    /// Unknown or unspecified extraction method.
749    Unknown,
750
751    /// Legacy rule-based extraction (for backward compatibility).
752    #[deprecated(since = "0.2.0", note = "Use Heuristic or Pattern instead")]
753    Rule,
754
755    /// Legacy alias for Neural (for backward compatibility).
756    #[deprecated(since = "0.2.0", note = "Use Neural instead")]
757    ML,
758
759    /// Legacy alias for Consensus (for backward compatibility).
760    #[deprecated(since = "0.2.0", note = "Use Consensus instead")]
761    Ensemble,
762}
763
764impl ExtractionMethod {
765    /// Returns true if this extraction method produces probabilistically calibrated
766    /// confidence scores suitable for calibration analysis (ECE, Brier score, etc.).
767    ///
768    /// # Calibrated Methods
769    ///
770    /// - **Neural**: Softmax outputs are intended to be probabilistic (though may need
771    ///   temperature scaling for true calibration)
772    /// - **GatedEnsemble**: Produces learned probability estimates
773    /// - **SoftLexicon**: Embedding similarity is pseudo-probabilistic
774    ///
775    /// # Uncalibrated Methods
776    ///
777    /// - **Pattern**: Binary (match/no-match); confidence is typically hardcoded
778    /// - **Heuristic**: Arbitrary scores from hand-crafted rules
779    /// - **Lexicon**: Binary exact match
780    /// - **Consensus**: Agreement count, not a probability
781    ///
782    /// # Example
783    ///
784    /// ```rust,ignore
785    /// use anno_core::ExtractionMethod;
786    ///
787    /// assert!(ExtractionMethod::Neural.is_calibrated());
788    /// assert!(!ExtractionMethod::Pattern.is_calibrated());
789    /// assert!(!ExtractionMethod::Heuristic.is_calibrated());
790    /// ```
791    #[must_use]
792    pub const fn is_calibrated(&self) -> bool {
793        #[allow(deprecated)]
794        match self {
795            ExtractionMethod::Neural => true,
796            ExtractionMethod::GatedEnsemble => true,
797            ExtractionMethod::SoftLexicon => true,
798            ExtractionMethod::ML => true, // Legacy alias for Neural
799            // Everything else is not calibrated
800            ExtractionMethod::Pattern => false,
801            ExtractionMethod::Lexicon => false,
802            ExtractionMethod::Consensus => false,
803            ExtractionMethod::Heuristic => false,
804            ExtractionMethod::Unknown => false,
805            ExtractionMethod::Rule => false,
806            ExtractionMethod::Ensemble => false,
807        }
808    }
809
810    /// Returns the confidence interpretation for this extraction method.
811    ///
812    /// This helps users understand what the confidence score means:
813    /// - `"probability"`: Score approximates P(correct)
814    /// - `"heuristic_score"`: Score is a non-probabilistic quality measure
815    /// - `"binary"`: Score is 0 or 1 (or a fixed value for matches)
816    #[must_use]
817    pub const fn confidence_interpretation(&self) -> &'static str {
818        #[allow(deprecated)]
819        match self {
820            ExtractionMethod::Neural | ExtractionMethod::ML => "probability",
821            ExtractionMethod::GatedEnsemble | ExtractionMethod::SoftLexicon => "probability",
822            ExtractionMethod::Pattern | ExtractionMethod::Lexicon => "binary",
823            ExtractionMethod::Heuristic | ExtractionMethod::Rule => "heuristic_score",
824            ExtractionMethod::Consensus | ExtractionMethod::Ensemble => "agreement_ratio",
825            ExtractionMethod::Unknown => "unknown",
826        }
827    }
828}
829
830impl std::fmt::Display for ExtractionMethod {
831    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
832        #[allow(deprecated)]
833        match self {
834            ExtractionMethod::Pattern => write!(f, "pattern"),
835            ExtractionMethod::Neural => write!(f, "neural"),
836            ExtractionMethod::Lexicon => write!(f, "lexicon"),
837            ExtractionMethod::SoftLexicon => write!(f, "soft_lexicon"),
838            ExtractionMethod::GatedEnsemble => write!(f, "gated_ensemble"),
839            ExtractionMethod::Consensus => write!(f, "consensus"),
840            ExtractionMethod::Heuristic => write!(f, "heuristic"),
841            ExtractionMethod::Unknown => write!(f, "unknown"),
842            ExtractionMethod::Rule => write!(f, "heuristic"), // Legacy alias
843            ExtractionMethod::ML => write!(f, "neural"),      // Legacy alias
844            ExtractionMethod::Ensemble => write!(f, "consensus"), // Legacy alias
845        }
846    }
847}
848
849// =============================================================================
850// Lexicon Traits
851// =============================================================================
852
853/// Exact-match lexicon/gazetteer for entity lookup.
854///
855/// # Research Context
856///
857/// Gazetteers (lists of known entities) are a classic NER technique. Modern research
858/// suggests they are most valuable when:
859///
860/// 1. **Domain is closed**: Stock tickers, medical codes, known product catalogs
861/// 2. **Text is short**: where context is insufficient
862/// 3. **Used as features**: Input to neural model, not final output (Song et al. 2020)
863///
864/// They're harmful when:
865/// 1. **Domain is open**: Novel entities not in the list get missed
866/// 2. **Used as authority**: Hardcoded lookups inflate test scores but fail in production
867///
868/// # When to Use
869///
870/// ```text
871/// Decision: Should I use a Lexicon?
872///
873/// Is entity type CLOSED (fixed, known list)?
874/// ├─ Yes: Lexicon is appropriate
875/// │       Examples: stock tickers, ICD-10 codes, country names
876/// └─ No:  Use Neural extraction instead
877///         Examples: person names, organization names, products
878/// ```
879///
880/// # Example
881///
882/// ```rust,ignore
883/// use anno_core::{Lexicon, EntityType, HashMapLexicon};
884///
885/// // Create a domain-specific lexicon
886/// let mut lexicon = HashMapLexicon::new("stock_tickers");
887/// lexicon.insert("AAPL", EntityType::Organization, 0.99);
888/// lexicon.insert("GOOGL", EntityType::Organization, 0.99);
889///
890/// // Lookup
891/// if let Some((entity_type, confidence)) = lexicon.lookup("AAPL") {
892///     assert_eq!(entity_type, EntityType::Organization);
893///     assert!(confidence > 0.9);
894/// }
895/// ```
896///
897/// # References
898///
899/// - Song et al. (2020). "Improving Neural NER with Gazetteers"
900/// - Nie et al. (2021). "GEMNET: Effective Gated Gazetteer Representations"
901/// - Rijhwani et al. (2020). "Soft Gazetteers for Low-Resource NER"
902pub trait Lexicon: Send + Sync {
903    /// Lookup an exact string, returning entity type and confidence if found.
904    ///
905    /// Returns `None` if the text is not in the lexicon.
906    fn lookup(&self, text: &str) -> Option<(EntityType, f64)>;
907
908    /// Check if the lexicon contains this exact string.
909    fn contains(&self, text: &str) -> bool {
910        self.lookup(text).is_some()
911    }
912
913    /// Get the lexicon source identifier (for provenance tracking).
914    fn source(&self) -> &str;
915
916    /// Get approximate number of entries (for debugging/metrics).
917    fn len(&self) -> usize;
918
919    /// Check if lexicon is empty.
920    fn is_empty(&self) -> bool {
921        self.len() == 0
922    }
923}
924
925/// Simple HashMap-based lexicon implementation.
926///
927/// Suitable for small to medium lexicons (<100k entries).
928/// For larger lexicons, consider a trie-based or FST implementation.
929#[derive(Debug, Clone)]
930pub struct HashMapLexicon {
931    entries: std::collections::HashMap<String, (EntityType, f64)>,
932    source: String,
933}
934
935impl HashMapLexicon {
936    /// Create a new empty lexicon with the given source identifier.
937    #[must_use]
938    pub fn new(source: impl Into<String>) -> Self {
939        Self {
940            entries: std::collections::HashMap::new(),
941            source: source.into(),
942        }
943    }
944
945    /// Insert an entry into the lexicon.
946    pub fn insert(&mut self, text: impl Into<String>, entity_type: EntityType, confidence: f64) {
947        self.entries.insert(text.into(), (entity_type, confidence));
948    }
949
950    /// Create from an iterator of (text, type, confidence) tuples.
951    pub fn from_iter<I, S>(source: impl Into<String>, entries: I) -> Self
952    where
953        I: IntoIterator<Item = (S, EntityType, f64)>,
954        S: Into<String>,
955    {
956        let mut lexicon = Self::new(source);
957        for (text, entity_type, confidence) in entries {
958            lexicon.insert(text, entity_type, confidence);
959        }
960        lexicon
961    }
962
963    /// Get all entries as an iterator (for debugging).
964    pub fn entries(&self) -> impl Iterator<Item = (&str, &EntityType, f64)> {
965        self.entries.iter().map(|(k, (t, c))| (k.as_str(), t, *c))
966    }
967}
968
969impl Lexicon for HashMapLexicon {
970    fn lookup(&self, text: &str) -> Option<(EntityType, f64)> {
971        self.entries.get(text).cloned()
972    }
973
974    fn source(&self) -> &str {
975        &self.source
976    }
977
978    fn len(&self) -> usize {
979        self.entries.len()
980    }
981}
982
983/// Provenance information for an extracted entity.
984///
985/// Tracks where an entity came from for debugging, explainability,
986/// and confidence calibration in hybrid/ensemble systems.
987#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
988pub struct Provenance {
989    /// Name of the backend that produced this entity (e.g., "pattern", "bert-onnx")
990    pub source: Cow<'static, str>,
991    /// Extraction method used
992    pub method: ExtractionMethod,
993    /// Specific pattern/rule name (for pattern/rule-based extraction)
994    pub pattern: Option<Cow<'static, str>>,
995    /// Raw confidence from the source model (before any calibration)
996    pub raw_confidence: Option<f64>,
997    /// Model version for reproducibility (e.g., "gliner-v2.1", "bert-base-uncased-2024-01")
998    #[serde(default, skip_serializing_if = "Option::is_none")]
999    pub model_version: Option<Cow<'static, str>>,
1000    /// Timestamp when extraction occurred (ISO 8601)
1001    #[serde(default, skip_serializing_if = "Option::is_none")]
1002    pub timestamp: Option<String>,
1003}
1004
1005impl Provenance {
1006    /// Create provenance for regex-based extraction.
1007    #[must_use]
1008    pub fn pattern(pattern_name: &'static str) -> Self {
1009        Self {
1010            source: Cow::Borrowed("pattern"),
1011            method: ExtractionMethod::Pattern,
1012            pattern: Some(Cow::Borrowed(pattern_name)),
1013            raw_confidence: Some(1.0), // Patterns are deterministic
1014            model_version: None,
1015            timestamp: None,
1016        }
1017    }
1018
1019    /// Create provenance for ML-based extraction.
1020    ///
1021    /// Accepts both static strings and owned strings:
1022    /// ```rust,ignore
1023    /// // Static string (zero allocation)
1024    /// Provenance::ml("gliner", 0.95);
1025    ///
1026    /// // Owned string (dynamic model name)
1027    /// Provenance::ml(model_name.to_string(), 0.95);
1028    /// ```
1029    #[must_use]
1030    pub fn ml(model_name: impl Into<Cow<'static, str>>, confidence: f64) -> Self {
1031        Self {
1032            source: model_name.into(),
1033            method: ExtractionMethod::Neural,
1034            pattern: None,
1035            raw_confidence: Some(confidence),
1036            model_version: None,
1037            timestamp: None,
1038        }
1039    }
1040
1041    /// Deprecated: Use `ml()` instead which now accepts both static and owned strings.
1042    #[deprecated(
1043        since = "0.2.1",
1044        note = "Use ml() instead, it now accepts owned strings"
1045    )]
1046    #[must_use]
1047    pub fn ml_owned(model_name: impl Into<String>, confidence: f64) -> Self {
1048        Self::ml(Cow::Owned(model_name.into()), confidence)
1049    }
1050
1051    /// Create provenance for ensemble/hybrid extraction.
1052    #[must_use]
1053    pub fn ensemble(sources: &'static str) -> Self {
1054        Self {
1055            source: Cow::Borrowed(sources),
1056            method: ExtractionMethod::Consensus,
1057            pattern: None,
1058            raw_confidence: None,
1059            model_version: None,
1060            timestamp: None,
1061        }
1062    }
1063
1064    /// Create provenance with model version for reproducibility.
1065    #[must_use]
1066    pub fn with_version(mut self, version: &'static str) -> Self {
1067        self.model_version = Some(Cow::Borrowed(version));
1068        self
1069    }
1070
1071    /// Create provenance with timestamp.
1072    #[must_use]
1073    pub fn with_timestamp(mut self, timestamp: impl Into<String>) -> Self {
1074        self.timestamp = Some(timestamp.into());
1075        self
1076    }
1077}
1078
1079// ============================================================================
1080// Span Types (Multi-Modal Support)
1081// ============================================================================
1082
1083/// A span locator for text and visual modalities.
1084///
1085/// `Span` is a **simplified subset** of [`grounded::Location`] designed for
1086/// the detection layer (`Entity`). It covers the most common cases:
1087///
1088/// - Text offsets (traditional NER)
1089/// - Bounding boxes (visual document understanding)
1090/// - Hybrid (OCR with both text and visual location)
1091///
1092/// # Relationship to `Location`
1093///
1094/// | `Span` variant | `Location` equivalent |
1095/// |----------------|-----------------------|
1096/// | `Text` | `Location::Text` |
1097/// | `BoundingBox` | `Location::BoundingBox` |
1098/// | `Hybrid` | `Location::TextWithBbox` |
1099///
1100/// For modalities not covered by `Span` (temporal, cuboid, genomic, discontinuous),
1101/// use `Location` directly via the canonical `Signal` → `Track` → `Identity` pipeline.
1102///
1103/// # Conversion
1104///
1105/// - `Span → Location`: Always succeeds via `Location::from(&span)`
1106/// - `Location → Span`: Use `location.to_span()`, returns `None` for unsupported variants
1107///
1108/// [`grounded::Location`]: super::grounded::Location
1109#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
1110pub enum Span {
1111    /// Text span with **character offsets** (start, end).
1112    ///
1113    /// Offsets are Unicode scalar value indices (what `text.chars()` counts),
1114    /// consistent with `Entity.start/end` and `grounded::Location::Text`.
1115    Text {
1116        /// Start character offset (inclusive)
1117        start: usize,
1118        /// End character offset (exclusive)
1119        end: usize,
1120    },
1121    /// Visual bounding box (normalized 0.0-1.0 coordinates)
1122    /// For ColPali: image patch locations
1123    BoundingBox {
1124        /// X coordinate (normalized 0.0-1.0)
1125        x: f32,
1126        /// Y coordinate (normalized 0.0-1.0)
1127        y: f32,
1128        /// Width (normalized 0.0-1.0)
1129        width: f32,
1130        /// Height (normalized 0.0-1.0)
1131        height: f32,
1132        /// Optional page number (for multi-page documents)
1133        page: Option<u32>,
1134    },
1135    /// Hybrid: both text and visual location (for OCR-verified extraction)
1136    Hybrid {
1137        /// Start character offset (inclusive)
1138        start: usize,
1139        /// End character offset (exclusive)
1140        end: usize,
1141        /// Bounding box for visual location
1142        bbox: Box<Span>,
1143    },
1144}
1145
1146impl Span {
1147    /// Create a text span.
1148    #[must_use]
1149    pub const fn text(start: usize, end: usize) -> Self {
1150        Self::Text { start, end }
1151    }
1152
1153    /// Create a bounding box span with normalized coordinates.
1154    #[must_use]
1155    pub fn bbox(x: f32, y: f32, width: f32, height: f32) -> Self {
1156        Self::BoundingBox {
1157            x,
1158            y,
1159            width,
1160            height,
1161            page: None,
1162        }
1163    }
1164
1165    /// Create a bounding box with page number.
1166    #[must_use]
1167    pub fn bbox_on_page(x: f32, y: f32, width: f32, height: f32, page: u32) -> Self {
1168        Self::BoundingBox {
1169            x,
1170            y,
1171            width,
1172            height,
1173            page: Some(page),
1174        }
1175    }
1176
1177    /// Check if this is a text span.
1178    #[must_use]
1179    pub const fn is_text(&self) -> bool {
1180        matches!(self, Self::Text { .. } | Self::Hybrid { .. })
1181    }
1182
1183    /// Check if this has visual location.
1184    #[must_use]
1185    pub const fn is_visual(&self) -> bool {
1186        matches!(self, Self::BoundingBox { .. } | Self::Hybrid { .. })
1187    }
1188
1189    /// Get text offsets if available.
1190    #[must_use]
1191    pub const fn text_offsets(&self) -> Option<(usize, usize)> {
1192        match self {
1193            Self::Text { start, end } => Some((*start, *end)),
1194            Self::Hybrid { start, end, .. } => Some((*start, *end)),
1195            Self::BoundingBox { .. } => None,
1196        }
1197    }
1198
1199    /// Calculate span length for text spans.
1200    #[must_use]
1201    pub fn len(&self) -> usize {
1202        match self {
1203            Self::Text { start, end } => end.saturating_sub(*start),
1204            Self::Hybrid { start, end, .. } => end.saturating_sub(*start),
1205            Self::BoundingBox { .. } => 0,
1206        }
1207    }
1208
1209    /// Check if span is empty.
1210    #[must_use]
1211    pub fn is_empty(&self) -> bool {
1212        self.len() == 0
1213    }
1214}
1215
1216// ============================================================================
1217// Discontinuous Spans (W2NER/ACE-style)
1218// ============================================================================
1219
1220/// A discontinuous span representing non-contiguous entity mentions.
1221///
1222/// Some entities span multiple non-adjacent text regions:
1223/// - "severe \[pain\] in the \[abdomen\]" → "severe abdominal pain"
1224/// - "the \[president\] ... \[Obama\]" → coreference
1225///
1226/// This is required for:
1227/// - **Medical NER**: Anatomical modifiers separated from findings
1228/// - **Legal NER**: Parties referenced across clauses
1229/// - **W2NER**: Word-word relation grids that detect discontinuous entities
1230///
1231/// # Offset Unit (CRITICAL)
1232///
1233/// `DiscontinuousSpan` uses **character offsets** (Unicode scalar value indices),
1234/// consistent with [`Entity::start`](super::entity::Entity::start) /
1235/// [`Entity::end`](super::entity::Entity::end) and `anno::core::grounded::Location`.
1236///
1237/// This is intentionally *not* byte offsets. If you have byte offsets (from regex,
1238/// `str::find`, tokenizers, etc.), convert them to character offsets first (see
1239/// `anno::offset::SpanConverter` in the `anno` crate).
1240///
1241/// # Example
1242///
1243/// ```rust,ignore
1244/// use anno_core::DiscontinuousSpan;
1245///
1246/// // "severe pain in the abdomen" where "severe" modifies "pain"
1247/// // but they're separated by other words
1248/// let span = DiscontinuousSpan::new(vec![
1249///     0..6,   // "severe"
1250///     12..16, // "pain"
1251/// ]);
1252///
1253/// assert_eq!(span.num_segments(), 2);
1254/// assert!(span.is_discontinuous());
1255/// ```
1256#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1257pub struct DiscontinuousSpan {
1258    /// Non-overlapping segments, sorted by start position.
1259    /// Each `Range<usize>` represents (start_char, end_char).
1260    segments: Vec<std::ops::Range<usize>>,
1261}
1262
1263impl DiscontinuousSpan {
1264    /// Create a new discontinuous span from segments.
1265    ///
1266    /// Segments are sorted and validated (no overlaps).
1267    #[must_use]
1268    pub fn new(mut segments: Vec<std::ops::Range<usize>>) -> Self {
1269        // Sort by start position
1270        segments.sort_by_key(|r| r.start);
1271        Self { segments }
1272    }
1273
1274    /// Create from a single contiguous span.
1275    #[must_use]
1276    #[allow(clippy::single_range_in_vec_init)] // Intentional: contiguous is special case of discontinuous
1277    pub fn contiguous(start: usize, end: usize) -> Self {
1278        Self {
1279            segments: vec![start..end],
1280        }
1281    }
1282
1283    /// Number of segments.
1284    #[must_use]
1285    pub fn num_segments(&self) -> usize {
1286        self.segments.len()
1287    }
1288
1289    /// True if this spans multiple non-adjacent regions.
1290    #[must_use]
1291    pub fn is_discontinuous(&self) -> bool {
1292        self.segments.len() > 1
1293    }
1294
1295    /// True if this is a single contiguous span.
1296    #[must_use]
1297    pub fn is_contiguous(&self) -> bool {
1298        self.segments.len() <= 1
1299    }
1300
1301    /// Get the segments.
1302    #[must_use]
1303    pub fn segments(&self) -> &[std::ops::Range<usize>] {
1304        &self.segments
1305    }
1306
1307    /// Get the overall bounding range (start of first to end of last).
1308    #[must_use]
1309    pub fn bounding_range(&self) -> Option<std::ops::Range<usize>> {
1310        if self.segments.is_empty() {
1311            return None;
1312        }
1313        let start = self.segments.first()?.start;
1314        let end = self.segments.last()?.end;
1315        Some(start..end)
1316    }
1317
1318    /// Total character length (sum of all segments).
1319    ///
1320    #[must_use]
1321    pub fn total_len(&self) -> usize {
1322        self.segments.iter().map(|r| r.end - r.start).sum()
1323    }
1324
1325    /// Extract text from each segment and join with separator.
1326    #[must_use]
1327    pub fn extract_text(&self, text: &str, separator: &str) -> String {
1328        self.segments
1329            .iter()
1330            .map(|r| {
1331                let start = r.start;
1332                let len = r.end.saturating_sub(r.start);
1333                text.chars().skip(start).take(len).collect::<String>()
1334            })
1335            .collect::<Vec<_>>()
1336            .join(separator)
1337    }
1338
1339    /// Check if a character position falls within any segment.
1340    ///
1341    /// # Arguments
1342    ///
1343    /// * `pos` - Character offset to check (Unicode scalar value index)
1344    ///
1345    /// # Returns
1346    ///
1347    /// `true` if the character position falls within any segment of this span.
1348    #[must_use]
1349    pub fn contains(&self, pos: usize) -> bool {
1350        self.segments.iter().any(|r| r.contains(&pos))
1351    }
1352
1353    /// Convert to a regular Span (uses bounding range, loses discontinuity info).
1354    #[must_use]
1355    pub fn to_span(&self) -> Option<Span> {
1356        self.bounding_range().map(|r| Span::Text {
1357            start: r.start,
1358            end: r.end,
1359        })
1360    }
1361}
1362
1363impl From<std::ops::Range<usize>> for DiscontinuousSpan {
1364    fn from(range: std::ops::Range<usize>) -> Self {
1365        Self::contiguous(range.start, range.end)
1366    }
1367}
1368
1369impl Default for Span {
1370    fn default() -> Self {
1371        Self::Text { start: 0, end: 0 }
1372    }
1373}
1374
1375// ============================================================================
1376// Hierarchical Confidence (Coarse-to-Fine)
1377// ============================================================================
1378
1379/// Hierarchical confidence scores for coarse-to-fine extraction.
1380///
1381/// Research (HiNet, InfoHier) shows that extraction benefits from
1382/// decomposed confidence:
1383/// - **Linkage**: "Is there ANY entity here?" (binary, fast filter)
1384/// - **Type**: "What type is it?" (fine-grained classification)
1385/// - **Boundary**: "Where exactly does it start/end?" (span refinement)
1386#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
1387pub struct HierarchicalConfidence {
1388    /// Coarse: probability that this span contains ANY entity (0.0-1.0)
1389    /// Used for early filtering in the TPLinker "handshaking" matrix.
1390    pub linkage: f32,
1391    /// Fine: probability that the type classification is correct (0.0-1.0)
1392    pub type_score: f32,
1393    /// Boundary: confidence in the exact span boundaries (0.0-1.0)
1394    /// Low for entities with fuzzy boundaries (e.g., "the CEO" vs "CEO")
1395    pub boundary: f32,
1396}
1397
1398impl HierarchicalConfidence {
1399    /// Create hierarchical confidence with all scores.
1400    #[must_use]
1401    pub fn new(linkage: f32, type_score: f32, boundary: f32) -> Self {
1402        Self {
1403            linkage: linkage.clamp(0.0, 1.0),
1404            type_score: type_score.clamp(0.0, 1.0),
1405            boundary: boundary.clamp(0.0, 1.0),
1406        }
1407    }
1408
1409    /// Create from a single confidence score (legacy compatibility).
1410    /// Assigns same score to all levels.
1411    #[must_use]
1412    pub fn from_single(confidence: f32) -> Self {
1413        let c = confidence.clamp(0.0, 1.0);
1414        Self {
1415            linkage: c,
1416            type_score: c,
1417            boundary: c,
1418        }
1419    }
1420
1421    /// Calculate combined confidence (geometric mean).
1422    /// Geometric mean penalizes low scores more than arithmetic mean.
1423    #[must_use]
1424    pub fn combined(&self) -> f32 {
1425        (self.linkage * self.type_score * self.boundary).powf(1.0 / 3.0)
1426    }
1427
1428    /// Calculate combined confidence as f64 for legacy compatibility.
1429    #[must_use]
1430    pub fn as_f64(&self) -> f64 {
1431        self.combined() as f64
1432    }
1433
1434    /// Check if passes minimum threshold at all levels.
1435    #[must_use]
1436    pub fn passes_threshold(&self, linkage_min: f32, type_min: f32, boundary_min: f32) -> bool {
1437        self.linkage >= linkage_min && self.type_score >= type_min && self.boundary >= boundary_min
1438    }
1439}
1440
1441impl Default for HierarchicalConfidence {
1442    fn default() -> Self {
1443        Self {
1444            linkage: 1.0,
1445            type_score: 1.0,
1446            boundary: 1.0,
1447        }
1448    }
1449}
1450
1451impl From<f64> for HierarchicalConfidence {
1452    fn from(confidence: f64) -> Self {
1453        Self::from_single(confidence as f32)
1454    }
1455}
1456
1457impl From<f32> for HierarchicalConfidence {
1458    fn from(confidence: f32) -> Self {
1459        Self::from_single(confidence)
1460    }
1461}
1462
1463// ============================================================================
1464// Ragged Batch (ModernBERT Unpadding)
1465// ============================================================================
1466
1467/// A ragged (unpadded) batch for efficient ModernBERT inference.
1468///
1469/// ModernBERT achieves its speed advantage by avoiding padding tokens entirely.
1470/// Instead of `[batch, max_seq_len]`, it uses a single contiguous 1D sequence
1471/// with offset indices to track document boundaries.
1472///
1473/// # Memory Layout
1474///
1475/// ```text
1476/// Traditional (padded):
1477/// [doc1_tok1, doc1_tok2, PAD, PAD, PAD]  <- wasted compute
1478/// [doc2_tok1, doc2_tok2, doc2_tok3, PAD, PAD]
1479///
1480/// Ragged (unpadded):
1481/// [doc1_tok1, doc1_tok2, doc2_tok1, doc2_tok2, doc2_tok3]
1482/// cumulative_offsets: [0, 2, 5]  <- doc1 is [0..2], doc2 is [2..5]
1483/// ```
1484#[derive(Debug, Clone)]
1485pub struct RaggedBatch {
1486    /// Token IDs flattened into a single contiguous array.
1487    /// Shape: `[total_tokens]` (1D, no padding)
1488    pub token_ids: Vec<u32>,
1489    /// Cumulative sequence lengths.
1490    /// Length: batch_size + 1
1491    /// Document i spans tokens \[offsets\[i\]..offsets\[i+1\])
1492    pub cumulative_offsets: Vec<u32>,
1493    /// Maximum sequence length in this batch (for kernel bounds).
1494    pub max_seq_len: usize,
1495}
1496
1497impl RaggedBatch {
1498    /// Create a new ragged batch from sequences.
1499    pub fn from_sequences(sequences: &[Vec<u32>]) -> Self {
1500        let total_tokens: usize = sequences.iter().map(|s| s.len()).sum();
1501        let mut token_ids = Vec::with_capacity(total_tokens);
1502        let mut cumulative_offsets = Vec::with_capacity(sequences.len() + 1);
1503        let mut max_seq_len = 0;
1504
1505        cumulative_offsets.push(0);
1506        for seq in sequences {
1507            token_ids.extend_from_slice(seq);
1508            // Check for overflow: u32::MAX is 4,294,967,295
1509            // If token_ids.len() exceeds this, we'll truncate (which is a bug)
1510            // but in practice, this is unlikely for reasonable batch sizes
1511            let len = token_ids.len();
1512            if len > u32::MAX as usize {
1513                // This would overflow - use saturating cast to prevent panic
1514                // but log a warning as this indicates a problem
1515                log::warn!(
1516                    "Token count {} exceeds u32::MAX, truncating to {}",
1517                    len,
1518                    u32::MAX
1519                );
1520                cumulative_offsets.push(u32::MAX);
1521            } else {
1522                cumulative_offsets.push(len as u32);
1523            }
1524            max_seq_len = max_seq_len.max(seq.len());
1525        }
1526
1527        Self {
1528            token_ids,
1529            cumulative_offsets,
1530            max_seq_len,
1531        }
1532    }
1533
1534    /// Get the number of documents in this batch.
1535    #[must_use]
1536    pub fn batch_size(&self) -> usize {
1537        self.cumulative_offsets.len().saturating_sub(1)
1538    }
1539
1540    /// Get the total number of tokens (no padding).
1541    #[must_use]
1542    pub fn total_tokens(&self) -> usize {
1543        self.token_ids.len()
1544    }
1545
1546    /// Get token range for a specific document.
1547    #[must_use]
1548    pub fn doc_range(&self, doc_idx: usize) -> Option<std::ops::Range<usize>> {
1549        if doc_idx + 1 < self.cumulative_offsets.len() {
1550            let start = self.cumulative_offsets[doc_idx] as usize;
1551            let end = self.cumulative_offsets[doc_idx + 1] as usize;
1552            Some(start..end)
1553        } else {
1554            None
1555        }
1556    }
1557
1558    /// Get tokens for a specific document.
1559    #[must_use]
1560    pub fn doc_tokens(&self, doc_idx: usize) -> Option<&[u32]> {
1561        self.doc_range(doc_idx).map(|r| &self.token_ids[r])
1562    }
1563
1564    /// Calculate memory saved vs padded batch.
1565    #[must_use]
1566    pub fn padding_savings(&self) -> f64 {
1567        let padded_size = self.batch_size() * self.max_seq_len;
1568        if padded_size == 0 {
1569            return 0.0;
1570        }
1571        1.0 - (self.total_tokens() as f64 / padded_size as f64)
1572    }
1573}
1574
1575// ============================================================================
1576// Span Candidate Generation
1577// ============================================================================
1578
/// One candidate token span to be scored during entity extraction.
///
/// In GLiNER/bi-encoder systems, we generate all possible spans up to a
/// maximum width and score them against entity type embeddings.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SpanCandidate {
    /// Document index in the batch
    pub doc_idx: u32,
    /// Start token index (within the document)
    pub start: u32,
    /// End token index (exclusive)
    pub end: u32,
}

impl SpanCandidate {
    /// Build a candidate for tokens `start..end` of document `doc_idx`.
    #[must_use]
    pub const fn new(doc_idx: u32, start: u32, end: u32) -> Self {
        Self { doc_idx, start, end }
    }

    /// Number of tokens covered by the span.
    ///
    /// Yields 0 when `end` is not greater than `start`.
    #[must_use]
    pub const fn width(&self) -> u32 {
        match self.end.checked_sub(self.start) {
            Some(token_count) => token_count,
            None => 0,
        }
    }
}
1610
1611/// Generate all valid span candidates for a ragged batch.
1612///
1613/// This is the "gnarly" operation in GLiNER - efficiently enumerating
1614/// all valid spans without O(N^2) memory allocation.
1615pub fn generate_span_candidates(batch: &RaggedBatch, max_width: usize) -> Vec<SpanCandidate> {
1616    let mut candidates = Vec::new();
1617
1618    for doc_idx in 0..batch.batch_size() {
1619        if let Some(range) = batch.doc_range(doc_idx) {
1620            let doc_len = range.len();
1621            // Generate all spans [i, j) where j - i <= max_width
1622            for start in 0..doc_len {
1623                let max_end = (start + max_width).min(doc_len);
1624                for end in (start + 1)..=max_end {
1625                    candidates.push(SpanCandidate::new(doc_idx as u32, start as u32, end as u32));
1626                }
1627            }
1628        }
1629    }
1630
1631    candidates
1632}
1633
1634/// Generate span candidates with early filtering.
1635///
1636/// Uses a linkage mask to skip low-probability spans (TPLinker optimization).
1637pub fn generate_filtered_candidates(
1638    batch: &RaggedBatch,
1639    max_width: usize,
1640    linkage_mask: &[f32],
1641    threshold: f32,
1642) -> Vec<SpanCandidate> {
1643    let mut candidates = Vec::new();
1644    let mut mask_idx = 0;
1645
1646    for doc_idx in 0..batch.batch_size() {
1647        if let Some(range) = batch.doc_range(doc_idx) {
1648            let doc_len = range.len();
1649            for start in 0..doc_len {
1650                let max_end = (start + max_width).min(doc_len);
1651                for end in (start + 1)..=max_end {
1652                    // Only include if linkage probability exceeds threshold
1653                    if mask_idx < linkage_mask.len() && linkage_mask[mask_idx] >= threshold {
1654                        candidates.push(SpanCandidate::new(
1655                            doc_idx as u32,
1656                            start as u32,
1657                            end as u32,
1658                        ));
1659                    }
1660                    mask_idx += 1;
1661                }
1662            }
1663        }
1664    }
1665
1666    candidates
1667}
1668
1669// ============================================================================
1670// Entity (Extended)
1671// ============================================================================
1672
/// A recognized named entity or relation trigger.
///
/// # Entity Structure
///
/// ```text
/// "Contact John at john@example.com on Jan 15"
///          ^^^^    ^^^^^^^^^^^^^^^^    ^^^^^^
///          PER     EMAIL               DATE
///          |       |                   |
///          Named   Contact             Temporal
///          (ML)    (Pattern)           (Pattern)
/// ```
///
/// # Core Fields (Stable API)
///
/// - `text`, `entity_type`, `start`, `end`, `confidence` — always present
/// - `normalized`, `provenance` — commonly used optional fields
/// - `kb_id`, `canonical_id` — knowledge graph and coreference support
///
/// # Extended Fields (Research/Experimental)
///
/// The following fields support advanced research applications but may evolve:
///
/// | Field | Purpose | Status |
/// |-------|---------|--------|
/// | `visual_span` | Multi-modal (ColPali) extraction | Experimental |
/// | `discontinuous_span` | W2NER non-contiguous entities | Experimental |
/// | `valid_from`, `valid_until` | Temporal knowledge graphs | Research |
/// | `viewport` | Multi-faceted entity representation | Research |
/// | `hierarchical_confidence` | Coarse-to-fine NER | Experimental |
///
/// Every optional field is annotated with
/// `#[serde(default, skip_serializing_if = "Option::is_none")]`: a `None` value
/// is omitted from serialized output, and a missing key deserializes to `None`.
///
/// # Knowledge Graph Support
///
/// For GraphRAG and coreference resolution, entities support:
/// - `kb_id`: External knowledge base identifier (e.g., Wikidata Q-ID)
/// - `canonical_id`: Local coreference cluster ID (links "John" and "he")
///
/// # Normalization
///
/// Entities can have a normalized form for downstream processing:
/// - Dates: "Jan 15" → "2024-01-15" (ISO 8601)
/// - Money: "$1.5M" → "1500000 USD"
/// - Locations: "NYC" → "New York City"
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Entity {
    /// Entity text (surface form as it appears in source)
    pub text: String,
    /// Entity type classification
    pub entity_type: EntityType,
    /// Start position (character offset, NOT byte offset).
    ///
    /// For Unicode text, character offsets differ from byte offsets.
    /// Use `anno::offset::bytes_to_chars` to convert if needed.
    pub start: usize,
    /// End position (character offset, exclusive).
    ///
    /// For Unicode text, character offsets differ from byte offsets.
    /// Use `anno::offset::bytes_to_chars` to convert if needed.
    pub end: usize,
    /// Confidence score, intended to lie in `[0.0, 1.0]`.
    ///
    /// `Entity::new` clamps its argument into that range, but the field is
    /// public, so `validate` still checks the range.
    pub confidence: f64,
    /// Normalized/canonical form (e.g., "Jan 15" → "2024-01-15")
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub normalized: Option<String>,
    /// Provenance: which backend/method produced this entity
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub provenance: Option<Provenance>,
    /// External knowledge base ID (e.g., "Q7186" for Marie Curie in Wikidata).
    /// Used for entity linking and GraphRAG applications.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub kb_id: Option<String>,
    /// Local coreference cluster ID.
    /// Multiple mentions with the same `canonical_id` refer to the same entity.
    /// Example: "Marie Curie" and "she" might share `canonical_id = CanonicalId(42)`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub canonical_id: Option<super::types::CanonicalId>,
    /// Hierarchical confidence (coarse-to-fine).
    /// Provides linkage, type, and boundary scores separately.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub hierarchical_confidence: Option<HierarchicalConfidence>,
    /// Visual span for multi-modal (ColPali) extraction.
    /// When set, provides bounding box location in addition to text offsets.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub visual_span: Option<Span>,
    /// Discontinuous span for non-contiguous entity mentions (W2NER support).
    /// When set, `total_len` sums the segment lengths instead of using
    /// `end - start`.
    /// Example: "New York and LA \[airports\]" where "airports" modifies both.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub discontinuous_span: Option<DiscontinuousSpan>,
    // =========================================================================
    // Temporal Validity (Research: Temporal Knowledge Graphs)
    // =========================================================================
    /// Start of temporal validity interval for this entity assertion.
    ///
    /// Entities are facts that may change over time:
    /// - "Satya Nadella is CEO of Microsoft" is valid from [2014, present]
    /// - "Steve Ballmer was CEO of Microsoft" was valid from [2000, 2014]
    ///
    /// When `None`, no start bound is recorded: `valid_at` then only applies
    /// `valid_until` (if set). With both bounds unset the entity is treated as
    /// atemporal (a timeless fact like "Paris is in France") and always valid.
    ///
    /// # Example
    /// ```rust,ignore
    /// use anno_core::{Entity, EntityType};
    /// use chrono::{TimeZone, Utc};
    ///
    /// let mut entity = Entity::new("CEO of Microsoft", EntityType::Person, 0, 16, 0.9);
    /// entity.valid_from = Some(Utc.with_ymd_and_hms(2008, 10, 1, 0, 0, 0).unwrap());
    /// ```
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub valid_from: Option<chrono::DateTime<chrono::Utc>>,
    /// End of temporal validity interval for this entity assertion.
    ///
    /// When `None` and `valid_from` is set, the fact is currently valid.
    /// When both are `None`, the entity is atemporal.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub valid_until: Option<chrono::DateTime<chrono::Utc>>,
    // =========================================================================
    // Viewport / Context (Research: Entity Manifolds)
    // =========================================================================
    /// Viewport context for multi-faceted entity representation.
    ///
    /// The same real-world entity can have different "faces" in different contexts:
    /// - "Marie Curie" in an academic context: professor, researcher
    /// - "Marie Curie" in a scientific context: physicist, chemist
    /// - "Marie Curie" in a personal context: mother, educator
    ///
    /// This enables "holographic" entity projection at query time:
    /// given a query context, project the entity manifold to the relevant viewport.
    ///
    /// # Example
    /// ```rust,ignore
    /// use anno_core::{Entity, EntityType, EntityViewport};
    ///
    /// let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
    /// entity.viewport = Some(EntityViewport::Academic);
    /// ```
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub viewport: Option<EntityViewport>,
}
1817
1818impl Entity {
1819    /// Create a new entity.
1820    #[must_use]
1821    pub fn new(
1822        text: impl Into<String>,
1823        entity_type: EntityType,
1824        start: usize,
1825        end: usize,
1826        confidence: f64,
1827    ) -> Self {
1828        Self {
1829            text: text.into(),
1830            entity_type,
1831            start,
1832            end,
1833            confidence: confidence.clamp(0.0, 1.0),
1834            normalized: None,
1835            provenance: None,
1836            kb_id: None,
1837            canonical_id: None,
1838            hierarchical_confidence: None,
1839            visual_span: None,
1840            discontinuous_span: None,
1841            valid_from: None,
1842            valid_until: None,
1843            viewport: None,
1844        }
1845    }
1846
1847    /// Create a new entity with provenance information.
1848    #[must_use]
1849    pub fn with_provenance(
1850        text: impl Into<String>,
1851        entity_type: EntityType,
1852        start: usize,
1853        end: usize,
1854        confidence: f64,
1855        provenance: Provenance,
1856    ) -> Self {
1857        Self {
1858            text: text.into(),
1859            entity_type,
1860            start,
1861            end,
1862            confidence: confidence.clamp(0.0, 1.0),
1863            normalized: None,
1864            provenance: Some(provenance),
1865            kb_id: None,
1866            canonical_id: None,
1867            hierarchical_confidence: None,
1868            visual_span: None,
1869            discontinuous_span: None,
1870            valid_from: None,
1871            valid_until: None,
1872            viewport: None,
1873        }
1874    }
1875
1876    /// Create an entity with hierarchical confidence scores.
1877    #[must_use]
1878    pub fn with_hierarchical_confidence(
1879        text: impl Into<String>,
1880        entity_type: EntityType,
1881        start: usize,
1882        end: usize,
1883        confidence: HierarchicalConfidence,
1884    ) -> Self {
1885        Self {
1886            text: text.into(),
1887            entity_type,
1888            start,
1889            end,
1890            confidence: confidence.as_f64(),
1891            normalized: None,
1892            provenance: None,
1893            kb_id: None,
1894            canonical_id: None,
1895            hierarchical_confidence: Some(confidence),
1896            visual_span: None,
1897            discontinuous_span: None,
1898            valid_from: None,
1899            valid_until: None,
1900            viewport: None,
1901        }
1902    }
1903
1904    /// Create an entity from a visual bounding box (ColPali multi-modal).
1905    #[must_use]
1906    pub fn from_visual(
1907        text: impl Into<String>,
1908        entity_type: EntityType,
1909        bbox: Span,
1910        confidence: f64,
1911    ) -> Self {
1912        Self {
1913            text: text.into(),
1914            entity_type,
1915            start: 0,
1916            end: 0,
1917            confidence: confidence.clamp(0.0, 1.0),
1918            normalized: None,
1919            provenance: None,
1920            kb_id: None,
1921            canonical_id: None,
1922            hierarchical_confidence: None,
1923            visual_span: Some(bbox),
1924            discontinuous_span: None,
1925            valid_from: None,
1926            valid_until: None,
1927            viewport: None,
1928        }
1929    }
1930
1931    /// Create an entity with default confidence (1.0).
1932    #[must_use]
1933    pub fn with_type(
1934        text: impl Into<String>,
1935        entity_type: EntityType,
1936        start: usize,
1937        end: usize,
1938    ) -> Self {
1939        Self::new(text, entity_type, start, end, 1.0)
1940    }
1941
1942    /// Link this entity to an external knowledge base.
1943    ///
1944    /// # Examples
1945    /// ```rust,ignore
1946    /// use anno_core::{Entity, EntityType};
1947    /// let mut e = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
1948    /// e.link_to_kb("Q7186"); // Wikidata ID
1949    /// ```
1950    pub fn link_to_kb(&mut self, kb_id: impl Into<String>) {
1951        self.kb_id = Some(kb_id.into());
1952    }
1953
1954    /// Assign this entity to a coreference cluster.
1955    ///
1956    /// Entities with the same `canonical_id` refer to the same real-world entity.
1957    pub fn set_canonical(&mut self, canonical_id: impl Into<super::types::CanonicalId>) {
1958        self.canonical_id = Some(canonical_id.into());
1959    }
1960
1961    /// Builder-style method to set canonical ID.
1962    ///
1963    /// # Example
1964    /// ```
1965    /// use anno_core::{CanonicalId, Entity, EntityType};
1966    /// let entity = Entity::new("John", EntityType::Person, 0, 4, 0.9)
1967    ///     .with_canonical_id(42);
1968    /// assert_eq!(entity.canonical_id, Some(CanonicalId::new(42)));
1969    /// ```
1970    #[must_use]
1971    pub fn with_canonical_id(mut self, canonical_id: impl Into<super::types::CanonicalId>) -> Self {
1972        self.canonical_id = Some(canonical_id.into());
1973        self
1974    }
1975
1976    /// Check if this entity is linked to a knowledge base.
1977    #[must_use]
1978    pub fn is_linked(&self) -> bool {
1979        self.kb_id.is_some()
1980    }
1981
1982    /// Check if this entity has coreference information.
1983    #[must_use]
1984    pub fn has_coreference(&self) -> bool {
1985        self.canonical_id.is_some()
1986    }
1987
1988    /// Check if this entity has a discontinuous span.
1989    ///
1990    /// Discontinuous entities span non-contiguous text regions.
1991    /// Example: "New York and LA airports" contains "New York airports"
1992    /// as a discontinuous entity.
1993    #[must_use]
1994    pub fn is_discontinuous(&self) -> bool {
1995        self.discontinuous_span
1996            .as_ref()
1997            .map(|s| s.is_discontinuous())
1998            .unwrap_or(false)
1999    }
2000
2001    /// Get the discontinuous segments if present.
2002    ///
2003    /// Returns `None` if this is a contiguous entity.
2004    #[must_use]
2005    pub fn discontinuous_segments(&self) -> Option<Vec<std::ops::Range<usize>>> {
2006        self.discontinuous_span
2007            .as_ref()
2008            .filter(|s| s.is_discontinuous())
2009            .map(|s| s.segments().to_vec())
2010    }
2011
2012    /// Set a discontinuous span for this entity.
2013    ///
2014    /// This is used by W2NER and similar models that detect non-contiguous mentions.
2015    pub fn set_discontinuous_span(&mut self, span: DiscontinuousSpan) {
2016        // Update start/end to match the bounding range
2017        if let Some(bounding) = span.bounding_range() {
2018            self.start = bounding.start;
2019            self.end = bounding.end;
2020        }
2021        self.discontinuous_span = Some(span);
2022    }
2023
2024    /// Get the total length covered by this entity, in **characters**.
2025    ///
2026    /// - **Contiguous**: `end - start`
2027    /// - **Discontinuous**: sum of segment lengths
2028    ///
2029    /// This is intentionally consistent: all offsets in `anno::core` entity spans
2030    /// are **character offsets** (Unicode scalar values), not byte offsets.
2031    #[must_use]
2032    pub fn total_len(&self) -> usize {
2033        if let Some(ref span) = self.discontinuous_span {
2034            span.segments().iter().map(|r| r.end - r.start).sum()
2035        } else {
2036            self.end.saturating_sub(self.start)
2037        }
2038    }
2039
2040    /// Set the normalized form for this entity.
2041    ///
2042    /// # Examples
2043    ///
2044    /// ```rust,ignore
2045    /// use anno_core::{Entity, EntityType};
2046    ///
2047    /// let mut entity = Entity::new("Jan 15", EntityType::Date, 0, 6, 0.95);
2048    /// entity.set_normalized("2024-01-15");
2049    /// assert_eq!(entity.normalized.as_deref(), Some("2024-01-15"));
2050    /// ```
2051    pub fn set_normalized(&mut self, normalized: impl Into<String>) {
2052        self.normalized = Some(normalized.into());
2053    }
2054
2055    /// Get the normalized form, or the original text if not normalized.
2056    #[must_use]
2057    pub fn normalized_or_text(&self) -> &str {
2058        self.normalized.as_deref().unwrap_or(&self.text)
2059    }
2060
2061    /// Get the extraction method, if known.
2062    #[must_use]
2063    pub fn method(&self) -> ExtractionMethod {
2064        self.provenance
2065            .as_ref()
2066            .map_or(ExtractionMethod::Unknown, |p| p.method)
2067    }
2068
2069    /// Get the source backend name, if known.
2070    #[must_use]
2071    pub fn source(&self) -> Option<&str> {
2072        self.provenance.as_ref().map(|p| p.source.as_ref())
2073    }
2074
2075    /// Get the entity category.
2076    #[must_use]
2077    pub fn category(&self) -> EntityCategory {
2078        self.entity_type.category()
2079    }
2080
2081    /// Returns true if this entity was detected via patterns (not ML).
2082    #[must_use]
2083    pub fn is_structured(&self) -> bool {
2084        self.entity_type.pattern_detectable()
2085    }
2086
2087    /// Returns true if this entity required ML for detection.
2088    #[must_use]
2089    pub fn is_named(&self) -> bool {
2090        self.entity_type.requires_ml()
2091    }
2092
2093    /// Check if this entity overlaps with another.
2094    #[must_use]
2095    pub fn overlaps(&self, other: &Entity) -> bool {
2096        !(self.end <= other.start || other.end <= self.start)
2097    }
2098
2099    /// Calculate overlap ratio (IoU) with another entity.
2100    #[must_use]
2101    pub fn overlap_ratio(&self, other: &Entity) -> f64 {
2102        let intersection_start = self.start.max(other.start);
2103        let intersection_end = self.end.min(other.end);
2104
2105        if intersection_start >= intersection_end {
2106            return 0.0;
2107        }
2108
2109        let intersection = (intersection_end - intersection_start) as f64;
2110        let union = ((self.end - self.start) + (other.end - other.start)
2111            - (intersection_end - intersection_start)) as f64;
2112
2113        if union == 0.0 {
2114            return 1.0;
2115        }
2116
2117        intersection / union
2118    }
2119
2120    /// Set hierarchical confidence scores.
2121    pub fn set_hierarchical_confidence(&mut self, confidence: HierarchicalConfidence) {
2122        self.confidence = confidence.as_f64();
2123        self.hierarchical_confidence = Some(confidence);
2124    }
2125
2126    /// Get the linkage confidence (coarse filter score).
2127    #[must_use]
2128    pub fn linkage_confidence(&self) -> f32 {
2129        self.hierarchical_confidence
2130            .map_or(self.confidence as f32, |h| h.linkage)
2131    }
2132
2133    /// Get the type classification confidence.
2134    #[must_use]
2135    pub fn type_confidence(&self) -> f32 {
2136        self.hierarchical_confidence
2137            .map_or(self.confidence as f32, |h| h.type_score)
2138    }
2139
2140    /// Get the boundary confidence.
2141    #[must_use]
2142    pub fn boundary_confidence(&self) -> f32 {
2143        self.hierarchical_confidence
2144            .map_or(self.confidence as f32, |h| h.boundary)
2145    }
2146
2147    /// Check if this entity has visual location (multi-modal).
2148    #[must_use]
2149    pub fn is_visual(&self) -> bool {
2150        self.visual_span.is_some()
2151    }
2152
2153    /// Get the text span (start, end).
2154    #[must_use]
2155    pub const fn text_span(&self) -> (usize, usize) {
2156        (self.start, self.end)
2157    }
2158
2159    /// Get the span length.
2160    #[must_use]
2161    pub const fn span_len(&self) -> usize {
2162        self.end.saturating_sub(self.start)
2163    }
2164
    /// Placeholder for creating a unified TextSpan with both byte and char offsets.
    ///
    /// This method is **not implemented** in this crate: calling it always
    /// panics. It is hidden from the generated docs (`#[doc(hidden)]`).
    ///
    /// # Arguments
    /// * `_source_text` - The original text (would be needed to compute byte
    ///   offsets); currently unused.
    ///
    /// # Panics
    ///
    /// Always panics via `unimplemented!`.
    ///
    /// # Note
    ///
    /// This method requires the offset conversion utilities from the `anno` crate.
    /// Use `anno::offset::char_to_byte_offsets()` directly for now.
    ///
    /// # Example
    /// ```rust,ignore
    /// use anno_core::{Entity, EntityType};
    ///
    /// let (byte_start, byte_end) = char_to_byte_offsets(text, entity.start, entity.end);
    /// ```
    #[allow(dead_code)]
    #[doc(hidden)]
    pub fn to_text_span(&self, _source_text: &str) -> serde_json::Value {
        unimplemented!("Use anno::offset utilities directly - see method docs")
    }
2193
2194    /// Set visual span for multi-modal extraction.
2195    pub fn set_visual_span(&mut self, span: Span) {
2196        self.visual_span = Some(span);
2197    }
2198
2199    /// Safely extract text from source using character offsets.
2200    ///
2201    /// Entity stores character offsets, not byte offsets. This method
2202    /// correctly extracts text by iterating over characters.
2203    ///
2204    /// # Arguments
2205    /// * `source_text` - The original text from which this entity was extracted
2206    ///
2207    /// # Returns
2208    /// The extracted text, or empty string if offsets are invalid
2209    ///
2210    /// # Example
2211    /// ```rust,ignore
2212    /// use anno_core::{Entity, EntityType};
2213    ///
2214    /// let text = "Hello, 日本!";
2215    /// let entity = Entity::new("日本", EntityType::Location, 7, 9, 0.95);
2216    /// assert_eq!(entity.extract_text(text), "日本");
2217    /// ```
2218    #[must_use]
2219    pub fn extract_text(&self, source_text: &str) -> String {
2220        // Performance: Use cached length if available, but fallback to counting
2221        // For single entity extraction, this is fine. For batch operations,
2222        // use extract_text_with_len with pre-computed length.
2223        let char_count = source_text.chars().count();
2224        self.extract_text_with_len(source_text, char_count)
2225    }
2226
2227    /// Extract text with pre-computed text length (performance optimization).
2228    ///
2229    /// Use this when validating/clamping multiple entities from the same text
2230    /// to avoid recalculating `text.chars().count()` for each entity.
2231    ///
2232    /// # Arguments
2233    /// * `source_text` - The original text
2234    /// * `text_char_count` - Pre-computed character count (from `text.chars().count()`)
2235    ///
2236    /// # Returns
2237    /// The extracted text, or empty string if offsets are invalid
2238    #[must_use]
2239    pub fn extract_text_with_len(&self, source_text: &str, text_char_count: usize) -> String {
2240        if self.start >= text_char_count || self.end > text_char_count || self.start >= self.end {
2241            return String::new();
2242        }
2243        source_text
2244            .chars()
2245            .skip(self.start)
2246            .take(self.end - self.start)
2247            .collect()
2248    }
2249
2250    // =========================================================================
2251    // Temporal Validity Methods
2252    // =========================================================================
2253
2254    /// Set the temporal validity start for this entity assertion.
2255    ///
2256    /// # Example
2257    /// ```rust,ignore
2258    /// use anno_core::{Entity, EntityType};
2259    /// use chrono::{TimeZone, Utc};
2260    ///
2261    /// let mut entity = Entity::new("CEO", EntityType::Person, 0, 3, 0.9);
2262    /// entity.set_valid_from(Utc.with_ymd_and_hms(2008, 10, 1, 0, 0, 0).unwrap());
2263    /// assert!(entity.is_temporal());
2264    /// ```
2265    pub fn set_valid_from(&mut self, dt: chrono::DateTime<chrono::Utc>) {
2266        self.valid_from = Some(dt);
2267    }
2268
2269    /// Set the temporal validity end for this entity assertion.
2270    pub fn set_valid_until(&mut self, dt: chrono::DateTime<chrono::Utc>) {
2271        self.valid_until = Some(dt);
2272    }
2273
2274    /// Set both temporal bounds at once.
2275    pub fn set_temporal_range(
2276        &mut self,
2277        from: chrono::DateTime<chrono::Utc>,
2278        until: chrono::DateTime<chrono::Utc>,
2279    ) {
2280        self.valid_from = Some(from);
2281        self.valid_until = Some(until);
2282    }
2283
2284    /// Check if this entity has temporal validity information.
2285    #[must_use]
2286    pub fn is_temporal(&self) -> bool {
2287        self.valid_from.is_some() || self.valid_until.is_some()
2288    }
2289
2290    /// Check if this entity was valid at a specific point in time.
2291    ///
2292    /// Returns `true` if:
2293    /// - No temporal bounds are set (atemporal entity)
2294    /// - The timestamp falls within [valid_from, valid_until]
2295    ///
2296    /// # Example
2297    /// ```rust,ignore
2298    /// use anno_core::{Entity, EntityType};
2299    /// use chrono::{TimeZone, Utc};
2300    ///
2301    /// let mut entity = Entity::new("CEO of Microsoft", EntityType::Person, 0, 16, 0.9);
2302    /// entity.set_valid_from(Utc.with_ymd_and_hms(2008, 1, 1, 0, 0, 0).unwrap());
2303    /// entity.set_valid_until(Utc.with_ymd_and_hms(2023, 12, 31, 0, 0, 0).unwrap());
2304    ///
2305    /// let query_2015 = Utc.with_ymd_and_hms(2015, 6, 1, 0, 0, 0).unwrap();
2306    /// let query_2005 = Utc.with_ymd_and_hms(2005, 6, 1, 0, 0, 0).unwrap();
2307    ///
2308    /// assert!(entity.valid_at(&query_2015));
2309    /// assert!(!entity.valid_at(&query_2005));
2310    /// ```
2311    #[must_use]
2312    pub fn valid_at(&self, timestamp: &chrono::DateTime<chrono::Utc>) -> bool {
2313        match (&self.valid_from, &self.valid_until) {
2314            (None, None) => true,                      // Atemporal - always valid
2315            (Some(from), None) => timestamp >= from,   // Started, still valid
2316            (None, Some(until)) => timestamp <= until, // Unknown start, ended
2317            (Some(from), Some(until)) => timestamp >= from && timestamp <= until,
2318        }
2319    }
2320
2321    /// Check if this entity is currently valid (at the current time).
2322    #[must_use]
2323    pub fn is_currently_valid(&self) -> bool {
2324        self.valid_at(&chrono::Utc::now())
2325    }
2326
2327    // =========================================================================
2328    // Viewport/Context Methods
2329    // =========================================================================
2330
2331    /// Set the viewport context for this entity.
2332    ///
2333    /// # Example
2334    /// ```rust,ignore
2335    /// use anno_core::{Entity, EntityType, EntityViewport};
2336    ///
2337    /// let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
2338    /// entity.set_viewport(EntityViewport::Academic);
2339    /// assert!(entity.has_viewport());
2340    /// ```
2341    pub fn set_viewport(&mut self, viewport: EntityViewport) {
2342        self.viewport = Some(viewport);
2343    }
2344
2345    /// Check if this entity has a viewport context.
2346    #[must_use]
2347    pub fn has_viewport(&self) -> bool {
2348        self.viewport.is_some()
2349    }
2350
2351    /// Get the viewport, defaulting to General if not set.
2352    #[must_use]
2353    pub fn viewport_or_default(&self) -> EntityViewport {
2354        self.viewport.clone().unwrap_or_default()
2355    }
2356
2357    /// Check if this entity matches a viewport context.
2358    ///
2359    /// Returns true if:
2360    /// - The entity has no viewport (matches any)
2361    /// - The entity's viewport matches the query
2362    #[must_use]
2363    pub fn matches_viewport(&self, query_viewport: &EntityViewport) -> bool {
2364        match &self.viewport {
2365            None => true, // No viewport = matches any
2366            Some(v) => v == query_viewport,
2367        }
2368    }
2369
2370    /// Create a builder for fluent entity construction.
2371    #[must_use]
2372    pub fn builder(text: impl Into<String>, entity_type: EntityType) -> EntityBuilder {
2373        EntityBuilder::new(text, entity_type)
2374    }
2375
2376    // =========================================================================
2377    // Validation Methods (Production Quality)
2378    // =========================================================================
2379
2380    /// Validate this entity against the source text.
2381    ///
2382    /// Returns a list of validation issues. Empty list means the entity is valid.
2383    ///
2384    /// # Checks Performed
2385    ///
2386    /// 1. **Span bounds**: `start < end`, both within text length
2387    /// 2. **Text match**: `text` matches the span in source
2388    /// 3. **Confidence range**: `confidence` in [0.0, 1.0]
2389    /// 4. **Type consistency**: Custom types have non-empty names
2390    /// 5. **Discontinuous consistency**: If present, segments are valid
2391    ///
2392    /// # Example
2393    ///
2394    /// ```rust,ignore
2395    /// use anno_core::{Entity, EntityType};
2396    ///
2397    /// let text = "John works at Apple";
2398    /// let entity = Entity::new("John", EntityType::Person, 0, 4, 0.95);
2399    ///
2400    /// let issues = entity.validate(text);
2401    /// assert!(issues.is_empty(), "Entity should be valid");
2402    ///
2403    /// // Invalid entity: span doesn't match text
2404    /// let bad = Entity::new("Jane", EntityType::Person, 0, 4, 0.95);
2405    /// let issues = bad.validate(text);
2406    /// assert!(!issues.is_empty(), "Entity text doesn't match span");
2407    /// ```
2408    #[must_use]
2409    pub fn validate(&self, source_text: &str) -> Vec<ValidationIssue> {
2410        // Performance: Calculate length once, delegate to optimized version
2411        let char_count = source_text.chars().count();
2412        self.validate_with_len(source_text, char_count)
2413    }
2414
2415    /// Validate entity with pre-computed text length (performance optimization).
2416    ///
2417    /// Use this when validating multiple entities from the same text to avoid
2418    /// recalculating `text.chars().count()` for each entity.
2419    ///
2420    /// # Arguments
2421    /// * `source_text` - The original text
2422    /// * `text_char_count` - Pre-computed character count (from `text.chars().count()`)
2423    ///
2424    /// # Returns
2425    /// Vector of validation issues (empty if valid)
2426    #[must_use]
2427    pub fn validate_with_len(
2428        &self,
2429        source_text: &str,
2430        text_char_count: usize,
2431    ) -> Vec<ValidationIssue> {
2432        let mut issues = Vec::new();
2433
2434        // 1. Span bounds
2435        if self.start >= self.end {
2436            issues.push(ValidationIssue::InvalidSpan {
2437                start: self.start,
2438                end: self.end,
2439                reason: "start must be less than end".to_string(),
2440            });
2441        }
2442
2443        if self.end > text_char_count {
2444            issues.push(ValidationIssue::SpanOutOfBounds {
2445                end: self.end,
2446                text_len: text_char_count,
2447            });
2448        }
2449
2450        // 2. Text match (only if span is valid)
2451        if self.start < self.end && self.end <= text_char_count {
2452            let actual = self.extract_text_with_len(source_text, text_char_count);
2453            if actual != self.text {
2454                issues.push(ValidationIssue::TextMismatch {
2455                    expected: self.text.clone(),
2456                    actual,
2457                    start: self.start,
2458                    end: self.end,
2459                });
2460            }
2461        }
2462
2463        // 3. Confidence range
2464        if !(0.0..=1.0).contains(&self.confidence) {
2465            issues.push(ValidationIssue::InvalidConfidence {
2466                value: self.confidence,
2467            });
2468        }
2469
2470        // 4. Type consistency
2471        if let EntityType::Custom { ref name, .. } = self.entity_type {
2472            if name.is_empty() {
2473                issues.push(ValidationIssue::InvalidType {
2474                    reason: "Custom entity type has empty name".to_string(),
2475                });
2476            }
2477        }
2478
2479        // 5. Discontinuous span consistency
2480        if let Some(ref disc_span) = self.discontinuous_span {
2481            for (i, seg) in disc_span.segments().iter().enumerate() {
2482                if seg.start >= seg.end {
2483                    issues.push(ValidationIssue::InvalidSpan {
2484                        start: seg.start,
2485                        end: seg.end,
2486                        reason: format!("discontinuous segment {} is invalid", i),
2487                    });
2488                }
2489                if seg.end > text_char_count {
2490                    issues.push(ValidationIssue::SpanOutOfBounds {
2491                        end: seg.end,
2492                        text_len: text_char_count,
2493                    });
2494                }
2495            }
2496        }
2497
2498        issues
2499    }
2500
2501    /// Check if this entity is valid against the source text.
2502    ///
2503    /// Convenience method that returns `true` if `validate()` returns empty.
2504    #[must_use]
2505    pub fn is_valid(&self, source_text: &str) -> bool {
2506        self.validate(source_text).is_empty()
2507    }
2508
2509    /// Validate a batch of entities efficiently.
2510    ///
2511    /// Returns a map of entity index -> validation issues.
2512    /// Only entities with issues are included.
2513    ///
2514    /// # Example
2515    ///
2516    /// ```rust,ignore
2517    /// use anno_core::{Entity, EntityType};
2518    ///
2519    /// let text = "John and Jane work at Apple";
2520    /// let entities = vec![
2521    ///     Entity::new("John", EntityType::Person, 0, 4, 0.95),
2522    ///     Entity::new("Wrong", EntityType::Person, 9, 13, 0.8),
2523    /// ];
2524    ///
2525    /// let issues = Entity::validate_batch(&entities, text);
2526    /// assert!(issues.is_empty() || issues.contains_key(&1)); // Second entity might fail
2527    /// ```
2528    #[must_use]
2529    pub fn validate_batch(
2530        entities: &[Entity],
2531        source_text: &str,
2532    ) -> std::collections::HashMap<usize, Vec<ValidationIssue>> {
2533        entities
2534            .iter()
2535            .enumerate()
2536            .filter_map(|(idx, entity)| {
2537                let issues = entity.validate(source_text);
2538                if issues.is_empty() {
2539                    None
2540                } else {
2541                    Some((idx, issues))
2542                }
2543            })
2544            .collect()
2545    }
2546}
2547
/// A single problem found while validating an entity against its source text.
///
/// Validation collects every issue it finds rather than stopping at the first
/// one, so one entity may yield several values of this type. The `Display`
/// implementation renders each issue as a one-line, human-readable message.
#[derive(Debug, Clone, PartialEq)]
pub enum ValidationIssue {
    /// Span bounds are invalid (start >= end).
    ///
    /// Reported for the entity's main span and for individual segments of a
    /// discontinuous span.
    InvalidSpan {
        /// Start position of the invalid span.
        start: usize,
        /// End position of the invalid span.
        end: usize,
        /// Description of why the span is invalid.
        reason: String,
    },
    /// Span extends beyond text length (end > text length).
    SpanOutOfBounds {
        /// End position that exceeds the text.
        end: usize,
        /// Actual length of the text, as supplied to the validator.
        text_len: usize,
    },
    /// Entity text doesn't match the span in source.
    TextMismatch {
        /// Text stored in the entity.
        expected: String,
        /// Text found at the span in source.
        actual: String,
        /// Start position of the span.
        start: usize,
        /// End position of the span.
        end: usize,
    },
    /// Confidence is outside [0.0, 1.0].
    InvalidConfidence {
        /// The invalid confidence value.
        value: f64,
    },
    /// Entity type is invalid (for example, a custom type with an empty name).
    InvalidType {
        /// Description of why the type is invalid.
        reason: String,
    },
}
2589
2590impl std::fmt::Display for ValidationIssue {
2591    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2592        match self {
2593            ValidationIssue::InvalidSpan { start, end, reason } => {
2594                write!(f, "Invalid span [{}, {}): {}", start, end, reason)
2595            }
2596            ValidationIssue::SpanOutOfBounds { end, text_len } => {
2597                write!(f, "Span end {} exceeds text length {}", end, text_len)
2598            }
2599            ValidationIssue::TextMismatch {
2600                expected,
2601                actual,
2602                start,
2603                end,
2604            } => {
2605                write!(
2606                    f,
2607                    "Text mismatch at [{}, {}): expected '{}', got '{}'",
2608                    start, end, expected, actual
2609                )
2610            }
2611            ValidationIssue::InvalidConfidence { value } => {
2612                write!(f, "Confidence {} outside [0.0, 1.0]", value)
2613            }
2614            ValidationIssue::InvalidType { reason } => {
2615                write!(f, "Invalid entity type: {}", reason)
2616            }
2617        }
2618    }
2619}
2620
/// Fluent builder for constructing entities with optional fields.
///
/// Start from [`EntityBuilder::new`], chain the setters for the fields you
/// need, then call `build()` to obtain the [`Entity`]. Fields that are never
/// set keep the defaults assigned by `new`.
///
/// # Example
///
/// ```rust,ignore
/// use anno_core::{Entity, EntityType, Provenance};
///
/// let entity = Entity::builder("Marie Curie", EntityType::Person)
///     .span(0, 11)
///     .confidence(0.95)
///     .kb_id("Q7186")
///     .provenance(Provenance::ml("bert", 0.95))
///     .build();
/// ```
#[derive(Debug, Clone)]
pub struct EntityBuilder {
    /// Surface text of the entity.
    text: String,
    /// Type label of the entity.
    entity_type: EntityType,
    /// Span start offset (defaults to 0).
    start: usize,
    /// Span end offset (defaults to 0).
    end: usize,
    /// Overall confidence score (defaults to 1.0).
    confidence: f64,
    /// Normalized form of the text, if any.
    normalized: Option<String>,
    /// Provenance information, if any.
    provenance: Option<Provenance>,
    /// External knowledge base identifier, if linked.
    kb_id: Option<String>,
    /// Canonical (coreference) identifier, if assigned.
    canonical_id: Option<super::types::CanonicalId>,
    /// Fine-grained confidence breakdown, if provided.
    hierarchical_confidence: Option<HierarchicalConfidence>,
    /// Visual span, if any.
    visual_span: Option<Span>,
    /// Discontinuous span for non-contiguous entities, if any.
    discontinuous_span: Option<DiscontinuousSpan>,
    /// When this entity assertion became true, if known.
    valid_from: Option<chrono::DateTime<chrono::Utc>>,
    /// When this entity assertion stopped being true, if known.
    valid_until: Option<chrono::DateTime<chrono::Utc>>,
    /// Viewport context, if any.
    viewport: Option<EntityViewport>,
}
2653
2654impl EntityBuilder {
2655    /// Create a new builder.
2656    #[must_use]
2657    pub fn new(text: impl Into<String>, entity_type: EntityType) -> Self {
2658        Self {
2659            text: text.into(),
2660            entity_type,
2661            start: 0,
2662            end: 0,
2663            confidence: 1.0,
2664            normalized: None,
2665            provenance: None,
2666            kb_id: None,
2667            canonical_id: None,
2668            hierarchical_confidence: None,
2669            visual_span: None,
2670            discontinuous_span: None,
2671            valid_from: None,
2672            valid_until: None,
2673            viewport: None,
2674        }
2675    }
2676
2677    /// Set span offsets.
2678    #[must_use]
2679    pub const fn span(mut self, start: usize, end: usize) -> Self {
2680        self.start = start;
2681        self.end = end;
2682        self
2683    }
2684
2685    /// Set confidence score.
2686    #[must_use]
2687    pub fn confidence(mut self, confidence: f64) -> Self {
2688        self.confidence = confidence.clamp(0.0, 1.0);
2689        self
2690    }
2691
2692    /// Set hierarchical confidence.
2693    #[must_use]
2694    pub fn hierarchical_confidence(mut self, confidence: HierarchicalConfidence) -> Self {
2695        self.confidence = confidence.as_f64();
2696        self.hierarchical_confidence = Some(confidence);
2697        self
2698    }
2699
2700    /// Set normalized form.
2701    #[must_use]
2702    pub fn normalized(mut self, normalized: impl Into<String>) -> Self {
2703        self.normalized = Some(normalized.into());
2704        self
2705    }
2706
2707    /// Set provenance.
2708    #[must_use]
2709    pub fn provenance(mut self, provenance: Provenance) -> Self {
2710        self.provenance = Some(provenance);
2711        self
2712    }
2713
2714    /// Set knowledge base ID.
2715    #[must_use]
2716    pub fn kb_id(mut self, kb_id: impl Into<String>) -> Self {
2717        self.kb_id = Some(kb_id.into());
2718        self
2719    }
2720
2721    /// Set canonical (coreference) ID.
2722    #[must_use]
2723    pub const fn canonical_id(mut self, canonical_id: u64) -> Self {
2724        self.canonical_id = Some(super::types::CanonicalId::new(canonical_id));
2725        self
2726    }
2727
2728    /// Set visual span.
2729    #[must_use]
2730    pub fn visual_span(mut self, span: Span) -> Self {
2731        self.visual_span = Some(span);
2732        self
2733    }
2734
2735    /// Set discontinuous span for non-contiguous entities.
2736    ///
2737    /// This automatically updates `start` and `end` to the bounding range.
2738    #[must_use]
2739    pub fn discontinuous_span(mut self, span: DiscontinuousSpan) -> Self {
2740        // Update start/end to bounding range
2741        if let Some(bounding) = span.bounding_range() {
2742            self.start = bounding.start;
2743            self.end = bounding.end;
2744        }
2745        self.discontinuous_span = Some(span);
2746        self
2747    }
2748
2749    /// Set temporal validity start (when this entity assertion became true).
2750    ///
2751    /// # Example
2752    /// ```rust,ignore
2753    /// use anno_core::{EntityBuilder, EntityType};
2754    /// use chrono::{TimeZone, Utc};
2755    ///
2756    /// let entity = EntityBuilder::new("CEO of Microsoft", EntityType::Person)
2757    ///     .span(0, 12)
2758    ///     .valid_from(Utc.with_ymd_and_hms(2008, 10, 1, 0, 0, 0).unwrap())
2759    ///     .build();
2760    /// assert!(entity.valid_from.is_some());
2761    /// ```
2762    #[must_use]
2763    pub fn valid_from(mut self, dt: chrono::DateTime<chrono::Utc>) -> Self {
2764        self.valid_from = Some(dt);
2765        self
2766    }
2767
2768    /// Set temporal validity end (when this entity assertion stopped being true).
2769    #[must_use]
2770    pub fn valid_until(mut self, dt: chrono::DateTime<chrono::Utc>) -> Self {
2771        self.valid_until = Some(dt);
2772        self
2773    }
2774
2775    /// Set temporal validity range (convenience method).
2776    #[must_use]
2777    pub fn temporal_range(
2778        mut self,
2779        from: chrono::DateTime<chrono::Utc>,
2780        until: chrono::DateTime<chrono::Utc>,
2781    ) -> Self {
2782        self.valid_from = Some(from);
2783        self.valid_until = Some(until);
2784        self
2785    }
2786
2787    /// Set the viewport context for multi-faceted entity representation.
2788    ///
2789    /// # Example
2790    /// ```rust,ignore
2791    /// use anno_core::{EntityBuilder, EntityType, EntityViewport};
2792    ///
2793    /// let entity = EntityBuilder::new("Marie Curie", EntityType::Person)
2794    ///     .span(0, 11)
2795    ///     .viewport(EntityViewport::Academic)
2796    ///     .build();
2797    /// assert_eq!(entity.viewport, Some(EntityViewport::Academic));
2798    /// ```
2799    #[must_use]
2800    pub fn viewport(mut self, viewport: EntityViewport) -> Self {
2801        self.viewport = Some(viewport);
2802        self
2803    }
2804
2805    /// Build the entity.
2806    #[must_use]
2807    pub fn build(self) -> Entity {
2808        Entity {
2809            text: self.text,
2810            entity_type: self.entity_type,
2811            start: self.start,
2812            end: self.end,
2813            confidence: self.confidence,
2814            normalized: self.normalized,
2815            provenance: self.provenance,
2816            kb_id: self.kb_id,
2817            canonical_id: self.canonical_id,
2818            hierarchical_confidence: self.hierarchical_confidence,
2819            visual_span: self.visual_span,
2820            discontinuous_span: self.discontinuous_span,
2821            valid_from: self.valid_from,
2822            valid_until: self.valid_until,
2823            viewport: self.viewport,
2824        }
2825    }
2826}
2827
2828// ============================================================================
2829// Relation (for Knowledge Graph Construction)
2830// ============================================================================
2831
/// A relation between two entities, forming a knowledge graph triple.
///
/// In the GLiNER bi-encoder paradigm, relations are detected just like entities:
/// the relation trigger text ("CEO of", "located in") is matched against
/// relation type labels in the same latent space.
///
/// # Structure
///
/// ```text
/// Triple: (Head, Relation, Tail)
///
/// "Marie Curie worked at the Sorbonne"
///  ^^^^^^^^^^^ ~~~~~~~~~     ^^^^^^^^
///  Head        Rel           Tail
///  (Person)    (Employment)  (Organization)
/// ```
///
/// # TPLinker/Joint Extraction
///
/// For joint extraction, relations are extracted in a single pass with entities.
/// The `trigger_span` captures the text that indicates the relation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Relation {
    /// The source entity (head of the triple).
    pub head: Entity,
    /// The target entity (tail of the triple).
    pub tail: Entity,
    /// Relation type label (e.g., "EMPLOYMENT", "LOCATED_IN", "FOUNDED_BY").
    pub relation_type: String,
    /// Optional trigger span as `(start, end)`: the location of the text that
    /// indicates this relation. For "CEO of", this would be the span covering
    /// "CEO of". `None` when no trigger was recorded.
    pub trigger_span: Option<(usize, usize)>,
    /// Confidence score for this relation (0.0-1.0).
    ///
    /// The constructors clamp their input into this range; the field itself
    /// is public and is not re-checked on direct assignment.
    pub confidence: f64,
}
2867
2868impl Relation {
2869    /// Create a new relation between two entities.
2870    #[must_use]
2871    pub fn new(
2872        head: Entity,
2873        tail: Entity,
2874        relation_type: impl Into<String>,
2875        confidence: f64,
2876    ) -> Self {
2877        Self {
2878            head,
2879            tail,
2880            relation_type: relation_type.into(),
2881            trigger_span: None,
2882            confidence: confidence.clamp(0.0, 1.0),
2883        }
2884    }
2885
2886    /// Create a relation with an explicit trigger span.
2887    #[must_use]
2888    pub fn with_trigger(
2889        head: Entity,
2890        tail: Entity,
2891        relation_type: impl Into<String>,
2892        trigger_start: usize,
2893        trigger_end: usize,
2894        confidence: f64,
2895    ) -> Self {
2896        Self {
2897            head,
2898            tail,
2899            relation_type: relation_type.into(),
2900            trigger_span: Some((trigger_start, trigger_end)),
2901            confidence: confidence.clamp(0.0, 1.0),
2902        }
2903    }
2904
2905    /// Convert to a triple string representation (for debugging/display).
2906    #[must_use]
2907    pub fn as_triple(&self) -> String {
2908        format!(
2909            "({}, {}, {})",
2910            self.head.text, self.relation_type, self.tail.text
2911        )
2912    }
2913
2914    /// Check if the head and tail entities are adjacent (within n tokens).
2915    /// Useful for filtering spurious long-distance relations.
2916    #[must_use]
2917    pub fn span_distance(&self) -> usize {
2918        if self.head.end <= self.tail.start {
2919            self.tail.start.saturating_sub(self.head.end)
2920        } else if self.tail.end <= self.head.start {
2921            self.head.start.saturating_sub(self.tail.end)
2922        } else {
2923            0 // Overlapping spans
2924        }
2925    }
2926}
2927
2928#[cfg(test)]
2929mod tests {
2930    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in test code
2931    use super::*;
2932
2933    #[test]
2934    fn test_entity_type_roundtrip() {
2935        let types = [
2936            EntityType::Person,
2937            EntityType::Organization,
2938            EntityType::Location,
2939            EntityType::Date,
2940            EntityType::Money,
2941            EntityType::Percent,
2942        ];
2943
2944        for t in types {
2945            let label = t.as_label();
2946            let parsed = EntityType::from_label(label);
2947            assert_eq!(t, parsed);
2948        }
2949    }
2950
2951    #[test]
2952    fn test_entity_overlap() {
2953        let e1 = Entity::new("John", EntityType::Person, 0, 4, 0.9);
2954        let e2 = Entity::new("Smith", EntityType::Person, 5, 10, 0.9);
2955        let e3 = Entity::new("John Smith", EntityType::Person, 0, 10, 0.9);
2956
2957        assert!(!e1.overlaps(&e2)); // No overlap
2958        assert!(e1.overlaps(&e3)); // e1 is contained in e3
2959        assert!(e3.overlaps(&e2)); // e3 contains e2
2960    }
2961
2962    #[test]
2963    fn test_confidence_clamping() {
2964        let e1 = Entity::new("test", EntityType::Person, 0, 4, 1.5);
2965        assert!((e1.confidence - 1.0).abs() < f64::EPSILON);
2966
2967        let e2 = Entity::new("test", EntityType::Person, 0, 4, -0.5);
2968        assert!(e2.confidence.abs() < f64::EPSILON);
2969    }
2970
2971    #[test]
2972    fn test_entity_categories() {
2973        // Agent/Org/Place entities require ML
2974        assert_eq!(EntityType::Person.category(), EntityCategory::Agent);
2975        assert_eq!(
2976            EntityType::Organization.category(),
2977            EntityCategory::Organization
2978        );
2979        assert_eq!(EntityType::Location.category(), EntityCategory::Place);
2980        assert!(EntityType::Person.requires_ml());
2981        assert!(!EntityType::Person.pattern_detectable());
2982
2983        // Temporal entities are pattern-detectable
2984        assert_eq!(EntityType::Date.category(), EntityCategory::Temporal);
2985        assert_eq!(EntityType::Time.category(), EntityCategory::Temporal);
2986        assert!(EntityType::Date.pattern_detectable());
2987        assert!(!EntityType::Date.requires_ml());
2988
2989        // Numeric entities are pattern-detectable
2990        assert_eq!(EntityType::Money.category(), EntityCategory::Numeric);
2991        assert_eq!(EntityType::Percent.category(), EntityCategory::Numeric);
2992        assert!(EntityType::Money.pattern_detectable());
2993
2994        // Contact entities are pattern-detectable
2995        assert_eq!(EntityType::Email.category(), EntityCategory::Contact);
2996        assert_eq!(EntityType::Url.category(), EntityCategory::Contact);
2997        assert_eq!(EntityType::Phone.category(), EntityCategory::Contact);
2998        assert!(EntityType::Email.pattern_detectable());
2999    }
3000
3001    #[test]
3002    fn test_new_types_roundtrip() {
3003        let types = [
3004            EntityType::Time,
3005            EntityType::Email,
3006            EntityType::Url,
3007            EntityType::Phone,
3008            EntityType::Quantity,
3009            EntityType::Cardinal,
3010            EntityType::Ordinal,
3011        ];
3012
3013        for t in types {
3014            let label = t.as_label();
3015            let parsed = EntityType::from_label(label);
3016            assert_eq!(t, parsed, "Roundtrip failed for {}", label);
3017        }
3018    }
3019
3020    #[test]
3021    fn test_custom_entity_type() {
3022        let disease = EntityType::custom("DISEASE", EntityCategory::Agent);
3023        assert_eq!(disease.as_label(), "DISEASE");
3024        assert!(disease.requires_ml());
3025
3026        let product_id = EntityType::custom("PRODUCT_ID", EntityCategory::Misc);
3027        assert_eq!(product_id.as_label(), "PRODUCT_ID");
3028        assert!(!product_id.requires_ml());
3029        assert!(!product_id.pattern_detectable());
3030    }
3031
3032    #[test]
3033    fn test_entity_normalization() {
3034        let mut e = Entity::new("Jan 15", EntityType::Date, 0, 6, 0.95);
3035        assert!(e.normalized.is_none());
3036        assert_eq!(e.normalized_or_text(), "Jan 15");
3037
3038        e.set_normalized("2024-01-15");
3039        assert_eq!(e.normalized.as_deref(), Some("2024-01-15"));
3040        assert_eq!(e.normalized_or_text(), "2024-01-15");
3041    }
3042
3043    #[test]
3044    fn test_entity_helpers() {
3045        let named = Entity::new("John", EntityType::Person, 0, 4, 0.9);
3046        assert!(named.is_named());
3047        assert!(!named.is_structured());
3048        assert_eq!(named.category(), EntityCategory::Agent);
3049
3050        let structured = Entity::new("$100", EntityType::Money, 0, 4, 0.95);
3051        assert!(!structured.is_named());
3052        assert!(structured.is_structured());
3053        assert_eq!(structured.category(), EntityCategory::Numeric);
3054    }
3055
3056    #[test]
3057    fn test_knowledge_linking() {
3058        let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
3059        assert!(!entity.is_linked());
3060        assert!(!entity.has_coreference());
3061
3062        entity.link_to_kb("Q7186"); // Wikidata ID
3063        assert!(entity.is_linked());
3064        assert_eq!(entity.kb_id.as_deref(), Some("Q7186"));
3065
3066        entity.set_canonical(42);
3067        assert!(entity.has_coreference());
3068        assert_eq!(
3069            entity.canonical_id,
3070            Some(crate::core::types::CanonicalId::new(42))
3071        );
3072    }
3073
3074    #[test]
3075    fn test_relation_creation() {
3076        let head = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
3077        let tail = Entity::new("Sorbonne", EntityType::Organization, 24, 32, 0.90);
3078
3079        let relation = Relation::new(head.clone(), tail.clone(), "WORKED_AT", 0.85);
3080        assert_eq!(relation.relation_type, "WORKED_AT");
3081        assert_eq!(relation.as_triple(), "(Marie Curie, WORKED_AT, Sorbonne)");
3082        assert!(relation.trigger_span.is_none());
3083
3084        // With trigger span
3085        let relation2 = Relation::with_trigger(head, tail, "EMPLOYMENT", 13, 19, 0.85);
3086        assert_eq!(relation2.trigger_span, Some((13, 19)));
3087    }
3088
3089    #[test]
3090    fn test_relation_span_distance() {
3091        // Head at 0-11, tail at 24-32 -> distance is 24-11 = 13
3092        let head = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
3093        let tail = Entity::new("Sorbonne", EntityType::Organization, 24, 32, 0.90);
3094        let relation = Relation::new(head, tail, "WORKED_AT", 0.85);
3095        assert_eq!(relation.span_distance(), 13);
3096    }
3097
3098    #[test]
3099    fn test_relation_category() {
3100        // Relation types should be categorized as Relation
3101        let rel_type = EntityType::custom("CEO_OF", EntityCategory::Relation);
3102        assert_eq!(rel_type.category(), EntityCategory::Relation);
3103        assert!(rel_type.category().is_relation());
3104        assert!(rel_type.requires_ml()); // Relations require ML
3105    }
3106
3107    // ========================================================================
3108    // Span Tests
3109    // ========================================================================
3110
3111    #[test]
3112    fn test_span_text() {
3113        let span = Span::text(10, 20);
3114        assert!(span.is_text());
3115        assert!(!span.is_visual());
3116        assert_eq!(span.text_offsets(), Some((10, 20)));
3117        assert_eq!(span.len(), 10);
3118        assert!(!span.is_empty());
3119    }
3120
3121    #[test]
3122    fn test_span_bbox() {
3123        let span = Span::bbox(0.1, 0.2, 0.3, 0.4);
3124        assert!(!span.is_text());
3125        assert!(span.is_visual());
3126        assert_eq!(span.text_offsets(), None);
3127        assert_eq!(span.len(), 0); // No text length
3128    }
3129
3130    #[test]
3131    fn test_span_bbox_with_page() {
3132        let span = Span::bbox_on_page(0.1, 0.2, 0.3, 0.4, 5);
3133        if let Span::BoundingBox { page, .. } = span {
3134            assert_eq!(page, Some(5));
3135        } else {
3136            panic!("Expected BoundingBox");
3137        }
3138    }
3139
3140    #[test]
3141    fn test_span_hybrid() {
3142        let bbox = Span::bbox(0.1, 0.2, 0.3, 0.4);
3143        let hybrid = Span::Hybrid {
3144            start: 10,
3145            end: 20,
3146            bbox: Box::new(bbox),
3147        };
3148        assert!(hybrid.is_text());
3149        assert!(hybrid.is_visual());
3150        assert_eq!(hybrid.text_offsets(), Some((10, 20)));
3151        assert_eq!(hybrid.len(), 10);
3152    }
3153
3154    // ========================================================================
3155    // Hierarchical Confidence Tests
3156    // ========================================================================
3157
3158    #[test]
3159    fn test_hierarchical_confidence_new() {
3160        let hc = HierarchicalConfidence::new(0.9, 0.8, 0.7);
3161        assert!((hc.linkage - 0.9).abs() < f32::EPSILON);
3162        assert!((hc.type_score - 0.8).abs() < f32::EPSILON);
3163        assert!((hc.boundary - 0.7).abs() < f32::EPSILON);
3164    }
3165
3166    #[test]
3167    fn test_hierarchical_confidence_clamping() {
3168        let hc = HierarchicalConfidence::new(1.5, -0.5, 0.5);
3169        assert!((hc.linkage - 1.0).abs() < f32::EPSILON);
3170        assert!(hc.type_score.abs() < f32::EPSILON);
3171        assert!((hc.boundary - 0.5).abs() < f32::EPSILON);
3172    }
3173
3174    #[test]
3175    fn test_hierarchical_confidence_from_single() {
3176        let hc = HierarchicalConfidence::from_single(0.8);
3177        assert!((hc.linkage - 0.8).abs() < f32::EPSILON);
3178        assert!((hc.type_score - 0.8).abs() < f32::EPSILON);
3179        assert!((hc.boundary - 0.8).abs() < f32::EPSILON);
3180    }
3181
3182    #[test]
3183    fn test_hierarchical_confidence_combined() {
3184        let hc = HierarchicalConfidence::new(1.0, 1.0, 1.0);
3185        assert!((hc.combined() - 1.0).abs() < f32::EPSILON);
3186
3187        let hc2 = HierarchicalConfidence::new(0.8, 0.8, 0.8);
3188        assert!((hc2.combined() - 0.8).abs() < f32::EPSILON);
3189
3190        // Geometric mean: (0.5 * 0.5 * 0.5)^(1/3) = 0.5
3191        let hc3 = HierarchicalConfidence::new(0.5, 0.5, 0.5);
3192        assert!((hc3.combined() - 0.5).abs() < 0.001);
3193    }
3194
3195    #[test]
3196    fn test_hierarchical_confidence_threshold() {
3197        let hc = HierarchicalConfidence::new(0.9, 0.8, 0.7);
3198        assert!(hc.passes_threshold(0.5, 0.5, 0.5));
3199        assert!(hc.passes_threshold(0.9, 0.8, 0.7));
3200        assert!(!hc.passes_threshold(0.95, 0.8, 0.7)); // linkage too high
3201        assert!(!hc.passes_threshold(0.9, 0.85, 0.7)); // type too high
3202    }
3203
3204    #[test]
3205    fn test_hierarchical_confidence_from_f64() {
3206        let hc: HierarchicalConfidence = 0.85_f64.into();
3207        assert!((hc.linkage - 0.85).abs() < 0.001);
3208    }
3209
3210    // ========================================================================
3211    // RaggedBatch Tests
3212    // ========================================================================
3213
3214    #[test]
3215    fn test_ragged_batch_from_sequences() {
3216        let seqs = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
3217        let batch = RaggedBatch::from_sequences(&seqs);
3218
3219        assert_eq!(batch.batch_size(), 3);
3220        assert_eq!(batch.total_tokens(), 9);
3221        assert_eq!(batch.max_seq_len, 4);
3222        assert_eq!(batch.cumulative_offsets, vec![0, 3, 5, 9]);
3223    }
3224
3225    #[test]
3226    fn test_ragged_batch_doc_range() {
3227        let seqs = vec![vec![1, 2, 3], vec![4, 5]];
3228        let batch = RaggedBatch::from_sequences(&seqs);
3229
3230        assert_eq!(batch.doc_range(0), Some(0..3));
3231        assert_eq!(batch.doc_range(1), Some(3..5));
3232        assert_eq!(batch.doc_range(2), None);
3233    }
3234
3235    #[test]
3236    fn test_ragged_batch_doc_tokens() {
3237        let seqs = vec![vec![1, 2, 3], vec![4, 5]];
3238        let batch = RaggedBatch::from_sequences(&seqs);
3239
3240        assert_eq!(batch.doc_tokens(0), Some(&[1, 2, 3][..]));
3241        assert_eq!(batch.doc_tokens(1), Some(&[4, 5][..]));
3242    }
3243
3244    #[test]
3245    fn test_ragged_batch_padding_savings() {
3246        // 3 docs: [3, 2, 4] tokens, max = 4
3247        // Padded: 3 * 4 = 12, actual: 9
3248        // Savings: 1 - 9/12 = 0.25
3249        let seqs = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
3250        let batch = RaggedBatch::from_sequences(&seqs);
3251        let savings = batch.padding_savings();
3252        assert!((savings - 0.25).abs() < 0.001);
3253    }
3254
3255    // ========================================================================
3256    // SpanCandidate Tests
3257    // ========================================================================
3258
3259    #[test]
3260    fn test_span_candidate() {
3261        let sc = SpanCandidate::new(0, 5, 10);
3262        assert_eq!(sc.doc_idx, 0);
3263        assert_eq!(sc.start, 5);
3264        assert_eq!(sc.end, 10);
3265        assert_eq!(sc.width(), 5);
3266    }
3267
3268    #[test]
3269    fn test_generate_span_candidates() {
3270        let seqs = vec![vec![1, 2, 3]]; // doc with 3 tokens
3271        let batch = RaggedBatch::from_sequences(&seqs);
3272        let candidates = generate_span_candidates(&batch, 2);
3273
3274        // With max_width=2: [0,1], [1,2], [2,3], [0,2], [1,3]
3275        // = spans: (0,1), (0,2), (1,2), (1,3), (2,3)
3276        assert_eq!(candidates.len(), 5);
3277
3278        // Verify all candidates are valid
3279        for c in &candidates {
3280            assert_eq!(c.doc_idx, 0);
3281            assert!(c.end as usize <= 3);
3282            assert!(c.width() as usize <= 2);
3283        }
3284    }
3285
3286    #[test]
3287    fn test_generate_filtered_candidates() {
3288        let seqs = vec![vec![1, 2, 3]];
3289        let batch = RaggedBatch::from_sequences(&seqs);
3290
3291        // With max_width=2, we have 5 candidates
3292        // Set mask: only first 2 pass threshold
3293        let mask = vec![0.9, 0.9, 0.1, 0.1, 0.1];
3294        let candidates = generate_filtered_candidates(&batch, 2, &mask, 0.5);
3295
3296        assert_eq!(candidates.len(), 2);
3297    }
3298
3299    // ========================================================================
3300    // EntityBuilder Tests
3301    // ========================================================================
3302
3303    #[test]
3304    fn test_entity_builder_basic() {
3305        let entity = Entity::builder("John", EntityType::Person)
3306            .span(0, 4)
3307            .confidence(0.95)
3308            .build();
3309
3310        assert_eq!(entity.text, "John");
3311        assert_eq!(entity.entity_type, EntityType::Person);
3312        assert_eq!(entity.start, 0);
3313        assert_eq!(entity.end, 4);
3314        assert!((entity.confidence - 0.95).abs() < f64::EPSILON);
3315    }
3316
3317    #[test]
3318    fn test_entity_builder_full() {
3319        let entity = Entity::builder("Marie Curie", EntityType::Person)
3320            .span(0, 11)
3321            .confidence(0.95)
3322            .kb_id("Q7186")
3323            .canonical_id(42)
3324            .normalized("Marie Salomea Skłodowska Curie")
3325            .provenance(Provenance::ml("bert", 0.95))
3326            .build();
3327
3328        assert_eq!(entity.text, "Marie Curie");
3329        assert_eq!(entity.kb_id.as_deref(), Some("Q7186"));
3330        assert_eq!(
3331            entity.canonical_id,
3332            Some(crate::core::types::CanonicalId::new(42))
3333        );
3334        assert_eq!(
3335            entity.normalized.as_deref(),
3336            Some("Marie Salomea Skłodowska Curie")
3337        );
3338        assert!(entity.provenance.is_some());
3339    }
3340
3341    #[test]
3342    fn test_entity_builder_hierarchical() {
3343        let hc = HierarchicalConfidence::new(0.9, 0.8, 0.7);
3344        let entity = Entity::builder("test", EntityType::Person)
3345            .span(0, 4)
3346            .hierarchical_confidence(hc)
3347            .build();
3348
3349        assert!(entity.hierarchical_confidence.is_some());
3350        assert!((entity.linkage_confidence() - 0.9).abs() < 0.001);
3351        assert!((entity.type_confidence() - 0.8).abs() < 0.001);
3352        assert!((entity.boundary_confidence() - 0.7).abs() < 0.001);
3353    }
3354
3355    #[test]
3356    fn test_entity_builder_visual() {
3357        let bbox = Span::bbox(0.1, 0.2, 0.3, 0.4);
3358        let entity = Entity::builder("receipt item", EntityType::Money)
3359            .visual_span(bbox)
3360            .confidence(0.9)
3361            .build();
3362
3363        assert!(entity.is_visual());
3364        assert!(entity.visual_span.is_some());
3365    }
3366
3367    // ========================================================================
3368    // Entity Helper Method Tests
3369    // ========================================================================
3370
3371    #[test]
3372    fn test_entity_hierarchical_confidence_helpers() {
3373        let mut entity = Entity::new("test", EntityType::Person, 0, 4, 0.8);
3374
3375        // Without hierarchical confidence, falls back to main confidence
3376        assert!((entity.linkage_confidence() - 0.8).abs() < 0.001);
3377        assert!((entity.type_confidence() - 0.8).abs() < 0.001);
3378        assert!((entity.boundary_confidence() - 0.8).abs() < 0.001);
3379
3380        // Set hierarchical confidence
3381        entity.set_hierarchical_confidence(HierarchicalConfidence::new(0.95, 0.85, 0.75));
3382        assert!((entity.linkage_confidence() - 0.95).abs() < 0.001);
3383        assert!((entity.type_confidence() - 0.85).abs() < 0.001);
3384        assert!((entity.boundary_confidence() - 0.75).abs() < 0.001);
3385    }
3386
3387    #[test]
3388    fn test_entity_from_visual() {
3389        let entity = Entity::from_visual(
3390            "receipt total",
3391            EntityType::Money,
3392            Span::bbox(0.5, 0.8, 0.2, 0.05),
3393            0.92,
3394        );
3395
3396        assert!(entity.is_visual());
3397        assert_eq!(entity.start, 0);
3398        assert_eq!(entity.end, 0);
3399        assert!((entity.confidence - 0.92).abs() < f64::EPSILON);
3400    }
3401
3402    #[test]
3403    fn test_entity_span_helpers() {
3404        let entity = Entity::new("test", EntityType::Person, 10, 20, 0.9);
3405        assert_eq!(entity.text_span(), (10, 20));
3406        assert_eq!(entity.span_len(), 10);
3407    }
3408
3409    // ========================================================================
3410    // Provenance Tests
3411    // ========================================================================
3412
3413    #[test]
3414    fn test_provenance_pattern() {
3415        let prov = Provenance::pattern("EMAIL");
3416        assert_eq!(prov.method, ExtractionMethod::Pattern);
3417        assert_eq!(prov.pattern.as_deref(), Some("EMAIL"));
3418        assert_eq!(prov.raw_confidence, Some(1.0)); // Patterns are deterministic
3419    }
3420
3421    #[test]
3422    fn test_provenance_ml() {
3423        let prov = Provenance::ml("bert-ner", 0.87);
3424        assert_eq!(prov.method, ExtractionMethod::Neural);
3425        assert_eq!(prov.source.as_ref(), "bert-ner");
3426        assert_eq!(prov.raw_confidence, Some(0.87));
3427    }
3428
3429    #[test]
3430    fn test_provenance_with_version() {
3431        let prov = Provenance::ml("gliner", 0.92).with_version("v2.1.0");
3432
3433        assert_eq!(prov.model_version.as_deref(), Some("v2.1.0"));
3434        assert_eq!(prov.source.as_ref(), "gliner");
3435    }
3436
3437    #[test]
3438    fn test_provenance_with_timestamp() {
3439        let prov = Provenance::pattern("DATE").with_timestamp("2024-01-15T10:30:00Z");
3440
3441        assert_eq!(prov.timestamp.as_deref(), Some("2024-01-15T10:30:00Z"));
3442    }
3443
3444    #[test]
3445    fn test_provenance_builder_chain() {
3446        let prov = Provenance::ml("modernbert-ner", 0.95)
3447            .with_version("v1.0.0")
3448            .with_timestamp("2024-11-27T12:00:00Z");
3449
3450        assert_eq!(prov.method, ExtractionMethod::Neural);
3451        assert_eq!(prov.source.as_ref(), "modernbert-ner");
3452        assert_eq!(prov.raw_confidence, Some(0.95));
3453        assert_eq!(prov.model_version.as_deref(), Some("v1.0.0"));
3454        assert_eq!(prov.timestamp.as_deref(), Some("2024-11-27T12:00:00Z"));
3455    }
3456
3457    #[test]
3458    fn test_provenance_serialization() {
3459        let prov = Provenance::ml("test", 0.9)
3460            .with_version("v1.0")
3461            .with_timestamp("2024-01-01");
3462
3463        let json = serde_json::to_string(&prov).unwrap();
3464        assert!(json.contains("model_version"));
3465        assert!(json.contains("v1.0"));
3466
3467        let restored: Provenance = serde_json::from_str(&json).unwrap();
3468        assert_eq!(restored.model_version.as_deref(), Some("v1.0"));
3469        assert_eq!(restored.timestamp.as_deref(), Some("2024-01-01"));
3470    }
3471}
3472
3473#[cfg(test)]
3474mod proptests {
3475    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in property tests
3476    use super::*;
3477    use proptest::prelude::*;
3478
3479    proptest! {
3480        #[test]
3481        fn confidence_always_clamped(conf in -10.0f64..10.0) {
3482            let e = Entity::new("test", EntityType::Person, 0, 4, conf);
3483            prop_assert!(e.confidence >= 0.0);
3484            prop_assert!(e.confidence <= 1.0);
3485        }
3486
3487        #[test]
3488        fn entity_type_roundtrip(label in "[A-Z]{3,10}") {
3489            let et = EntityType::from_label(&label);
3490            let back = EntityType::from_label(et.as_label());
3491            // Other types may round-trip to themselves or normalize
3492            prop_assert!(matches!(back, EntityType::Other(_)) || back == et);
3493        }
3494
3495        #[test]
3496        fn overlap_is_symmetric(
3497            s1 in 0usize..100,
3498            len1 in 1usize..50,
3499            s2 in 0usize..100,
3500            len2 in 1usize..50,
3501        ) {
3502            let e1 = Entity::new("a", EntityType::Person, s1, s1 + len1, 1.0);
3503            let e2 = Entity::new("b", EntityType::Person, s2, s2 + len2, 1.0);
3504            prop_assert_eq!(e1.overlaps(&e2), e2.overlaps(&e1));
3505        }
3506
3507        #[test]
3508        fn overlap_ratio_bounded(
3509            s1 in 0usize..100,
3510            len1 in 1usize..50,
3511            s2 in 0usize..100,
3512            len2 in 1usize..50,
3513        ) {
3514            let e1 = Entity::new("a", EntityType::Person, s1, s1 + len1, 1.0);
3515            let e2 = Entity::new("b", EntityType::Person, s2, s2 + len2, 1.0);
3516            let ratio = e1.overlap_ratio(&e2);
3517            prop_assert!(ratio >= 0.0);
3518            prop_assert!(ratio <= 1.0);
3519        }
3520
3521        #[test]
3522        fn self_overlap_ratio_is_one(s in 0usize..100, len in 1usize..50) {
3523            let e = Entity::new("test", EntityType::Person, s, s + len, 1.0);
3524            let ratio = e.overlap_ratio(&e);
3525            prop_assert!((ratio - 1.0).abs() < 1e-10);
3526        }
3527
3528        #[test]
3529        fn hierarchical_confidence_always_clamped(
3530            linkage in -2.0f32..2.0,
3531            type_score in -2.0f32..2.0,
3532            boundary in -2.0f32..2.0,
3533        ) {
3534            let hc = HierarchicalConfidence::new(linkage, type_score, boundary);
3535            prop_assert!(hc.linkage >= 0.0 && hc.linkage <= 1.0);
3536            prop_assert!(hc.type_score >= 0.0 && hc.type_score <= 1.0);
3537            prop_assert!(hc.boundary >= 0.0 && hc.boundary <= 1.0);
3538            prop_assert!(hc.combined() >= 0.0 && hc.combined() <= 1.0);
3539        }
3540
3541        #[test]
3542        fn span_candidate_width_consistent(
3543            doc in 0u32..10,
3544            start in 0u32..100,
3545            end in 1u32..100,
3546        ) {
3547            let actual_end = start.max(end);
3548            let sc = SpanCandidate::new(doc, start, actual_end);
3549            prop_assert_eq!(sc.width(), actual_end.saturating_sub(start));
3550        }
3551
3552        #[test]
3553        fn ragged_batch_preserves_tokens(
3554            seq_lens in proptest::collection::vec(1usize..10, 1..5),
3555        ) {
3556            // Create sequences with sequential token IDs
3557            let mut counter = 0u32;
3558            let seqs: Vec<Vec<u32>> = seq_lens.iter().map(|&len| {
3559                let seq: Vec<u32> = (counter..counter + len as u32).collect();
3560                counter += len as u32;
3561                seq
3562            }).collect();
3563
3564            let batch = RaggedBatch::from_sequences(&seqs);
3565
3566            // Verify batch properties
3567            prop_assert_eq!(batch.batch_size(), seqs.len());
3568            prop_assert_eq!(batch.total_tokens(), seq_lens.iter().sum::<usize>());
3569
3570            // Verify each doc can be retrieved correctly
3571            for (i, seq) in seqs.iter().enumerate() {
3572                let doc_tokens = batch.doc_tokens(i).unwrap();
3573                prop_assert_eq!(doc_tokens, seq.as_slice());
3574            }
3575        }
3576
3577        #[test]
3578        fn span_text_offsets_consistent(start in 0usize..100, len in 0usize..50) {
3579            let end = start + len;
3580            let span = Span::text(start, end);
3581            let (s, e) = span.text_offsets().unwrap();
3582            prop_assert_eq!(s, start);
3583            prop_assert_eq!(e, end);
3584            prop_assert_eq!(span.len(), len);
3585        }
3586    }
3587
3588    // ========================================================================
3589    // EntityViewport Tests
3590    // ========================================================================
3591
3592    #[test]
3593    fn test_entity_viewport_as_str() {
3594        assert_eq!(EntityViewport::Business.as_str(), "business");
3595        assert_eq!(EntityViewport::Legal.as_str(), "legal");
3596        assert_eq!(EntityViewport::Technical.as_str(), "technical");
3597        assert_eq!(EntityViewport::Academic.as_str(), "academic");
3598        assert_eq!(EntityViewport::Personal.as_str(), "personal");
3599        assert_eq!(EntityViewport::Political.as_str(), "political");
3600        assert_eq!(EntityViewport::Media.as_str(), "media");
3601        assert_eq!(EntityViewport::Historical.as_str(), "historical");
3602        assert_eq!(EntityViewport::General.as_str(), "general");
3603        assert_eq!(
3604            EntityViewport::Custom("custom".to_string()).as_str(),
3605            "custom"
3606        );
3607    }
3608
3609    #[test]
3610    fn test_entity_viewport_is_professional() {
3611        assert!(EntityViewport::Business.is_professional());
3612        assert!(EntityViewport::Legal.is_professional());
3613        assert!(EntityViewport::Technical.is_professional());
3614        assert!(EntityViewport::Academic.is_professional());
3615        assert!(EntityViewport::Political.is_professional());
3616
3617        assert!(!EntityViewport::Personal.is_professional());
3618        assert!(!EntityViewport::Media.is_professional());
3619        assert!(!EntityViewport::Historical.is_professional());
3620        assert!(!EntityViewport::General.is_professional());
3621        assert!(!EntityViewport::Custom("test".to_string()).is_professional());
3622    }
3623
3624    #[test]
3625    fn test_entity_viewport_from_str() {
3626        assert_eq!(
3627            "business".parse::<EntityViewport>().unwrap(),
3628            EntityViewport::Business
3629        );
3630        assert_eq!(
3631            "financial".parse::<EntityViewport>().unwrap(),
3632            EntityViewport::Business
3633        );
3634        assert_eq!(
3635            "corporate".parse::<EntityViewport>().unwrap(),
3636            EntityViewport::Business
3637        );
3638
3639        assert_eq!(
3640            "legal".parse::<EntityViewport>().unwrap(),
3641            EntityViewport::Legal
3642        );
3643        assert_eq!(
3644            "law".parse::<EntityViewport>().unwrap(),
3645            EntityViewport::Legal
3646        );
3647
3648        assert_eq!(
3649            "technical".parse::<EntityViewport>().unwrap(),
3650            EntityViewport::Technical
3651        );
3652        assert_eq!(
3653            "engineering".parse::<EntityViewport>().unwrap(),
3654            EntityViewport::Technical
3655        );
3656
3657        assert_eq!(
3658            "academic".parse::<EntityViewport>().unwrap(),
3659            EntityViewport::Academic
3660        );
3661        assert_eq!(
3662            "research".parse::<EntityViewport>().unwrap(),
3663            EntityViewport::Academic
3664        );
3665
3666        assert_eq!(
3667            "personal".parse::<EntityViewport>().unwrap(),
3668            EntityViewport::Personal
3669        );
3670        assert_eq!(
3671            "biographical".parse::<EntityViewport>().unwrap(),
3672            EntityViewport::Personal
3673        );
3674
3675        assert_eq!(
3676            "political".parse::<EntityViewport>().unwrap(),
3677            EntityViewport::Political
3678        );
3679        assert_eq!(
3680            "policy".parse::<EntityViewport>().unwrap(),
3681            EntityViewport::Political
3682        );
3683
3684        assert_eq!(
3685            "media".parse::<EntityViewport>().unwrap(),
3686            EntityViewport::Media
3687        );
3688        assert_eq!(
3689            "press".parse::<EntityViewport>().unwrap(),
3690            EntityViewport::Media
3691        );
3692
3693        assert_eq!(
3694            "historical".parse::<EntityViewport>().unwrap(),
3695            EntityViewport::Historical
3696        );
3697        assert_eq!(
3698            "history".parse::<EntityViewport>().unwrap(),
3699            EntityViewport::Historical
3700        );
3701
3702        assert_eq!(
3703            "general".parse::<EntityViewport>().unwrap(),
3704            EntityViewport::General
3705        );
3706        assert_eq!(
3707            "generic".parse::<EntityViewport>().unwrap(),
3708            EntityViewport::General
3709        );
3710        assert_eq!(
3711            "".parse::<EntityViewport>().unwrap(),
3712            EntityViewport::General
3713        );
3714
3715        // Custom viewport
3716        assert_eq!(
3717            "custom_viewport".parse::<EntityViewport>().unwrap(),
3718            EntityViewport::Custom("custom_viewport".to_string())
3719        );
3720    }
3721
3722    #[test]
3723    fn test_entity_viewport_from_str_case_insensitive() {
3724        assert_eq!(
3725            "BUSINESS".parse::<EntityViewport>().unwrap(),
3726            EntityViewport::Business
3727        );
3728        assert_eq!(
3729            "Business".parse::<EntityViewport>().unwrap(),
3730            EntityViewport::Business
3731        );
3732        assert_eq!(
3733            "BuSiNeSs".parse::<EntityViewport>().unwrap(),
3734            EntityViewport::Business
3735        );
3736    }
3737
3738    #[test]
3739    fn test_entity_viewport_display() {
3740        assert_eq!(format!("{}", EntityViewport::Business), "business");
3741        assert_eq!(format!("{}", EntityViewport::Academic), "academic");
3742        assert_eq!(
3743            format!("{}", EntityViewport::Custom("test".to_string())),
3744            "test"
3745        );
3746    }
3747
3748    #[test]
3749    fn test_entity_viewport_methods() {
3750        let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
3751
3752        // Initially no viewport
3753        assert!(!entity.has_viewport());
3754        assert_eq!(entity.viewport_or_default(), EntityViewport::General);
3755        assert!(entity.matches_viewport(&EntityViewport::Academic)); // No viewport matches any
3756
3757        // Set viewport
3758        entity.set_viewport(EntityViewport::Academic);
3759        assert!(entity.has_viewport());
3760        assert_eq!(entity.viewport_or_default(), EntityViewport::Academic);
3761        assert!(entity.matches_viewport(&EntityViewport::Academic));
3762        assert!(!entity.matches_viewport(&EntityViewport::Business));
3763    }
3764
3765    #[test]
3766    fn test_entity_builder_with_viewport() {
3767        let entity = Entity::builder("Marie Curie", EntityType::Person)
3768            .span(0, 11)
3769            .viewport(EntityViewport::Academic)
3770            .build();
3771
3772        assert_eq!(entity.viewport, Some(EntityViewport::Academic));
3773        assert!(entity.has_viewport());
3774    }
3775
3776    // ========================================================================
3777    // EntityCategory Tests
3778    // ========================================================================
3779
3780    #[test]
3781    fn test_entity_category_requires_ml() {
3782        assert!(EntityCategory::Agent.requires_ml());
3783        assert!(EntityCategory::Organization.requires_ml());
3784        assert!(EntityCategory::Place.requires_ml());
3785        assert!(EntityCategory::Creative.requires_ml());
3786        assert!(EntityCategory::Relation.requires_ml());
3787
3788        assert!(!EntityCategory::Temporal.requires_ml());
3789        assert!(!EntityCategory::Numeric.requires_ml());
3790        assert!(!EntityCategory::Contact.requires_ml());
3791        assert!(!EntityCategory::Misc.requires_ml());
3792    }
3793
3794    #[test]
3795    fn test_entity_category_pattern_detectable() {
3796        assert!(EntityCategory::Temporal.pattern_detectable());
3797        assert!(EntityCategory::Numeric.pattern_detectable());
3798        assert!(EntityCategory::Contact.pattern_detectable());
3799
3800        assert!(!EntityCategory::Agent.pattern_detectable());
3801        assert!(!EntityCategory::Organization.pattern_detectable());
3802        assert!(!EntityCategory::Place.pattern_detectable());
3803        assert!(!EntityCategory::Creative.pattern_detectable());
3804        assert!(!EntityCategory::Relation.pattern_detectable());
3805        assert!(!EntityCategory::Misc.pattern_detectable());
3806    }
3807
3808    #[test]
3809    fn test_entity_category_is_relation() {
3810        assert!(EntityCategory::Relation.is_relation());
3811
3812        assert!(!EntityCategory::Agent.is_relation());
3813        assert!(!EntityCategory::Organization.is_relation());
3814        assert!(!EntityCategory::Place.is_relation());
3815        assert!(!EntityCategory::Temporal.is_relation());
3816        assert!(!EntityCategory::Numeric.is_relation());
3817        assert!(!EntityCategory::Contact.is_relation());
3818        assert!(!EntityCategory::Creative.is_relation());
3819        assert!(!EntityCategory::Misc.is_relation());
3820    }
3821
3822    #[test]
3823    fn test_entity_category_as_str() {
3824        assert_eq!(EntityCategory::Agent.as_str(), "agent");
3825        assert_eq!(EntityCategory::Organization.as_str(), "organization");
3826        assert_eq!(EntityCategory::Place.as_str(), "place");
3827        assert_eq!(EntityCategory::Creative.as_str(), "creative");
3828        assert_eq!(EntityCategory::Temporal.as_str(), "temporal");
3829        assert_eq!(EntityCategory::Numeric.as_str(), "numeric");
3830        assert_eq!(EntityCategory::Contact.as_str(), "contact");
3831        assert_eq!(EntityCategory::Relation.as_str(), "relation");
3832        assert_eq!(EntityCategory::Misc.as_str(), "misc");
3833    }
3834
3835    #[test]
3836    fn test_entity_category_display() {
3837        assert_eq!(format!("{}", EntityCategory::Agent), "agent");
3838        assert_eq!(format!("{}", EntityCategory::Temporal), "temporal");
3839        assert_eq!(format!("{}", EntityCategory::Relation), "relation");
3840    }
3841}