Skip to main content

anno_core/core/
entity.rs

1//! Entity types and structures for NER.
2//!
3//! # Design Philosophy (Research-Aligned)
4//!
5//! This module implements entity types informed by modern NER research:
6//!
7//! - **GLiNER/Bi-Encoder**: Entity types are *labels to match against*, not fixed classes.
8//!   Relations ("CEO of") are entities too - they're just labels in the same latent space.
9//!
10//! - **TPLinker/Joint Extraction**: Entities and relations can be extracted in a single pass.
11//!   The type system supports relation triggers as first-class mentions.
12//!
13//! - **Knowledge Graphs**: Entities can link to external knowledge bases (`kb_id`) for
14//!   coreference resolution and GraphRAG applications.
15//!
16//! # Type Hierarchy
17//!
18//! ```text
19//! Mention
20//! ├── Entity (single span)
21//! │   ├── Named (ML): Person, Organization, Location
22//! │   ├── Temporal (Pattern): Date, Time
23//! │   ├── Numeric (Pattern): Money, Percent, Quantity, Cardinal, Ordinal
24//! │   └── Contact (Pattern): Email, Url, Phone
25//! │
26//! └── Relation (connects entities)
27//!     └── Trigger text: "CEO of", "located in", "born on"
28//! ```
29//!
30//! # Design Principles
31//!
32//! 1. **Bi-encoder compatible**: Types are semantic labels, not fixed enums
33//! 2. **Joint extraction**: Relations are mentions with trigger spans
34//! 3. **Knowledge linking**: `kb_id` for connecting to external KBs
35//! 4. **Hierarchical confidence**: Coarse (linkage) + fine (type) scores
36//! 5. **Multi-modal ready**: Spans can be text offsets or visual bboxes
37
38use serde::{Deserialize, Serialize};
39use std::borrow::Cow;
40
41// ============================================================================
42// Entity Category (OntoNotes-inspired)
43// ============================================================================
44
45/// Category of entity based on detection characteristics and semantics.
46///
47/// Based on OntoNotes 5.0 categories with extensions for:
48/// - Structured data (Contact, patterns)
49/// - Knowledge graphs (Relation, for TPLinker/GLiNER joint extraction)
50#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
51#[non_exhaustive]
52pub enum EntityCategory {
53    /// Named entities for people/groups (ML-required).
54    /// Types: Person, NORP (nationalities/religious/political groups)
55    Agent,
56    /// Named entities for organizations/facilities (ML-required).
57    /// Types: Organization, Facility
58    Organization,
59    /// Named entities for places (ML-required).
60    /// Types: GPE (geo-political), Location (geographic)
61    Place,
62    /// Named entities for creative/conceptual (ML-required).
63    /// Types: Event, Product, WorkOfArt, Law, Language
64    Creative,
65    /// Temporal entities (pattern-detectable).
66    /// Types: Date, Time
67    Temporal,
68    /// Numeric entities (pattern-detectable).
69    /// Types: Money, Percent, Quantity, Cardinal, Ordinal
70    Numeric,
71    /// Contact/identifier entities (pattern-detectable).
72    /// Types: Email, Url, Phone
73    Contact,
74    /// Relation triggers for knowledge graph construction (ML-required).
75    /// Examples: "CEO of", "located in", "founded by"
76    /// In GLiNER bi-encoder, relations are just another label to match.
77    Relation,
78    /// Miscellaneous/unknown category
79    Misc,
80}
81
82impl EntityCategory {
83    /// Returns true if this category requires ML for detection.
84    #[must_use]
85    pub const fn requires_ml(&self) -> bool {
86        matches!(
87            self,
88            EntityCategory::Agent
89                | EntityCategory::Organization
90                | EntityCategory::Place
91                | EntityCategory::Creative
92                | EntityCategory::Relation
93        )
94    }
95
96    /// Returns true if this category can be detected via patterns.
97    #[must_use]
98    pub const fn pattern_detectable(&self) -> bool {
99        matches!(
100            self,
101            EntityCategory::Temporal | EntityCategory::Numeric | EntityCategory::Contact
102        )
103    }
104
105    /// Returns true if this is a relation (for knowledge graph construction).
106    #[must_use]
107    pub const fn is_relation(&self) -> bool {
108        matches!(self, EntityCategory::Relation)
109    }
110
111    /// Returns OntoNotes-compatible category name.
112    #[must_use]
113    pub const fn as_str(&self) -> &'static str {
114        match self {
115            EntityCategory::Agent => "agent",
116            EntityCategory::Organization => "organization",
117            EntityCategory::Place => "place",
118            EntityCategory::Creative => "creative",
119            EntityCategory::Temporal => "temporal",
120            EntityCategory::Numeric => "numeric",
121            EntityCategory::Contact => "contact",
122            EntityCategory::Relation => "relation",
123            EntityCategory::Misc => "misc",
124        }
125    }
126}
127
128impl std::fmt::Display for EntityCategory {
129    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
130        write!(f, "{}", self.as_str())
131    }
132}
133
134// ============================================================================
135// Entity Viewport (Research: Entity Manifolds)
136// ============================================================================
137
138/// Viewport context for multi-faceted entity representation.
139///
140/// # Research Background
141///
142/// The concept of "Entity Viewports" comes from the observation that
143/// real-world entities are not monolithic - they present different
144/// facets depending on context:
145///
146/// - "Marie Curie" in an **Academic** context: physicist, Nobel laureate
147/// - "Marie Curie" in a **Technical** context: radioactivity researcher, X-ray pioneer
148/// - "Marie Curie" in a **Personal** context: mother, immigrant, educator
149/// - "Marie Curie" in a **Medical** context: founder of mobile X-ray units
150///
151/// Rather than collapsing all information into a single vector,
152/// the viewport model preserves these distinctions and enables
153/// "projection" at query time.
154///
155/// # Usage in RAG Systems
156///
157/// When answering "What were Curie's scientific contributions?", retrieve
158/// facts from the `Academic` viewport. When answering "What was Curie's
159/// personal life like?", retrieve from `Personal`.
160///
161/// # Example
162///
163/// ```rust
164/// use anno_core::{Entity, EntityType, EntityViewport};
165///
166/// let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
167/// entity.viewport = Some(EntityViewport::Academic);
168/// assert!(entity.viewport.as_ref().unwrap().is_professional());
169/// ```
170#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
171#[non_exhaustive]
172pub enum EntityViewport {
173    /// Business/financial context (CEO, revenue, market cap)
174    Business,
175    /// Legal context (lawsuits, settlements, compliance)
176    Legal,
177    /// Technical/engineering context (patents, inventions, code)
178    Technical,
179    /// Academic/research context (publications, citations, grants)
180    Academic,
181    /// Personal/biographical context (family, hobbies, background)
182    Personal,
183    /// Political context (lobbying, donations, policy positions)
184    Political,
185    /// Media/public relations context (interviews, statements, PR)
186    Media,
187    /// Historical context (past roles, timeline events)
188    Historical,
189    /// Generic/unspecified context
190    #[default]
191    General,
192    /// Custom viewport with a descriptive label
193    Custom(String),
194}
195
196impl EntityViewport {
197    /// Human-readable label for the viewport.
198    #[must_use]
199    pub fn as_str(&self) -> &str {
200        match self {
201            EntityViewport::Business => "business",
202            EntityViewport::Legal => "legal",
203            EntityViewport::Technical => "technical",
204            EntityViewport::Academic => "academic",
205            EntityViewport::Personal => "personal",
206            EntityViewport::Political => "political",
207            EntityViewport::Media => "media",
208            EntityViewport::Historical => "historical",
209            EntityViewport::General => "general",
210            EntityViewport::Custom(s) => s,
211        }
212    }
213
214    /// Is this a professional/work-related viewport?
215    #[must_use]
216    pub const fn is_professional(&self) -> bool {
217        matches!(
218            self,
219            EntityViewport::Business
220                | EntityViewport::Legal
221                | EntityViewport::Technical
222                | EntityViewport::Academic
223                | EntityViewport::Political
224        )
225    }
226}
227
228impl std::str::FromStr for EntityViewport {
229    type Err = std::convert::Infallible;
230
231    fn from_str(s: &str) -> Result<Self, Self::Err> {
232        Ok(match s.to_lowercase().as_str() {
233            "business" | "financial" | "corporate" => EntityViewport::Business,
234            "legal" | "law" | "compliance" => EntityViewport::Legal,
235            "technical" | "engineering" | "tech" => EntityViewport::Technical,
236            "academic" | "research" | "scholarly" => EntityViewport::Academic,
237            "personal" | "biographical" | "private" => EntityViewport::Personal,
238            "political" | "policy" | "government" => EntityViewport::Political,
239            "media" | "press" | "pr" | "public_relations" => EntityViewport::Media,
240            "historical" | "history" | "past" => EntityViewport::Historical,
241            "general" | "generic" | "" => EntityViewport::General,
242            other => EntityViewport::Custom(other.to_string()),
243        })
244    }
245}
246
247impl std::fmt::Display for EntityViewport {
248    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
249        write!(f, "{}", self.as_str())
250    }
251}
252
253// ============================================================================
254// Entity Type
255// ============================================================================
256
257/// Entity type classification.
258///
259/// Organized into categories:
260/// - **Named** (ML-required): Person, Organization, Location
261/// - **Temporal** (pattern): Date, Time
262/// - **Numeric** (pattern): Money, Percent, Quantity, Cardinal, Ordinal
263/// - **Contact** (pattern): Email, Url, Phone
264///
265/// # Examples
266///
267/// ```
268/// use anno_core::EntityType;
269///
270/// let ty = EntityType::Email;
271/// assert!(ty.category().pattern_detectable());
272/// assert!(!ty.category().requires_ml());
273///
274/// let ty = EntityType::Person;
275/// assert!(ty.category().requires_ml());
276/// ```
277#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
278#[non_exhaustive]
279pub enum EntityType {
280    // === Named Entities (ML-required) ===
281    /// Person name (PER) - requires ML/context
282    Person,
283    /// Organization name (ORG) - requires ML/context
284    Organization,
285    /// Location/Place (LOC/GPE) - requires ML/context
286    Location,
287
288    // === Temporal Entities (Pattern-detectable) ===
289    /// Date expression (DATE) - pattern-detectable
290    Date,
291    /// Time expression (TIME) - pattern-detectable
292    Time,
293
294    // === Numeric Entities (Pattern-detectable) ===
295    /// Monetary value (MONEY) - pattern-detectable
296    Money,
297    /// Percentage (PERCENT) - pattern-detectable
298    Percent,
299    /// Quantity with unit (QUANTITY) - pattern-detectable
300    Quantity,
301    /// Cardinal number (CARDINAL) - pattern-detectable
302    Cardinal,
303    /// Ordinal number (ORDINAL) - pattern-detectable
304    Ordinal,
305
306    // === Contact Entities (Pattern-detectable) ===
307    /// Email address - pattern-detectable
308    Email,
309    /// URL/URI - pattern-detectable
310    Url,
311    /// Phone number - pattern-detectable
312    Phone,
313
314    // === Extensibility ===
315    /// Domain-specific custom type with explicit category
316    Custom {
317        /// Type name (e.g., "DISEASE", "PRODUCT", "EVENT")
318        name: String,
319        /// Category for this custom type
320        category: EntityCategory,
321    },
322
323    /// Legacy catch-all for unknown types (prefer Custom for new code)
324    #[serde(rename = "Other")]
325    Other(String),
326}
327
328impl EntityType {
329    /// Get the category of this entity type.
330    #[must_use]
331    pub fn category(&self) -> EntityCategory {
332        match self {
333            // Agent entities (people/groups)
334            EntityType::Person => EntityCategory::Agent,
335            // Organization entities
336            EntityType::Organization => EntityCategory::Organization,
337            // Place entities (locations)
338            EntityType::Location => EntityCategory::Place,
339            // Temporal entities
340            EntityType::Date | EntityType::Time => EntityCategory::Temporal,
341            // Numeric entities
342            EntityType::Money
343            | EntityType::Percent
344            | EntityType::Quantity
345            | EntityType::Cardinal
346            | EntityType::Ordinal => EntityCategory::Numeric,
347            // Contact entities
348            EntityType::Email | EntityType::Url | EntityType::Phone => EntityCategory::Contact,
349            // Custom with explicit category
350            EntityType::Custom { category, .. } => *category,
351            // Legacy Other - assume misc
352            EntityType::Other(_) => EntityCategory::Misc,
353        }
354    }
355
356    /// Returns true if this entity type requires ML for detection.
357    #[must_use]
358    pub fn requires_ml(&self) -> bool {
359        self.category().requires_ml()
360    }
361
362    /// Returns true if this entity type can be detected via patterns.
363    #[must_use]
364    pub fn pattern_detectable(&self) -> bool {
365        self.category().pattern_detectable()
366    }
367
368    /// Convert to standard label string (CoNLL/OntoNotes format).
369    ///
370    /// ```
371    /// use anno_core::EntityType;
372    ///
373    /// assert_eq!(EntityType::Person.as_label(), "PER");
374    /// assert_eq!(EntityType::Location.as_label(), "LOC");
375    /// ```
376    #[must_use]
377    pub fn as_label(&self) -> &str {
378        match self {
379            EntityType::Person => "PER",
380            EntityType::Organization => "ORG",
381            EntityType::Location => "LOC",
382            EntityType::Date => "DATE",
383            EntityType::Time => "TIME",
384            EntityType::Money => "MONEY",
385            EntityType::Percent => "PERCENT",
386            EntityType::Quantity => "QUANTITY",
387            EntityType::Cardinal => "CARDINAL",
388            EntityType::Ordinal => "ORDINAL",
389            EntityType::Email => "EMAIL",
390            EntityType::Url => "URL",
391            EntityType::Phone => "PHONE",
392            EntityType::Custom { name, .. } => name.as_str(),
393            EntityType::Other(s) => s.as_str(),
394        }
395    }
396
397    /// Parse from standard label string.
398    ///
399    /// Handles various formats: CoNLL (PER), OntoNotes (PERSON), BIO (B-PER).
400    ///
401    /// ```
402    /// use anno_core::EntityType;
403    ///
404    /// assert_eq!(EntityType::from_label("PER"), EntityType::Person);
405    /// assert_eq!(EntityType::from_label("B-ORG"), EntityType::Organization);
406    /// assert_eq!(EntityType::from_label("PERSON"), EntityType::Person);
407    /// ```
408    #[must_use]
409    pub fn from_label(label: &str) -> Self {
410        // Strip BIO prefix if present
411        let label = label
412            .strip_prefix("B-")
413            .or_else(|| label.strip_prefix("I-"))
414            .or_else(|| label.strip_prefix("E-"))
415            .or_else(|| label.strip_prefix("S-"))
416            .unwrap_or(label);
417
418        match label.to_uppercase().as_str() {
419            // Named entities (multiple variations)
420            "PER" | "PERSON" => EntityType::Person,
421            "ORG" | "ORGANIZATION" | "COMPANY" | "CORPORATION" => EntityType::Organization,
422            "LOC" | "LOCATION" | "GPE" | "GEO-LOC" => EntityType::Location,
423            // WNUT / FewNERD specific types (common in social media / Wikipedia)
424            "FACILITY" | "FAC" | "BUILDING" => {
425                EntityType::custom("BUILDING", EntityCategory::Place)
426            }
427            "PRODUCT" | "PROD" => EntityType::custom("PRODUCT", EntityCategory::Misc),
428            "EVENT" => EntityType::custom("EVENT", EntityCategory::Creative),
429            "CREATIVE-WORK" | "WORK_OF_ART" | "ART" => {
430                EntityType::custom("CREATIVE_WORK", EntityCategory::Creative)
431            }
432            "GROUP" | "NORP" => EntityType::custom("GROUP", EntityCategory::Agent),
433            // Temporal
434            "DATE" => EntityType::Date,
435            "TIME" => EntityType::Time,
436            // Numeric
437            "MONEY" | "CURRENCY" => EntityType::Money,
438            "PERCENT" | "PERCENTAGE" => EntityType::Percent,
439            "QUANTITY" => EntityType::Quantity,
440            "CARDINAL" => EntityType::Cardinal,
441            "ORDINAL" => EntityType::Ordinal,
442            // Contact
443            "EMAIL" => EntityType::Email,
444            "URL" | "URI" => EntityType::Url,
445            "PHONE" | "TELEPHONE" => EntityType::Phone,
446            // MISC variations
447            "MISC" | "MISCELLANEOUS" | "OTHER" => EntityType::Other("MISC".to_string()),
448            // Biomedical types
449            "DISEASE" | "DISORDER" => EntityType::custom("DISEASE", EntityCategory::Misc),
450            "CHEMICAL" | "DRUG" => EntityType::custom("CHEMICAL", EntityCategory::Misc),
451            "GENE" => EntityType::custom("GENE", EntityCategory::Misc),
452            "PROTEIN" => EntityType::custom("PROTEIN", EntityCategory::Misc),
453            // Unknown -> Other
454            other => EntityType::Other(other.to_string()),
455        }
456    }
457
458    /// Create a custom domain-specific entity type.
459    ///
460    /// # Examples
461    ///
462    /// ```
463    /// use anno_core::{EntityType, EntityCategory};
464    ///
465    /// let disease = EntityType::custom("DISEASE", EntityCategory::Agent);
466    /// assert!(disease.requires_ml());
467    ///
468    /// let product_id = EntityType::custom("PRODUCT_ID", EntityCategory::Misc);
469    /// assert!(!product_id.requires_ml());
470    /// ```
471    #[must_use]
472    pub fn custom(name: impl Into<String>, category: EntityCategory) -> Self {
473        EntityType::Custom {
474            name: name.into(),
475            category,
476        }
477    }
478}
479
480impl std::fmt::Display for EntityType {
481    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
482        write!(f, "{}", self.as_label())
483    }
484}
485
486impl std::str::FromStr for EntityType {
487    type Err = std::convert::Infallible;
488
489    /// Parse from standard label string. Never fails - unknown labels become `Other`.
490    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
491        Ok(Self::from_label(s))
492    }
493}
494
495// =============================================================================
496// Type Mapping for Domain-Specific Datasets
497// =============================================================================
498
499/// Maps domain-specific entity types to standard NER types.
500///
501/// # Research Context (Familiarity paper, arXiv:2412.10121)
502///
503/// Type mapping creates "label overlap" between training and evaluation:
504/// - Mapping ACTOR → Person increases overlap
505/// - This can inflate zero-shot F1 scores
506///
507/// Use `LabelShift::from_type_sets()` to quantify how much overlap exists.
508/// High overlap (>80%) means the evaluation is NOT truly zero-shot.
509///
510/// # When to Use TypeMapper
511///
512/// - Cross-dataset comparison (normalize schemas for fair eval)
513/// - Domain adaptation (map new labels to known types)
514///
515/// # When NOT to Use TypeMapper
516///
517/// - True zero-shot evaluation (keep labels distinct)
518/// - Measuring generalization (overlap hides generalization failures)
519///
520/// # Example
521///
522/// ```rust
523/// use anno_core::{TypeMapper, EntityType, EntityCategory};
524///
525/// // MIT Movie dataset mapping
526/// let mut mapper = TypeMapper::new();
527/// mapper.add("ACTOR", EntityType::Person);
528/// mapper.add("DIRECTOR", EntityType::Person);
529/// mapper.add("TITLE", EntityType::custom("WORK_OF_ART", EntityCategory::Creative));
530///
531/// assert_eq!(mapper.map("ACTOR"), Some(&EntityType::Person));
532/// assert_eq!(mapper.normalize("DIRECTOR"), EntityType::Person);
533/// ```
534#[derive(Debug, Clone, Default)]
535pub struct TypeMapper {
536    mappings: std::collections::HashMap<String, EntityType>,
537}
538
539impl TypeMapper {
540    /// Create empty mapper.
541    #[must_use]
542    pub fn new() -> Self {
543        Self::default()
544    }
545
546    /// Create mapper for MIT Movie dataset.
547    #[must_use]
548    pub fn mit_movie() -> Self {
549        let mut mapper = Self::new();
550        // Map to standard types where possible
551        mapper.add("ACTOR", EntityType::Person);
552        mapper.add("DIRECTOR", EntityType::Person);
553        mapper.add("CHARACTER", EntityType::Person);
554        mapper.add(
555            "TITLE",
556            EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
557        );
558        mapper.add("GENRE", EntityType::custom("GENRE", EntityCategory::Misc));
559        mapper.add("YEAR", EntityType::Date);
560        mapper.add("RATING", EntityType::custom("RATING", EntityCategory::Misc));
561        mapper.add("PLOT", EntityType::custom("PLOT", EntityCategory::Misc));
562        mapper
563    }
564
565    /// Create mapper for MIT Restaurant dataset.
566    #[must_use]
567    pub fn mit_restaurant() -> Self {
568        let mut mapper = Self::new();
569        mapper.add("RESTAURANT_NAME", EntityType::Organization);
570        mapper.add("LOCATION", EntityType::Location);
571        mapper.add(
572            "CUISINE",
573            EntityType::custom("CUISINE", EntityCategory::Misc),
574        );
575        mapper.add("DISH", EntityType::custom("DISH", EntityCategory::Misc));
576        mapper.add("PRICE", EntityType::Money);
577        mapper.add(
578            "AMENITY",
579            EntityType::custom("AMENITY", EntityCategory::Misc),
580        );
581        mapper.add("HOURS", EntityType::Time);
582        mapper
583    }
584
585    /// Create mapper for biomedical datasets (BC5CDR, NCBI).
586    #[must_use]
587    pub fn biomedical() -> Self {
588        let mut mapper = Self::new();
589        mapper.add(
590            "DISEASE",
591            EntityType::custom("DISEASE", EntityCategory::Agent),
592        );
593        mapper.add(
594            "CHEMICAL",
595            EntityType::custom("CHEMICAL", EntityCategory::Misc),
596        );
597        mapper.add("DRUG", EntityType::custom("DRUG", EntityCategory::Misc));
598        mapper.add("GENE", EntityType::custom("GENE", EntityCategory::Misc));
599        mapper.add(
600            "PROTEIN",
601            EntityType::custom("PROTEIN", EntityCategory::Misc),
602        );
603        // GENIA types
604        mapper.add("DNA", EntityType::custom("DNA", EntityCategory::Misc));
605        mapper.add("RNA", EntityType::custom("RNA", EntityCategory::Misc));
606        mapper.add(
607            "cell_line",
608            EntityType::custom("CELL_LINE", EntityCategory::Misc),
609        );
610        mapper.add(
611            "cell_type",
612            EntityType::custom("CELL_TYPE", EntityCategory::Misc),
613        );
614        mapper
615    }
616
617    /// Create mapper for social media NER datasets (TweetNER7, etc.).
618    #[must_use]
619    pub fn social_media() -> Self {
620        let mut mapper = Self::new();
621        // TweetNER7 types
622        mapper.add("person", EntityType::Person);
623        mapper.add("corporation", EntityType::Organization);
624        mapper.add("location", EntityType::Location);
625        mapper.add("group", EntityType::Organization);
626        mapper.add(
627            "product",
628            EntityType::custom("PRODUCT", EntityCategory::Misc),
629        );
630        mapper.add(
631            "creative_work",
632            EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
633        );
634        mapper.add("event", EntityType::custom("EVENT", EntityCategory::Misc));
635        mapper
636    }
637
638    /// Create mapper for manufacturing domain datasets (FabNER, etc.).
639    #[must_use]
640    pub fn manufacturing() -> Self {
641        let mut mapper = Self::new();
642        // FabNER entity types
643        mapper.add("MATE", EntityType::custom("MATERIAL", EntityCategory::Misc));
644        mapper.add("MANP", EntityType::custom("PROCESS", EntityCategory::Misc));
645        mapper.add("MACEQ", EntityType::custom("MACHINE", EntityCategory::Misc));
646        mapper.add(
647            "APPL",
648            EntityType::custom("APPLICATION", EntityCategory::Misc),
649        );
650        mapper.add("FEAT", EntityType::custom("FEATURE", EntityCategory::Misc));
651        mapper.add(
652            "PARA",
653            EntityType::custom("PARAMETER", EntityCategory::Misc),
654        );
655        mapper.add("PRO", EntityType::custom("PROPERTY", EntityCategory::Misc));
656        mapper.add(
657            "CHAR",
658            EntityType::custom("CHARACTERISTIC", EntityCategory::Misc),
659        );
660        mapper.add(
661            "ENAT",
662            EntityType::custom("ENABLING_TECHNOLOGY", EntityCategory::Misc),
663        );
664        mapper.add(
665            "CONPRI",
666            EntityType::custom("CONCEPT_PRINCIPLE", EntityCategory::Misc),
667        );
668        mapper.add(
669            "BIOP",
670            EntityType::custom("BIO_PROCESS", EntityCategory::Misc),
671        );
672        mapper.add(
673            "MANS",
674            EntityType::custom("MAN_STANDARD", EntityCategory::Misc),
675        );
676        mapper
677    }
678
679    /// Add a mapping from source label to target type.
680    pub fn add(&mut self, source: impl Into<String>, target: EntityType) {
681        self.mappings.insert(source.into().to_uppercase(), target);
682    }
683
684    /// Get mapped type for a label (returns None if not mapped).
685    #[must_use]
686    pub fn map(&self, label: &str) -> Option<&EntityType> {
687        self.mappings.get(&label.to_uppercase())
688    }
689
690    /// Normalize a label to EntityType, using mapping if available.
691    ///
692    /// Falls back to `EntityType::from_label()` if no mapping exists.
693    #[must_use]
694    pub fn normalize(&self, label: &str) -> EntityType {
695        self.map(label)
696            .cloned()
697            .unwrap_or_else(|| EntityType::from_label(label))
698    }
699
700    /// Check if a label is mapped.
701    #[must_use]
702    pub fn contains(&self, label: &str) -> bool {
703        self.mappings.contains_key(&label.to_uppercase())
704    }
705
706    /// Get all source labels.
707    pub fn labels(&self) -> impl Iterator<Item = &String> {
708        self.mappings.keys()
709    }
710}
711
712/// Extraction method used to identify an entity.
713///
714/// # Research Context
715///
716/// Different extraction methods have different strengths:
717///
718/// | Method | Precision | Recall | Generalization | Use Case |
719/// |--------|-----------|--------|----------------|----------|
720/// | Pattern | Very High | Low | N/A (format-based) | Dates, emails, money |
721/// | Neural | High | High | Good | General NER |
722/// | Lexicon | Very High | Low | None | Closed-domain entities |
723/// | SoftLexicon | Medium | High | Good for rare types | Low-resource NER |
724/// | GatedEnsemble | Highest | Highest | Contextual | Short texts, domain shift |
725///
726/// See `docs/` for repo-local notes and entry points.
727#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
728#[non_exhaustive]
729pub enum ExtractionMethod {
730    /// Regex pattern matching (high precision for structured data like dates, money).
731    /// Does not generalize - only detects format-based entities.
732    Pattern,
733
734    /// Neural model inference (BERT, GLiNER, etc.).
735    /// The recommended default for general NER. Generalizes to unseen entities.
736    #[default]
737    Neural,
738
739    /// Exact lexicon/gazetteer lookup (deprecated approach).
740    /// High precision on known entities, zero recall on novel entities.
741    /// Only use for closed domains (stock tickers, medical codes).
742    #[deprecated(since = "0.2.0", note = "Use Neural or GatedEnsemble instead")]
743    Lexicon,
744
745    /// Embedding-based soft lexicon matching.
746    /// Useful for low-resource languages and rare entity types.
747    /// See: Rijhwani et al. (2020) "Soft Gazetteers for Low-Resource NER"
748    SoftLexicon,
749
750    /// Gated ensemble: neural + lexicon with learned weighting.
751    /// Model learns when to trust lexicon vs. context.
752    /// See: Nie et al. (2021) "GEMNET: Effective Gated Gazetteer Representations"
753    GatedEnsemble,
754
755    /// Multiple methods agreed on this entity (high confidence).
756    Consensus,
757
758    /// Heuristic-based extraction (capitalization, word shape, context).
759    /// Used by heuristic backends that don't use neural models.
760    Heuristic,
761
762    /// Unknown or unspecified extraction method.
763    Unknown,
764
765    /// Legacy rule-based extraction (for backward compatibility).
766    #[deprecated(since = "0.2.0", note = "Use Heuristic or Pattern instead")]
767    Rule,
768
769    /// Legacy alias for Neural (for backward compatibility).
770    #[deprecated(since = "0.2.0", note = "Use Neural instead")]
771    ML,
772
773    /// Legacy alias for Consensus (for backward compatibility).
774    #[deprecated(since = "0.2.0", note = "Use Consensus instead")]
775    Ensemble,
776}
777
778impl ExtractionMethod {
779    /// Returns true if this extraction method produces probabilistically calibrated
780    /// confidence scores suitable for calibration analysis (ECE, Brier score, etc.).
781    ///
782    /// # Calibrated Methods
783    ///
784    /// - **Neural**: Softmax outputs are intended to be probabilistic (though may need
785    ///   temperature scaling for true calibration)
786    /// - **GatedEnsemble**: Produces learned probability estimates
787    /// - **SoftLexicon**: Embedding similarity is pseudo-probabilistic
788    ///
789    /// # Uncalibrated Methods
790    ///
791    /// - **Pattern**: Binary (match/no-match); confidence is typically hardcoded
792    /// - **Heuristic**: Arbitrary scores from hand-crafted rules
793    /// - **Lexicon**: Binary exact match
794    /// - **Consensus**: Agreement count, not a probability
795    ///
796    /// # Example
797    ///
798    /// ```rust
799    /// use anno_core::ExtractionMethod;
800    ///
801    /// assert!(ExtractionMethod::Neural.is_calibrated());
802    /// assert!(!ExtractionMethod::Pattern.is_calibrated());
803    /// assert!(!ExtractionMethod::Heuristic.is_calibrated());
804    /// ```
805    #[must_use]
806    pub const fn is_calibrated(&self) -> bool {
807        #[allow(deprecated)]
808        match self {
809            ExtractionMethod::Neural => true,
810            ExtractionMethod::GatedEnsemble => true,
811            ExtractionMethod::SoftLexicon => true,
812            ExtractionMethod::ML => true, // Legacy alias for Neural
813            // Everything else is not calibrated
814            ExtractionMethod::Pattern => false,
815            ExtractionMethod::Lexicon => false,
816            ExtractionMethod::Consensus => false,
817            ExtractionMethod::Heuristic => false,
818            ExtractionMethod::Unknown => false,
819            ExtractionMethod::Rule => false,
820            ExtractionMethod::Ensemble => false,
821        }
822    }
823
824    /// Returns the confidence interpretation for this extraction method.
825    ///
826    /// This helps users understand what the confidence score means:
827    /// - `"probability"`: Score approximates P(correct)
828    /// - `"heuristic_score"`: Score is a non-probabilistic quality measure
829    /// - `"binary"`: Score is 0 or 1 (or a fixed value for matches)
830    #[must_use]
831    pub const fn confidence_interpretation(&self) -> &'static str {
832        #[allow(deprecated)]
833        match self {
834            ExtractionMethod::Neural | ExtractionMethod::ML => "probability",
835            ExtractionMethod::GatedEnsemble | ExtractionMethod::SoftLexicon => "probability",
836            ExtractionMethod::Pattern | ExtractionMethod::Lexicon => "binary",
837            ExtractionMethod::Heuristic | ExtractionMethod::Rule => "heuristic_score",
838            ExtractionMethod::Consensus | ExtractionMethod::Ensemble => "agreement_ratio",
839            ExtractionMethod::Unknown => "unknown",
840        }
841    }
842}
843
844impl std::fmt::Display for ExtractionMethod {
845    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
846        #[allow(deprecated)]
847        match self {
848            ExtractionMethod::Pattern => write!(f, "pattern"),
849            ExtractionMethod::Neural => write!(f, "neural"),
850            ExtractionMethod::Lexicon => write!(f, "lexicon"),
851            ExtractionMethod::SoftLexicon => write!(f, "soft_lexicon"),
852            ExtractionMethod::GatedEnsemble => write!(f, "gated_ensemble"),
853            ExtractionMethod::Consensus => write!(f, "consensus"),
854            ExtractionMethod::Heuristic => write!(f, "heuristic"),
855            ExtractionMethod::Unknown => write!(f, "unknown"),
856            ExtractionMethod::Rule => write!(f, "heuristic"), // Legacy alias
857            ExtractionMethod::ML => write!(f, "neural"),      // Legacy alias
858            ExtractionMethod::Ensemble => write!(f, "consensus"), // Legacy alias
859        }
860    }
861}
862
863// =============================================================================
864// Lexicon Traits
865// =============================================================================
866
867/// Exact-match lexicon/gazetteer for entity lookup.
868///
869/// # Research Context
870///
871/// Gazetteers (lists of known entities) are a classic NER technique. Modern research
872/// suggests they are most valuable when:
873///
874/// 1. **Domain is closed**: Stock tickers, medical codes, known product catalogs
875/// 2. **Text is short**: where context is insufficient
876/// 3. **Used as features**: Input to neural model, not final output (Song et al. 2020)
877///
878/// They're harmful when:
879/// 1. **Domain is open**: Novel entities not in the list get missed
880/// 2. **Used as authority**: Hardcoded lookups inflate test scores but fail in production
881///
882/// # When to Use
883///
884/// ```text
885/// Decision: Should I use a Lexicon?
886///
887/// Is entity type CLOSED (fixed, known list)?
888/// ├─ Yes: Lexicon is appropriate
889/// │       Examples: stock tickers, ICD-10 codes, country names
890/// └─ No:  Use Neural extraction instead
891///         Examples: person names, organization names, products
892/// ```
893///
894/// # Example
895///
896/// ```rust
897/// use anno_core::{Lexicon, EntityType, HashMapLexicon};
898///
899/// // Create a domain-specific lexicon
900/// let mut lexicon = HashMapLexicon::new("stock_tickers");
901/// lexicon.insert("AAPL", EntityType::Organization, 0.99);
902/// lexicon.insert("GOOGL", EntityType::Organization, 0.99);
903///
904/// // Lookup
905/// if let Some((entity_type, confidence)) = lexicon.lookup("AAPL") {
906///     assert_eq!(entity_type, EntityType::Organization);
907///     assert!(confidence > 0.9);
908/// }
909/// ```
910///
911/// # References
912///
913/// - Song et al. (2020). "Improving Neural NER with Gazetteers"
914/// - Nie et al. (2021). "GEMNET: Effective Gated Gazetteer Representations"
915/// - Rijhwani et al. (2020). "Soft Gazetteers for Low-Resource NER"
916pub trait Lexicon: Send + Sync {
917    /// Lookup an exact string, returning entity type and confidence if found.
918    ///
919    /// Returns `None` if the text is not in the lexicon.
920    fn lookup(&self, text: &str) -> Option<(EntityType, f64)>;
921
922    /// Check if the lexicon contains this exact string.
923    fn contains(&self, text: &str) -> bool {
924        self.lookup(text).is_some()
925    }
926
927    /// Get the lexicon source identifier (for provenance tracking).
928    fn source(&self) -> &str;
929
930    /// Get approximate number of entries (for debugging/metrics).
931    fn len(&self) -> usize;
932
933    /// Check if lexicon is empty.
934    fn is_empty(&self) -> bool {
935        self.len() == 0
936    }
937}
938
939/// Simple HashMap-based lexicon implementation.
940///
941/// Suitable for small to medium lexicons (<100k entries).
942/// For larger lexicons, consider a trie-based or FST implementation.
943#[derive(Debug, Clone)]
944pub struct HashMapLexicon {
945    entries: std::collections::HashMap<String, (EntityType, f64)>,
946    source: String,
947}
948
949impl HashMapLexicon {
950    /// Create a new empty lexicon with the given source identifier.
951    #[must_use]
952    pub fn new(source: impl Into<String>) -> Self {
953        Self {
954            entries: std::collections::HashMap::new(),
955            source: source.into(),
956        }
957    }
958
959    /// Insert an entry into the lexicon.
960    pub fn insert(&mut self, text: impl Into<String>, entity_type: EntityType, confidence: f64) {
961        self.entries.insert(text.into(), (entity_type, confidence));
962    }
963
964    /// Create from an iterator of (text, type, confidence) tuples.
965    pub fn from_iter<I, S>(source: impl Into<String>, entries: I) -> Self
966    where
967        I: IntoIterator<Item = (S, EntityType, f64)>,
968        S: Into<String>,
969    {
970        let mut lexicon = Self::new(source);
971        for (text, entity_type, confidence) in entries {
972            lexicon.insert(text, entity_type, confidence);
973        }
974        lexicon
975    }
976
977    /// Get all entries as an iterator (for debugging).
978    pub fn entries(&self) -> impl Iterator<Item = (&str, &EntityType, f64)> {
979        self.entries.iter().map(|(k, (t, c))| (k.as_str(), t, *c))
980    }
981}
982
983impl Lexicon for HashMapLexicon {
984    fn lookup(&self, text: &str) -> Option<(EntityType, f64)> {
985        self.entries.get(text).cloned()
986    }
987
988    fn source(&self) -> &str {
989        &self.source
990    }
991
992    fn len(&self) -> usize {
993        self.entries.len()
994    }
995}
996
997/// Provenance information for an extracted entity.
998///
999/// Tracks where an entity came from for debugging, explainability,
1000/// and confidence calibration in hybrid/ensemble systems.
1001#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
1002pub struct Provenance {
1003    /// Name of the backend that produced this entity (e.g., "pattern", "bert-onnx")
1004    pub source: Cow<'static, str>,
1005    /// Extraction method used
1006    pub method: ExtractionMethod,
1007    /// Specific pattern/rule name (for pattern/rule-based extraction)
1008    pub pattern: Option<Cow<'static, str>>,
1009    /// Raw confidence from the source model (before any calibration)
1010    pub raw_confidence: Option<f64>,
1011    /// Model version for reproducibility (e.g., "gliner-v2.1", "bert-base-uncased-2024-01")
1012    #[serde(default, skip_serializing_if = "Option::is_none")]
1013    pub model_version: Option<Cow<'static, str>>,
1014    /// Timestamp when extraction occurred (ISO 8601)
1015    #[serde(default, skip_serializing_if = "Option::is_none")]
1016    pub timestamp: Option<String>,
1017}
1018
1019impl Provenance {
1020    /// Create provenance for regex-based extraction.
1021    #[must_use]
1022    pub fn pattern(pattern_name: &'static str) -> Self {
1023        Self {
1024            source: Cow::Borrowed("pattern"),
1025            method: ExtractionMethod::Pattern,
1026            pattern: Some(Cow::Borrowed(pattern_name)),
1027            raw_confidence: Some(1.0), // Patterns are deterministic
1028            model_version: None,
1029            timestamp: None,
1030        }
1031    }
1032
1033    /// Create provenance for ML-based extraction.
1034    ///
1035    /// Accepts both static strings and owned strings:
1036    /// ```rust
1037    /// use anno_core::Provenance;
1038    ///
1039    /// // Static string (zero allocation)
1040    /// let p1 = Provenance::ml("gliner", 0.95);
1041    ///
1042    /// // Owned string (dynamic model name)
1043    /// let model_name = "bert-base";
1044    /// let p2 = Provenance::ml(model_name.to_string(), 0.95);
1045    /// ```
1046    #[must_use]
1047    pub fn ml(model_name: impl Into<Cow<'static, str>>, confidence: f64) -> Self {
1048        Self {
1049            source: model_name.into(),
1050            method: ExtractionMethod::Neural,
1051            pattern: None,
1052            raw_confidence: Some(confidence),
1053            model_version: None,
1054            timestamp: None,
1055        }
1056    }
1057
1058    /// Deprecated: Use `ml()` instead which now accepts both static and owned strings.
1059    #[deprecated(
1060        since = "0.2.1",
1061        note = "Use ml() instead, it now accepts owned strings"
1062    )]
1063    #[must_use]
1064    pub fn ml_owned(model_name: impl Into<String>, confidence: f64) -> Self {
1065        Self::ml(Cow::Owned(model_name.into()), confidence)
1066    }
1067
1068    /// Create provenance for ensemble/hybrid extraction.
1069    #[must_use]
1070    pub fn ensemble(sources: &'static str) -> Self {
1071        Self {
1072            source: Cow::Borrowed(sources),
1073            method: ExtractionMethod::Consensus,
1074            pattern: None,
1075            raw_confidence: None,
1076            model_version: None,
1077            timestamp: None,
1078        }
1079    }
1080
1081    /// Create provenance with model version for reproducibility.
1082    #[must_use]
1083    pub fn with_version(mut self, version: &'static str) -> Self {
1084        self.model_version = Some(Cow::Borrowed(version));
1085        self
1086    }
1087
1088    /// Create provenance with timestamp.
1089    #[must_use]
1090    pub fn with_timestamp(mut self, timestamp: impl Into<String>) -> Self {
1091        self.timestamp = Some(timestamp.into());
1092        self
1093    }
1094}
1095
1096// ============================================================================
1097// Span Types (Multi-Modal Support)
1098// ============================================================================
1099
1100/// A span locator for text and visual modalities.
1101///
1102/// `Span` is a **simplified subset** of [`grounded::Location`] designed for
1103/// the detection layer (`Entity`). It covers the most common cases:
1104///
1105/// - Text offsets (traditional NER)
1106/// - Bounding boxes (visual document understanding)
1107/// - Hybrid (OCR with both text and visual location)
1108///
1109/// # Relationship to `Location`
1110///
1111/// | `Span` variant | `Location` equivalent |
1112/// |----------------|-----------------------|
1113/// | `Text` | `Location::Text` |
1114/// | `BoundingBox` | `Location::BoundingBox` |
1115/// | `Hybrid` | `Location::TextWithBbox` |
1116///
1117/// For modalities not covered by `Span` (temporal, cuboid, genomic, discontinuous),
1118/// use `Location` directly via the canonical `Signal` → `Track` → `Identity` pipeline.
1119///
1120/// # Conversion
1121///
1122/// - `Span → Location`: Always succeeds via `Location::from(&span)`
1123/// - `Location → Span`: Use `location.to_span()`, returns `None` for unsupported variants
1124///
1125/// [`grounded::Location`]: super::grounded::Location
1126#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
1127pub enum Span {
1128    /// Text span with **character offsets** (start, end).
1129    ///
1130    /// Offsets are Unicode scalar value indices (what `text.chars()` counts),
1131    /// consistent with `Entity.start/end` and `grounded::Location::Text`.
1132    Text {
1133        /// Start character offset (inclusive)
1134        start: usize,
1135        /// End character offset (exclusive)
1136        end: usize,
1137    },
1138    /// Visual bounding box (normalized 0.0-1.0 coordinates)
1139    /// For ColPali: image patch locations
1140    BoundingBox {
1141        /// X coordinate (normalized 0.0-1.0)
1142        x: f32,
1143        /// Y coordinate (normalized 0.0-1.0)
1144        y: f32,
1145        /// Width (normalized 0.0-1.0)
1146        width: f32,
1147        /// Height (normalized 0.0-1.0)
1148        height: f32,
1149        /// Optional page number (for multi-page documents)
1150        page: Option<u32>,
1151    },
1152    /// Hybrid: both text and visual location (for OCR-verified extraction)
1153    Hybrid {
1154        /// Start character offset (inclusive)
1155        start: usize,
1156        /// End character offset (exclusive)
1157        end: usize,
1158        /// Bounding box for visual location
1159        bbox: Box<Span>,
1160    },
1161}
1162
1163impl Span {
1164    /// Create a text span.
1165    #[must_use]
1166    pub const fn text(start: usize, end: usize) -> Self {
1167        Self::Text { start, end }
1168    }
1169
1170    /// Create a bounding box span with normalized coordinates.
1171    #[must_use]
1172    pub fn bbox(x: f32, y: f32, width: f32, height: f32) -> Self {
1173        Self::BoundingBox {
1174            x,
1175            y,
1176            width,
1177            height,
1178            page: None,
1179        }
1180    }
1181
1182    /// Create a bounding box with page number.
1183    #[must_use]
1184    pub fn bbox_on_page(x: f32, y: f32, width: f32, height: f32, page: u32) -> Self {
1185        Self::BoundingBox {
1186            x,
1187            y,
1188            width,
1189            height,
1190            page: Some(page),
1191        }
1192    }
1193
1194    /// Check if this is a text span.
1195    #[must_use]
1196    pub const fn is_text(&self) -> bool {
1197        matches!(self, Self::Text { .. } | Self::Hybrid { .. })
1198    }
1199
1200    /// Check if this has visual location.
1201    #[must_use]
1202    pub const fn is_visual(&self) -> bool {
1203        matches!(self, Self::BoundingBox { .. } | Self::Hybrid { .. })
1204    }
1205
1206    /// Get text offsets if available.
1207    #[must_use]
1208    pub const fn text_offsets(&self) -> Option<(usize, usize)> {
1209        match self {
1210            Self::Text { start, end } => Some((*start, *end)),
1211            Self::Hybrid { start, end, .. } => Some((*start, *end)),
1212            Self::BoundingBox { .. } => None,
1213        }
1214    }
1215
1216    /// Calculate span length for text spans.
1217    #[must_use]
1218    pub fn len(&self) -> usize {
1219        match self {
1220            Self::Text { start, end } => end.saturating_sub(*start),
1221            Self::Hybrid { start, end, .. } => end.saturating_sub(*start),
1222            Self::BoundingBox { .. } => 0,
1223        }
1224    }
1225
1226    /// Check if span is empty.
1227    #[must_use]
1228    pub fn is_empty(&self) -> bool {
1229        self.len() == 0
1230    }
1231}
1232
1233// ============================================================================
1234// Discontinuous Spans (W2NER/ACE-style)
1235// ============================================================================
1236
1237/// A discontinuous span representing non-contiguous entity mentions.
1238///
1239/// Some entities span multiple non-adjacent text regions:
1240/// - "severe \[pain\] in the \[abdomen\]" → "severe abdominal pain"
1241/// - "the \[president\] ... \[Obama\]" → coreference
1242///
1243/// This is required for:
1244/// - **Medical NER**: Anatomical modifiers separated from findings
1245/// - **Legal NER**: Parties referenced across clauses
1246/// - **W2NER**: Word-word relation grids that detect discontinuous entities
1247///
1248/// # Offset Unit (CRITICAL)
1249///
1250/// `DiscontinuousSpan` uses **character offsets** (Unicode scalar value indices),
1251/// consistent with [`Entity::start`](super::entity::Entity::start) /
1252/// [`Entity::end`](super::entity::Entity::end) and `anno::core::grounded::Location`.
1253///
1254/// This is intentionally *not* byte offsets. If you have byte offsets (from regex,
1255/// `str::find`, tokenizers, etc.), convert them to character offsets first (see
1256/// `anno::offset::SpanConverter` in the `anno` crate).
1257///
1258/// # Example
1259///
1260/// ```rust
1261/// use anno_core::DiscontinuousSpan;
1262///
1263/// // "severe pain in the abdomen" where "severe" modifies "pain"
1264/// // but they're separated by other words
1265/// let span = DiscontinuousSpan::new(vec![
1266///     0..6,   // "severe"
1267///     12..16, // "pain"
1268/// ]);
1269///
1270/// assert_eq!(span.num_segments(), 2);
1271/// assert!(span.is_discontinuous());
1272/// ```
1273#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1274pub struct DiscontinuousSpan {
1275    /// Non-overlapping segments, sorted by start position.
1276    /// Each `Range<usize>` represents (start_char, end_char).
1277    segments: Vec<std::ops::Range<usize>>,
1278}
1279
1280impl DiscontinuousSpan {
1281    /// Create a new discontinuous span from segments.
1282    ///
1283    /// Segments are sorted and validated (no overlaps).
1284    #[must_use]
1285    pub fn new(mut segments: Vec<std::ops::Range<usize>>) -> Self {
1286        // Sort by start position
1287        segments.sort_by_key(|r| r.start);
1288        Self { segments }
1289    }
1290
1291    /// Create from a single contiguous span.
1292    #[must_use]
1293    #[allow(clippy::single_range_in_vec_init)] // Intentional: contiguous is special case of discontinuous
1294    pub fn contiguous(start: usize, end: usize) -> Self {
1295        Self {
1296            segments: vec![start..end],
1297        }
1298    }
1299
1300    /// Number of segments.
1301    #[must_use]
1302    pub fn num_segments(&self) -> usize {
1303        self.segments.len()
1304    }
1305
1306    /// True if this spans multiple non-adjacent regions.
1307    #[must_use]
1308    pub fn is_discontinuous(&self) -> bool {
1309        self.segments.len() > 1
1310    }
1311
1312    /// True if this is a single contiguous span.
1313    #[must_use]
1314    pub fn is_contiguous(&self) -> bool {
1315        self.segments.len() <= 1
1316    }
1317
1318    /// Get the segments.
1319    #[must_use]
1320    pub fn segments(&self) -> &[std::ops::Range<usize>] {
1321        &self.segments
1322    }
1323
1324    /// Get the overall bounding range (start of first to end of last).
1325    #[must_use]
1326    pub fn bounding_range(&self) -> Option<std::ops::Range<usize>> {
1327        if self.segments.is_empty() {
1328            return None;
1329        }
1330        let start = self.segments.first()?.start;
1331        let end = self.segments.last()?.end;
1332        Some(start..end)
1333    }
1334
1335    /// Total character length (sum of all segments).
1336    ///
1337    #[must_use]
1338    pub fn total_len(&self) -> usize {
1339        self.segments.iter().map(|r| r.end - r.start).sum()
1340    }
1341
1342    /// Extract text from each segment and join with separator.
1343    #[must_use]
1344    pub fn extract_text(&self, text: &str, separator: &str) -> String {
1345        self.segments
1346            .iter()
1347            .map(|r| {
1348                let start = r.start;
1349                let len = r.end.saturating_sub(r.start);
1350                text.chars().skip(start).take(len).collect::<String>()
1351            })
1352            .collect::<Vec<_>>()
1353            .join(separator)
1354    }
1355
1356    /// Check if a character position falls within any segment.
1357    ///
1358    /// # Arguments
1359    ///
1360    /// * `pos` - Character offset to check (Unicode scalar value index)
1361    ///
1362    /// # Returns
1363    ///
1364    /// `true` if the character position falls within any segment of this span.
1365    #[must_use]
1366    pub fn contains(&self, pos: usize) -> bool {
1367        self.segments.iter().any(|r| r.contains(&pos))
1368    }
1369
1370    /// Convert to a regular Span (uses bounding range, loses discontinuity info).
1371    #[must_use]
1372    pub fn to_span(&self) -> Option<Span> {
1373        self.bounding_range().map(|r| Span::Text {
1374            start: r.start,
1375            end: r.end,
1376        })
1377    }
1378}
1379
1380impl From<std::ops::Range<usize>> for DiscontinuousSpan {
1381    fn from(range: std::ops::Range<usize>) -> Self {
1382        Self::contiguous(range.start, range.end)
1383    }
1384}
1385
1386impl Default for Span {
1387    fn default() -> Self {
1388        Self::Text { start: 0, end: 0 }
1389    }
1390}
1391
1392// ============================================================================
1393// Hierarchical Confidence (Coarse-to-Fine)
1394// ============================================================================
1395
1396/// Hierarchical confidence scores for coarse-to-fine extraction.
1397///
1398/// Research (HiNet, InfoHier) shows that extraction benefits from
1399/// decomposed confidence:
1400/// - **Linkage**: "Is there ANY entity here?" (binary, fast filter)
1401/// - **Type**: "What type is it?" (fine-grained classification)
1402/// - **Boundary**: "Where exactly does it start/end?" (span refinement)
1403#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
1404pub struct HierarchicalConfidence {
1405    /// Coarse: probability that this span contains ANY entity (0.0-1.0)
1406    /// Used for early filtering in the TPLinker "handshaking" matrix.
1407    pub linkage: f32,
1408    /// Fine: probability that the type classification is correct (0.0-1.0)
1409    pub type_score: f32,
1410    /// Boundary: confidence in the exact span boundaries (0.0-1.0)
1411    /// Low for entities with fuzzy boundaries (e.g., "the CEO" vs "CEO")
1412    pub boundary: f32,
1413}
1414
1415impl HierarchicalConfidence {
1416    /// Create hierarchical confidence with all scores.
1417    #[must_use]
1418    pub fn new(linkage: f32, type_score: f32, boundary: f32) -> Self {
1419        Self {
1420            linkage: linkage.clamp(0.0, 1.0),
1421            type_score: type_score.clamp(0.0, 1.0),
1422            boundary: boundary.clamp(0.0, 1.0),
1423        }
1424    }
1425
1426    /// Create from a single confidence score (legacy compatibility).
1427    /// Assigns same score to all levels.
1428    #[must_use]
1429    pub fn from_single(confidence: f32) -> Self {
1430        let c = confidence.clamp(0.0, 1.0);
1431        Self {
1432            linkage: c,
1433            type_score: c,
1434            boundary: c,
1435        }
1436    }
1437
1438    /// Calculate combined confidence (geometric mean).
1439    /// Geometric mean penalizes low scores more than arithmetic mean.
1440    #[must_use]
1441    pub fn combined(&self) -> f32 {
1442        (self.linkage * self.type_score * self.boundary).powf(1.0 / 3.0)
1443    }
1444
1445    /// Calculate combined confidence as f64 for legacy compatibility.
1446    #[must_use]
1447    pub fn as_f64(&self) -> f64 {
1448        self.combined() as f64
1449    }
1450
1451    /// Check if passes minimum threshold at all levels.
1452    #[must_use]
1453    pub fn passes_threshold(&self, linkage_min: f32, type_min: f32, boundary_min: f32) -> bool {
1454        self.linkage >= linkage_min && self.type_score >= type_min && self.boundary >= boundary_min
1455    }
1456}
1457
1458impl Default for HierarchicalConfidence {
1459    fn default() -> Self {
1460        Self {
1461            linkage: 1.0,
1462            type_score: 1.0,
1463            boundary: 1.0,
1464        }
1465    }
1466}
1467
1468impl From<f64> for HierarchicalConfidence {
1469    fn from(confidence: f64) -> Self {
1470        Self::from_single(confidence as f32)
1471    }
1472}
1473
1474impl From<f32> for HierarchicalConfidence {
1475    fn from(confidence: f32) -> Self {
1476        Self::from_single(confidence)
1477    }
1478}
1479
1480// ============================================================================
1481// Ragged Batch (ModernBERT Unpadding)
1482// ============================================================================
1483
1484/// A ragged (unpadded) batch for efficient ModernBERT inference.
1485///
1486/// ModernBERT achieves its speed advantage by avoiding padding tokens entirely.
1487/// Instead of `[batch, max_seq_len]`, it uses a single contiguous 1D sequence
1488/// with offset indices to track document boundaries.
1489///
1490/// # Memory Layout
1491///
1492/// ```text
1493/// Traditional (padded):
1494/// [doc1_tok1, doc1_tok2, PAD, PAD, PAD]  <- wasted compute
1495/// [doc2_tok1, doc2_tok2, doc2_tok3, PAD, PAD]
1496///
1497/// Ragged (unpadded):
1498/// [doc1_tok1, doc1_tok2, doc2_tok1, doc2_tok2, doc2_tok3]
1499/// cumulative_offsets: [0, 2, 5]  <- doc1 is [0..2], doc2 is [2..5]
1500/// ```
1501#[derive(Debug, Clone)]
1502pub struct RaggedBatch {
1503    /// Token IDs flattened into a single contiguous array.
1504    /// Shape: `[total_tokens]` (1D, no padding)
1505    pub token_ids: Vec<u32>,
1506    /// Cumulative sequence lengths.
1507    /// Length: batch_size + 1
1508    /// Document i spans tokens \[offsets\[i\]..offsets\[i+1\])
1509    pub cumulative_offsets: Vec<u32>,
1510    /// Maximum sequence length in this batch (for kernel bounds).
1511    pub max_seq_len: usize,
1512}
1513
1514impl RaggedBatch {
1515    /// Create a new ragged batch from sequences.
1516    pub fn from_sequences(sequences: &[Vec<u32>]) -> Self {
1517        let total_tokens: usize = sequences.iter().map(|s| s.len()).sum();
1518        let mut token_ids = Vec::with_capacity(total_tokens);
1519        let mut cumulative_offsets = Vec::with_capacity(sequences.len() + 1);
1520        let mut max_seq_len = 0;
1521
1522        cumulative_offsets.push(0);
1523        for seq in sequences {
1524            token_ids.extend_from_slice(seq);
1525            // Check for overflow: u32::MAX is 4,294,967,295
1526            // If token_ids.len() exceeds this, we'll truncate (which is a bug)
1527            // but in practice, this is unlikely for reasonable batch sizes
1528            let len = token_ids.len();
1529            if len > u32::MAX as usize {
1530                // This would overflow - use saturating cast to prevent panic
1531                // but log a warning as this indicates a problem
1532                log::warn!(
1533                    "Token count {} exceeds u32::MAX, truncating to {}",
1534                    len,
1535                    u32::MAX
1536                );
1537                cumulative_offsets.push(u32::MAX);
1538            } else {
1539                cumulative_offsets.push(len as u32);
1540            }
1541            max_seq_len = max_seq_len.max(seq.len());
1542        }
1543
1544        Self {
1545            token_ids,
1546            cumulative_offsets,
1547            max_seq_len,
1548        }
1549    }
1550
1551    /// Get the number of documents in this batch.
1552    #[must_use]
1553    pub fn batch_size(&self) -> usize {
1554        self.cumulative_offsets.len().saturating_sub(1)
1555    }
1556
1557    /// Get the total number of tokens (no padding).
1558    #[must_use]
1559    pub fn total_tokens(&self) -> usize {
1560        self.token_ids.len()
1561    }
1562
1563    /// Get token range for a specific document.
1564    #[must_use]
1565    pub fn doc_range(&self, doc_idx: usize) -> Option<std::ops::Range<usize>> {
1566        if doc_idx + 1 < self.cumulative_offsets.len() {
1567            let start = self.cumulative_offsets[doc_idx] as usize;
1568            let end = self.cumulative_offsets[doc_idx + 1] as usize;
1569            Some(start..end)
1570        } else {
1571            None
1572        }
1573    }
1574
1575    /// Get tokens for a specific document.
1576    #[must_use]
1577    pub fn doc_tokens(&self, doc_idx: usize) -> Option<&[u32]> {
1578        self.doc_range(doc_idx).map(|r| &self.token_ids[r])
1579    }
1580
1581    /// Calculate memory saved vs padded batch.
1582    #[must_use]
1583    pub fn padding_savings(&self) -> f64 {
1584        let padded_size = self.batch_size() * self.max_seq_len;
1585        if padded_size == 0 {
1586            return 0.0;
1587        }
1588        1.0 - (self.total_tokens() as f64 / padded_size as f64)
1589    }
1590}
1591
1592// ============================================================================
1593// Span Candidate Generation
1594// ============================================================================
1595
/// One candidate token span to be scored during entity extraction.
///
/// In GLiNER/bi-encoder systems, we generate all possible spans up to a
/// maximum width and score them against entity type embeddings.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SpanCandidate {
    /// Document index in the batch
    pub doc_idx: u32,
    /// Start token index (within the document)
    pub start: u32,
    /// End token index (exclusive)
    pub end: u32,
}

impl SpanCandidate {
    /// Builds a candidate for tokens `start..end` of document `doc_idx`.
    #[must_use]
    pub const fn new(doc_idx: u32, start: u32, end: u32) -> Self {
        SpanCandidate {
            doc_idx,
            start,
            end,
        }
    }

    /// Number of tokens covered; `0` when `end` does not exceed `start`.
    #[must_use]
    pub const fn width(&self) -> u32 {
        if self.end > self.start {
            self.end - self.start
        } else {
            0
        }
    }
}
1627
1628/// Generate all valid span candidates for a ragged batch.
1629///
1630/// This is the "gnarly" operation in GLiNER - efficiently enumerating
1631/// all valid spans without O(N^2) memory allocation.
1632pub fn generate_span_candidates(batch: &RaggedBatch, max_width: usize) -> Vec<SpanCandidate> {
1633    let mut candidates = Vec::new();
1634
1635    for doc_idx in 0..batch.batch_size() {
1636        if let Some(range) = batch.doc_range(doc_idx) {
1637            let doc_len = range.len();
1638            // Generate all spans [i, j) where j - i <= max_width
1639            for start in 0..doc_len {
1640                let max_end = (start + max_width).min(doc_len);
1641                for end in (start + 1)..=max_end {
1642                    candidates.push(SpanCandidate::new(doc_idx as u32, start as u32, end as u32));
1643                }
1644            }
1645        }
1646    }
1647
1648    candidates
1649}
1650
1651/// Generate span candidates with early filtering.
1652///
1653/// Uses a linkage mask to skip low-probability spans (TPLinker optimization).
1654pub fn generate_filtered_candidates(
1655    batch: &RaggedBatch,
1656    max_width: usize,
1657    linkage_mask: &[f32],
1658    threshold: f32,
1659) -> Vec<SpanCandidate> {
1660    let mut candidates = Vec::new();
1661    let mut mask_idx = 0;
1662
1663    for doc_idx in 0..batch.batch_size() {
1664        if let Some(range) = batch.doc_range(doc_idx) {
1665            let doc_len = range.len();
1666            for start in 0..doc_len {
1667                let max_end = (start + max_width).min(doc_len);
1668                for end in (start + 1)..=max_end {
1669                    // Only include if linkage probability exceeds threshold
1670                    if mask_idx < linkage_mask.len() && linkage_mask[mask_idx] >= threshold {
1671                        candidates.push(SpanCandidate::new(
1672                            doc_idx as u32,
1673                            start as u32,
1674                            end as u32,
1675                        ));
1676                    }
1677                    mask_idx += 1;
1678                }
1679            }
1680        }
1681    }
1682
1683    candidates
1684}
1685
1686// ============================================================================
1687// Entity (Extended)
1688// ============================================================================
1689
1690/// A recognized named entity or relation trigger.
1691///
1692/// # Entity Structure
1693///
1694/// ```text
1695/// "Contact John at john@example.com on Jan 15"
1696///          ^^^^    ^^^^^^^^^^^^^^^^    ^^^^^^
1697///          PER     EMAIL               DATE
1698///          |       |                   |
1699///          Named   Contact             Temporal
1700///          (ML)    (Pattern)           (Pattern)
1701/// ```
1702///
1703/// # Core Fields (Stable API)
1704///
1705/// - `text`, `entity_type`, `start`, `end`, `confidence` — always present
1706/// - `normalized`, `provenance` — commonly used optional fields
1707/// - `kb_id`, `canonical_id` — knowledge graph and coreference support
1708///
1709/// # Extended Fields (Research/Experimental)
1710///
1711/// The following fields support advanced research applications but may evolve:
1712///
1713/// | Field | Purpose | Status |
1714/// |-------|---------|--------|
1715/// | `visual_span` | Multi-modal (ColPali) extraction | Experimental |
1716/// | `discontinuous_span` | W2NER non-contiguous entities | Experimental |
1717/// | `valid_from`, `valid_until` | Temporal knowledge graphs | Research |
1718/// | `viewport` | Multi-faceted entity representation | Research |
1719/// | `hierarchical_confidence` | Coarse-to-fine NER | Experimental |
1720///
1721/// These fields are `#[serde(skip_serializing_if = "Option::is_none")]` so they
1722/// have no overhead when unused.
1723///
1724/// # Knowledge Graph Support
1725///
1726/// For GraphRAG and coreference resolution, entities support:
1727/// - `kb_id`: External knowledge base identifier (e.g., Wikidata Q-ID)
1728/// - `canonical_id`: Local coreference cluster ID (links "John" and "he")
1729///
1730/// # Normalization
1731///
1732/// Entities can have a normalized form for downstream processing:
1733/// - Dates: "Jan 15" → "2024-01-15" (ISO 8601)
1734/// - Money: "$1.5M" → "1500000 USD"
1735/// - Locations: "NYC" → "New York City"
1736#[derive(Debug, Clone, Serialize, Deserialize)]
1737pub struct Entity {
1738    /// Entity text (surface form as it appears in source)
1739    pub text: String,
1740    /// Entity type classification
1741    pub entity_type: EntityType,
1742    /// Start position (character offset, NOT byte offset).
1743    ///
1744    /// For Unicode text, character offsets differ from byte offsets.
1745    /// Use `anno::offset::bytes_to_chars` to convert if needed.
1746    pub start: usize,
1747    /// End position (character offset, exclusive).
1748    ///
1749    /// For Unicode text, character offsets differ from byte offsets.
1750    /// Use `anno::offset::bytes_to_chars` to convert if needed.
1751    pub end: usize,
1752    /// Confidence score (0.0-1.0, calibrated)
1753    pub confidence: f64,
1754    /// Normalized/canonical form (e.g., "Jan 15" → "2024-01-15")
1755    #[serde(default, skip_serializing_if = "Option::is_none")]
1756    pub normalized: Option<String>,
1757    /// Provenance: which backend/method produced this entity
1758    #[serde(default, skip_serializing_if = "Option::is_none")]
1759    pub provenance: Option<Provenance>,
1760    /// External knowledge base ID (e.g., "Q7186" for Marie Curie in Wikidata).
1761    /// Used for entity linking and GraphRAG applications.
1762    #[serde(default, skip_serializing_if = "Option::is_none")]
1763    pub kb_id: Option<String>,
1764    /// Local coreference cluster ID.
1765    /// Multiple mentions with the same `canonical_id` refer to the same entity.
1766    /// Example: "Marie Curie" and "she" might share `canonical_id = CanonicalId(42)`.
1767    #[serde(default, skip_serializing_if = "Option::is_none")]
1768    pub canonical_id: Option<super::types::CanonicalId>,
1769    /// Hierarchical confidence (coarse-to-fine).
1770    /// Provides linkage, type, and boundary scores separately.
1771    #[serde(default, skip_serializing_if = "Option::is_none")]
1772    pub hierarchical_confidence: Option<HierarchicalConfidence>,
1773    /// Visual span for multi-modal (ColPali) extraction.
1774    /// When set, provides bounding box location in addition to text offsets.
1775    #[serde(default, skip_serializing_if = "Option::is_none")]
1776    pub visual_span: Option<Span>,
1777    /// Discontinuous span for non-contiguous entity mentions (W2NER support).
1778    /// When set, overrides `start`/`end` for length calculations.
1779    /// Example: "New York and LA \[airports\]" where "airports" modifies both.
1780    #[serde(default, skip_serializing_if = "Option::is_none")]
1781    pub discontinuous_span: Option<DiscontinuousSpan>,
1782    // =========================================================================
1783    // Temporal Validity (Research: Temporal Knowledge Graphs)
1784    // =========================================================================
1785    /// Start of temporal validity interval for this entity assertion.
1786    ///
1787    /// Entities are facts that may change over time:
1788    /// - "Satya Nadella is CEO of Microsoft" is valid from [2014, present]
1789    /// - "Steve Ballmer was CEO of Microsoft" was valid from [2000, 2014]
1790    ///
1791    /// When `None`, the entity is either:
1792    /// - Currently valid (no known end date)
1793    /// - Atemporal (timeless fact like "Paris is in France")
1794    ///
1795    /// # Example
1796    /// ```rust
1797    /// use anno_core::{Entity, EntityType};
1798    /// use chrono::{TimeZone, Utc};
1799    ///
1800    /// let mut entity = Entity::new("CEO of Microsoft", EntityType::Person, 0, 16, 0.9);
1801    /// entity.valid_from = Some(Utc.with_ymd_and_hms(2008, 10, 1, 0, 0, 0).unwrap());
1802    /// ```
1803    #[serde(default, skip_serializing_if = "Option::is_none")]
1804    pub valid_from: Option<chrono::DateTime<chrono::Utc>>,
1805    /// End of temporal validity interval for this entity assertion.
1806    ///
1807    /// When `None` and `valid_from` is set, the fact is currently valid.
1808    /// When both are `None`, the entity is atemporal.
1809    #[serde(default, skip_serializing_if = "Option::is_none")]
1810    pub valid_until: Option<chrono::DateTime<chrono::Utc>>,
1811    // =========================================================================
1812    // Viewport / Context (Research: Entity Manifolds)
1813    // =========================================================================
1814    /// Viewport context for multi-faceted entity representation.
1815    ///
1816    /// The same real-world entity can have different "faces" in different contexts:
1817    /// - "Marie Curie" in an academic context: professor, researcher
1818    /// - "Marie Curie" in a scientific context: physicist, chemist
1819    /// - "Marie Curie" in a personal context: mother, educator
1820    ///
1821    /// This enables "holographic" entity projection at query time:
1822    /// given a query context, project the entity manifold to the relevant viewport.
1823    ///
1824    /// # Example
1825    /// ```rust
1826    /// use anno_core::{Entity, EntityType, EntityViewport};
1827    ///
1828    /// let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
1829    /// entity.viewport = Some(EntityViewport::Academic);
1830    /// ```
1831    #[serde(default, skip_serializing_if = "Option::is_none")]
1832    pub viewport: Option<EntityViewport>,
1833}
1834
1835impl Entity {
1836    /// Create a new entity.
1837    ///
1838    /// ```
1839    /// use anno_core::{Entity, EntityType};
1840    ///
1841    /// let e = Entity::new("Berlin", EntityType::Location, 10, 16, 0.95);
1842    /// assert_eq!(e.text, "Berlin");
1843    /// assert_eq!(e.entity_type, EntityType::Location);
1844    /// assert_eq!((e.start, e.end), (10, 16));
1845    /// ```
1846    #[must_use]
1847    pub fn new(
1848        text: impl Into<String>,
1849        entity_type: EntityType,
1850        start: usize,
1851        end: usize,
1852        confidence: f64,
1853    ) -> Self {
1854        Self {
1855            text: text.into(),
1856            entity_type,
1857            start,
1858            end,
1859            confidence: confidence.clamp(0.0, 1.0),
1860            normalized: None,
1861            provenance: None,
1862            kb_id: None,
1863            canonical_id: None,
1864            hierarchical_confidence: None,
1865            visual_span: None,
1866            discontinuous_span: None,
1867            valid_from: None,
1868            valid_until: None,
1869            viewport: None,
1870        }
1871    }
1872
1873    /// Create a new entity with provenance information.
1874    #[must_use]
1875    pub fn with_provenance(
1876        text: impl Into<String>,
1877        entity_type: EntityType,
1878        start: usize,
1879        end: usize,
1880        confidence: f64,
1881        provenance: Provenance,
1882    ) -> Self {
1883        Self {
1884            text: text.into(),
1885            entity_type,
1886            start,
1887            end,
1888            confidence: confidence.clamp(0.0, 1.0),
1889            normalized: None,
1890            provenance: Some(provenance),
1891            kb_id: None,
1892            canonical_id: None,
1893            hierarchical_confidence: None,
1894            visual_span: None,
1895            discontinuous_span: None,
1896            valid_from: None,
1897            valid_until: None,
1898            viewport: None,
1899        }
1900    }
1901
1902    /// Create an entity with hierarchical confidence scores.
1903    #[must_use]
1904    pub fn with_hierarchical_confidence(
1905        text: impl Into<String>,
1906        entity_type: EntityType,
1907        start: usize,
1908        end: usize,
1909        confidence: HierarchicalConfidence,
1910    ) -> Self {
1911        Self {
1912            text: text.into(),
1913            entity_type,
1914            start,
1915            end,
1916            confidence: confidence.as_f64(),
1917            normalized: None,
1918            provenance: None,
1919            kb_id: None,
1920            canonical_id: None,
1921            hierarchical_confidence: Some(confidence),
1922            visual_span: None,
1923            discontinuous_span: None,
1924            valid_from: None,
1925            valid_until: None,
1926            viewport: None,
1927        }
1928    }
1929
1930    /// Create an entity from a visual bounding box (ColPali multi-modal).
1931    #[must_use]
1932    pub fn from_visual(
1933        text: impl Into<String>,
1934        entity_type: EntityType,
1935        bbox: Span,
1936        confidence: f64,
1937    ) -> Self {
1938        Self {
1939            text: text.into(),
1940            entity_type,
1941            start: 0,
1942            end: 0,
1943            confidence: confidence.clamp(0.0, 1.0),
1944            normalized: None,
1945            provenance: None,
1946            kb_id: None,
1947            canonical_id: None,
1948            hierarchical_confidence: None,
1949            visual_span: Some(bbox),
1950            discontinuous_span: None,
1951            valid_from: None,
1952            valid_until: None,
1953            viewport: None,
1954        }
1955    }
1956
1957    /// Create an entity with default confidence (1.0).
1958    #[must_use]
1959    pub fn with_type(
1960        text: impl Into<String>,
1961        entity_type: EntityType,
1962        start: usize,
1963        end: usize,
1964    ) -> Self {
1965        Self::new(text, entity_type, start, end, 1.0)
1966    }
1967
1968    /// Link this entity to an external knowledge base.
1969    ///
1970    /// # Examples
1971    /// ```
1972    /// use anno_core::{Entity, EntityType};
1973    /// let mut e = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
1974    /// e.link_to_kb("Q7186");
1975    /// assert_eq!(e.kb_id.as_deref(), Some("Q7186"));
1976    /// ```
1977    pub fn link_to_kb(&mut self, kb_id: impl Into<String>) {
1978        self.kb_id = Some(kb_id.into());
1979    }
1980
1981    /// Assign this entity to a coreference cluster.
1982    ///
1983    /// Entities with the same `canonical_id` refer to the same real-world entity.
1984    pub fn set_canonical(&mut self, canonical_id: impl Into<super::types::CanonicalId>) {
1985        self.canonical_id = Some(canonical_id.into());
1986    }
1987
1988    /// Builder-style method to set canonical ID.
1989    ///
1990    /// # Example
1991    /// ```
1992    /// use anno_core::{CanonicalId, Entity, EntityType};
1993    /// let entity = Entity::new("John", EntityType::Person, 0, 4, 0.9)
1994    ///     .with_canonical_id(42);
1995    /// assert_eq!(entity.canonical_id, Some(CanonicalId::new(42)));
1996    /// ```
1997    #[must_use]
1998    pub fn with_canonical_id(mut self, canonical_id: impl Into<super::types::CanonicalId>) -> Self {
1999        self.canonical_id = Some(canonical_id.into());
2000        self
2001    }
2002
2003    /// Check if this entity is linked to a knowledge base.
2004    #[must_use]
2005    pub fn is_linked(&self) -> bool {
2006        self.kb_id.is_some()
2007    }
2008
2009    /// Check if this entity has coreference information.
2010    #[must_use]
2011    pub fn has_coreference(&self) -> bool {
2012        self.canonical_id.is_some()
2013    }
2014
2015    /// Check if this entity has a discontinuous span.
2016    ///
2017    /// Discontinuous entities span non-contiguous text regions.
2018    /// Example: "New York and LA airports" contains "New York airports"
2019    /// as a discontinuous entity.
2020    #[must_use]
2021    pub fn is_discontinuous(&self) -> bool {
2022        self.discontinuous_span
2023            .as_ref()
2024            .map(|s| s.is_discontinuous())
2025            .unwrap_or(false)
2026    }
2027
2028    /// Get the discontinuous segments if present.
2029    ///
2030    /// Returns `None` if this is a contiguous entity.
2031    #[must_use]
2032    pub fn discontinuous_segments(&self) -> Option<Vec<std::ops::Range<usize>>> {
2033        self.discontinuous_span
2034            .as_ref()
2035            .filter(|s| s.is_discontinuous())
2036            .map(|s| s.segments().to_vec())
2037    }
2038
2039    /// Set a discontinuous span for this entity.
2040    ///
2041    /// This is used by W2NER and similar models that detect non-contiguous mentions.
2042    pub fn set_discontinuous_span(&mut self, span: DiscontinuousSpan) {
2043        // Update start/end to match the bounding range
2044        if let Some(bounding) = span.bounding_range() {
2045            self.start = bounding.start;
2046            self.end = bounding.end;
2047        }
2048        self.discontinuous_span = Some(span);
2049    }
2050
2051    /// Get the total length covered by this entity, in **characters**.
2052    ///
2053    /// - **Contiguous**: `end - start`
2054    /// - **Discontinuous**: sum of segment lengths
2055    ///
2056    /// This is intentionally consistent: all offsets in `anno::core` entity spans
2057    /// are **character offsets** (Unicode scalar values), not byte offsets.
2058    #[must_use]
2059    pub fn total_len(&self) -> usize {
2060        if let Some(ref span) = self.discontinuous_span {
2061            span.segments().iter().map(|r| r.end - r.start).sum()
2062        } else {
2063            self.end.saturating_sub(self.start)
2064        }
2065    }
2066
2067    /// Set the normalized form for this entity.
2068    ///
2069    /// # Examples
2070    ///
2071    /// ```rust
2072    /// use anno_core::{Entity, EntityType};
2073    ///
2074    /// let mut entity = Entity::new("Jan 15", EntityType::Date, 0, 6, 0.95);
2075    /// entity.set_normalized("2024-01-15");
2076    /// assert_eq!(entity.normalized.as_deref(), Some("2024-01-15"));
2077    /// ```
2078    pub fn set_normalized(&mut self, normalized: impl Into<String>) {
2079        self.normalized = Some(normalized.into());
2080    }
2081
2082    /// Get the normalized form, or the original text if not normalized.
2083    #[must_use]
2084    pub fn normalized_or_text(&self) -> &str {
2085        self.normalized.as_deref().unwrap_or(&self.text)
2086    }
2087
2088    /// Get the extraction method, if known.
2089    #[must_use]
2090    pub fn method(&self) -> ExtractionMethod {
2091        self.provenance
2092            .as_ref()
2093            .map_or(ExtractionMethod::Unknown, |p| p.method)
2094    }
2095
2096    /// Get the source backend name, if known.
2097    #[must_use]
2098    pub fn source(&self) -> Option<&str> {
2099        self.provenance.as_ref().map(|p| p.source.as_ref())
2100    }
2101
2102    /// Get the entity category.
2103    #[must_use]
2104    pub fn category(&self) -> EntityCategory {
2105        self.entity_type.category()
2106    }
2107
2108    /// Returns true if this entity was detected via patterns (not ML).
2109    #[must_use]
2110    pub fn is_structured(&self) -> bool {
2111        self.entity_type.pattern_detectable()
2112    }
2113
2114    /// Returns true if this entity required ML for detection.
2115    #[must_use]
2116    pub fn is_named(&self) -> bool {
2117        self.entity_type.requires_ml()
2118    }
2119
2120    /// Check if this entity overlaps with another.
2121    #[must_use]
2122    pub fn overlaps(&self, other: &Entity) -> bool {
2123        !(self.end <= other.start || other.end <= self.start)
2124    }
2125
2126    /// Calculate overlap ratio (IoU) with another entity.
2127    #[must_use]
2128    pub fn overlap_ratio(&self, other: &Entity) -> f64 {
2129        let intersection_start = self.start.max(other.start);
2130        let intersection_end = self.end.min(other.end);
2131
2132        if intersection_start >= intersection_end {
2133            return 0.0;
2134        }
2135
2136        let intersection = (intersection_end - intersection_start) as f64;
2137        let union = ((self.end - self.start) + (other.end - other.start)
2138            - (intersection_end - intersection_start)) as f64;
2139
2140        if union == 0.0 {
2141            return 1.0;
2142        }
2143
2144        intersection / union
2145    }
2146
2147    /// Set hierarchical confidence scores.
2148    pub fn set_hierarchical_confidence(&mut self, confidence: HierarchicalConfidence) {
2149        self.confidence = confidence.as_f64();
2150        self.hierarchical_confidence = Some(confidence);
2151    }
2152
2153    /// Get the linkage confidence (coarse filter score).
2154    #[must_use]
2155    pub fn linkage_confidence(&self) -> f32 {
2156        self.hierarchical_confidence
2157            .map_or(self.confidence as f32, |h| h.linkage)
2158    }
2159
2160    /// Get the type classification confidence.
2161    #[must_use]
2162    pub fn type_confidence(&self) -> f32 {
2163        self.hierarchical_confidence
2164            .map_or(self.confidence as f32, |h| h.type_score)
2165    }
2166
2167    /// Get the boundary confidence.
2168    #[must_use]
2169    pub fn boundary_confidence(&self) -> f32 {
2170        self.hierarchical_confidence
2171            .map_or(self.confidence as f32, |h| h.boundary)
2172    }
2173
2174    /// Check if this entity has visual location (multi-modal).
2175    #[must_use]
2176    pub fn is_visual(&self) -> bool {
2177        self.visual_span.is_some()
2178    }
2179
2180    /// Get the text span (start, end).
2181    #[must_use]
2182    pub const fn text_span(&self) -> (usize, usize) {
2183        (self.start, self.end)
2184    }
2185
2186    /// Get the span length.
2187    #[must_use]
2188    pub const fn span_len(&self) -> usize {
2189        self.end.saturating_sub(self.start)
2190    }
2191
2192    /// Create a unified TextSpan with both byte and char offsets.
2193    ///
2194    /// This is useful when you need to work with both offset systems.
2195    /// The `text` parameter must be the original source text from which
2196    /// this entity was extracted.
2197    ///
2198    /// # Arguments
2199    /// * `source_text` - The original text (needed to compute byte offsets)
2200    ///
2201    /// # Returns
2202    /// A TextSpan with both byte and char offsets.
2203    ///
2204    /// # Note
2205    ///
2206    /// This method requires the offset conversion utilities from the `anno` crate.
2207    /// Use `anno::offset::char_to_byte_offsets()` directly for now.
2208    ///
2209    /// # Example
2210    /// ```rust,ignore
2211    /// use anno_core::{Entity, EntityType};
2212    ///
2213    /// let (byte_start, byte_end) = char_to_byte_offsets(text, entity.start, entity.end);
2214    /// ```
2215    #[allow(dead_code)]
2216    #[doc(hidden)]
2217    pub fn to_text_span(&self, _source_text: &str) -> serde_json::Value {
2218        unimplemented!("Use anno::offset utilities directly - see method docs")
2219    }
2220
2221    /// Set visual span for multi-modal extraction.
2222    pub fn set_visual_span(&mut self, span: Span) {
2223        self.visual_span = Some(span);
2224    }
2225
2226    /// Safely extract text from source using character offsets.
2227    ///
2228    /// Entity stores character offsets, not byte offsets. This method
2229    /// correctly extracts text by iterating over characters.
2230    ///
2231    /// # Arguments
2232    /// * `source_text` - The original text from which this entity was extracted
2233    ///
2234    /// # Returns
2235    /// The extracted text, or empty string if offsets are invalid
2236    ///
2237    /// # Example
2238    /// ```rust
2239    /// use anno_core::{Entity, EntityType};
2240    ///
2241    /// let text = "Hello, 日本!";
2242    /// let entity = Entity::new("日本", EntityType::Location, 7, 9, 0.95);
2243    /// assert_eq!(entity.extract_text(text), "日本");
2244    /// ```
2245    #[must_use]
2246    pub fn extract_text(&self, source_text: &str) -> String {
2247        // Performance: Use cached length if available, but fallback to counting
2248        // For single entity extraction, this is fine. For batch operations,
2249        // use extract_text_with_len with pre-computed length.
2250        let char_count = source_text.chars().count();
2251        self.extract_text_with_len(source_text, char_count)
2252    }
2253
2254    /// Extract text with pre-computed text length (performance optimization).
2255    ///
2256    /// Use this when validating/clamping multiple entities from the same text
2257    /// to avoid recalculating `text.chars().count()` for each entity.
2258    ///
2259    /// # Arguments
2260    /// * `source_text` - The original text
2261    /// * `text_char_count` - Pre-computed character count (from `text.chars().count()`)
2262    ///
2263    /// # Returns
2264    /// The extracted text, or empty string if offsets are invalid
2265    #[must_use]
2266    pub fn extract_text_with_len(&self, source_text: &str, text_char_count: usize) -> String {
2267        if self.start >= text_char_count || self.end > text_char_count || self.start >= self.end {
2268            return String::new();
2269        }
2270        source_text
2271            .chars()
2272            .skip(self.start)
2273            .take(self.end - self.start)
2274            .collect()
2275    }
2276
2277    // =========================================================================
2278    // Temporal Validity Methods
2279    // =========================================================================
2280
2281    /// Set the temporal validity start for this entity assertion.
2282    ///
2283    /// # Example
2284    /// ```rust
2285    /// use anno_core::{Entity, EntityType};
2286    /// use chrono::{TimeZone, Utc};
2287    ///
2288    /// let mut entity = Entity::new("CEO", EntityType::Person, 0, 3, 0.9);
2289    /// entity.set_valid_from(Utc.with_ymd_and_hms(2008, 10, 1, 0, 0, 0).unwrap());
2290    /// assert!(entity.is_temporal());
2291    /// ```
2292    pub fn set_valid_from(&mut self, dt: chrono::DateTime<chrono::Utc>) {
2293        self.valid_from = Some(dt);
2294    }
2295
2296    /// Set the temporal validity end for this entity assertion.
2297    pub fn set_valid_until(&mut self, dt: chrono::DateTime<chrono::Utc>) {
2298        self.valid_until = Some(dt);
2299    }
2300
2301    /// Set both temporal bounds at once.
2302    pub fn set_temporal_range(
2303        &mut self,
2304        from: chrono::DateTime<chrono::Utc>,
2305        until: chrono::DateTime<chrono::Utc>,
2306    ) {
2307        self.valid_from = Some(from);
2308        self.valid_until = Some(until);
2309    }
2310
2311    /// Check if this entity has temporal validity information.
2312    #[must_use]
2313    pub fn is_temporal(&self) -> bool {
2314        self.valid_from.is_some() || self.valid_until.is_some()
2315    }
2316
2317    /// Check if this entity was valid at a specific point in time.
2318    ///
2319    /// Returns `true` if:
2320    /// - No temporal bounds are set (atemporal entity)
2321    /// - The timestamp falls within [valid_from, valid_until]
2322    ///
2323    /// # Example
2324    /// ```rust
2325    /// use anno_core::{Entity, EntityType};
2326    /// use chrono::{TimeZone, Utc};
2327    ///
2328    /// let mut entity = Entity::new("CEO of Microsoft", EntityType::Person, 0, 16, 0.9);
2329    /// entity.set_valid_from(Utc.with_ymd_and_hms(2008, 1, 1, 0, 0, 0).unwrap());
2330    /// entity.set_valid_until(Utc.with_ymd_and_hms(2023, 12, 31, 0, 0, 0).unwrap());
2331    ///
2332    /// let query_2015 = Utc.with_ymd_and_hms(2015, 6, 1, 0, 0, 0).unwrap();
2333    /// let query_2005 = Utc.with_ymd_and_hms(2005, 6, 1, 0, 0, 0).unwrap();
2334    ///
2335    /// assert!(entity.valid_at(&query_2015));
2336    /// assert!(!entity.valid_at(&query_2005));
2337    /// ```
2338    #[must_use]
2339    pub fn valid_at(&self, timestamp: &chrono::DateTime<chrono::Utc>) -> bool {
2340        match (&self.valid_from, &self.valid_until) {
2341            (None, None) => true,                      // Atemporal - always valid
2342            (Some(from), None) => timestamp >= from,   // Started, still valid
2343            (None, Some(until)) => timestamp <= until, // Unknown start, ended
2344            (Some(from), Some(until)) => timestamp >= from && timestamp <= until,
2345        }
2346    }
2347
2348    /// Check if this entity is currently valid (at the current time).
2349    #[must_use]
2350    pub fn is_currently_valid(&self) -> bool {
2351        self.valid_at(&chrono::Utc::now())
2352    }
2353
2354    // =========================================================================
2355    // Viewport/Context Methods
2356    // =========================================================================
2357
2358    /// Set the viewport context for this entity.
2359    ///
2360    /// # Example
2361    /// ```rust
2362    /// use anno_core::{Entity, EntityType, EntityViewport};
2363    ///
2364    /// let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
2365    /// entity.set_viewport(EntityViewport::Academic);
2366    /// assert!(entity.has_viewport());
2367    /// ```
2368    pub fn set_viewport(&mut self, viewport: EntityViewport) {
2369        self.viewport = Some(viewport);
2370    }
2371
2372    /// Check if this entity has a viewport context.
2373    #[must_use]
2374    pub fn has_viewport(&self) -> bool {
2375        self.viewport.is_some()
2376    }
2377
2378    /// Get the viewport, defaulting to General if not set.
2379    #[must_use]
2380    pub fn viewport_or_default(&self) -> EntityViewport {
2381        self.viewport.clone().unwrap_or_default()
2382    }
2383
2384    /// Check if this entity matches a viewport context.
2385    ///
2386    /// Returns true if:
2387    /// - The entity has no viewport (matches any)
2388    /// - The entity's viewport matches the query
2389    #[must_use]
2390    pub fn matches_viewport(&self, query_viewport: &EntityViewport) -> bool {
2391        match &self.viewport {
2392            None => true, // No viewport = matches any
2393            Some(v) => v == query_viewport,
2394        }
2395    }
2396
2397    /// Create a builder for fluent entity construction.
2398    #[must_use]
2399    pub fn builder(text: impl Into<String>, entity_type: EntityType) -> EntityBuilder {
2400        EntityBuilder::new(text, entity_type)
2401    }
2402
2403    // =========================================================================
2404    // Validation Methods (Production Quality)
2405    // =========================================================================
2406
2407    /// Validate this entity against the source text.
2408    ///
2409    /// Returns a list of validation issues. Empty list means the entity is valid.
2410    ///
2411    /// # Checks Performed
2412    ///
2413    /// 1. **Span bounds**: `start < end`, both within text length
2414    /// 2. **Text match**: `text` matches the span in source
2415    /// 3. **Confidence range**: `confidence` in [0.0, 1.0]
2416    /// 4. **Type consistency**: Custom types have non-empty names
2417    /// 5. **Discontinuous consistency**: If present, segments are valid
2418    ///
2419    /// # Example
2420    ///
2421    /// ```rust
2422    /// use anno_core::{Entity, EntityType};
2423    ///
2424    /// let text = "John works at Apple";
2425    /// let entity = Entity::new("John", EntityType::Person, 0, 4, 0.95);
2426    ///
2427    /// let issues = entity.validate(text);
2428    /// assert!(issues.is_empty(), "Entity should be valid");
2429    ///
2430    /// // Invalid entity: span doesn't match text
2431    /// let bad = Entity::new("Jane", EntityType::Person, 0, 4, 0.95);
2432    /// let issues = bad.validate(text);
2433    /// assert!(!issues.is_empty(), "Entity text doesn't match span");
2434    /// ```
2435    #[must_use]
2436    pub fn validate(&self, source_text: &str) -> Vec<ValidationIssue> {
2437        // Performance: Calculate length once, delegate to optimized version
2438        let char_count = source_text.chars().count();
2439        self.validate_with_len(source_text, char_count)
2440    }
2441
2442    /// Validate entity with pre-computed text length (performance optimization).
2443    ///
2444    /// Use this when validating multiple entities from the same text to avoid
2445    /// recalculating `text.chars().count()` for each entity.
2446    ///
2447    /// # Arguments
2448    /// * `source_text` - The original text
2449    /// * `text_char_count` - Pre-computed character count (from `text.chars().count()`)
2450    ///
2451    /// # Returns
2452    /// Vector of validation issues (empty if valid)
2453    #[must_use]
2454    pub fn validate_with_len(
2455        &self,
2456        source_text: &str,
2457        text_char_count: usize,
2458    ) -> Vec<ValidationIssue> {
2459        let mut issues = Vec::new();
2460
2461        // 1. Span bounds
2462        if self.start >= self.end {
2463            issues.push(ValidationIssue::InvalidSpan {
2464                start: self.start,
2465                end: self.end,
2466                reason: "start must be less than end".to_string(),
2467            });
2468        }
2469
2470        if self.end > text_char_count {
2471            issues.push(ValidationIssue::SpanOutOfBounds {
2472                end: self.end,
2473                text_len: text_char_count,
2474            });
2475        }
2476
2477        // 2. Text match (only if span is valid)
2478        if self.start < self.end && self.end <= text_char_count {
2479            let actual = self.extract_text_with_len(source_text, text_char_count);
2480            if actual != self.text {
2481                issues.push(ValidationIssue::TextMismatch {
2482                    expected: self.text.clone(),
2483                    actual,
2484                    start: self.start,
2485                    end: self.end,
2486                });
2487            }
2488        }
2489
2490        // 3. Confidence range
2491        if !(0.0..=1.0).contains(&self.confidence) {
2492            issues.push(ValidationIssue::InvalidConfidence {
2493                value: self.confidence,
2494            });
2495        }
2496
2497        // 4. Type consistency
2498        if let EntityType::Custom { ref name, .. } = self.entity_type {
2499            if name.is_empty() {
2500                issues.push(ValidationIssue::InvalidType {
2501                    reason: "Custom entity type has empty name".to_string(),
2502                });
2503            }
2504        }
2505
2506        // 5. Discontinuous span consistency
2507        if let Some(ref disc_span) = self.discontinuous_span {
2508            for (i, seg) in disc_span.segments().iter().enumerate() {
2509                if seg.start >= seg.end {
2510                    issues.push(ValidationIssue::InvalidSpan {
2511                        start: seg.start,
2512                        end: seg.end,
2513                        reason: format!("discontinuous segment {} is invalid", i),
2514                    });
2515                }
2516                if seg.end > text_char_count {
2517                    issues.push(ValidationIssue::SpanOutOfBounds {
2518                        end: seg.end,
2519                        text_len: text_char_count,
2520                    });
2521                }
2522            }
2523        }
2524
2525        issues
2526    }
2527
2528    /// Check if this entity is valid against the source text.
2529    ///
2530    /// Convenience method that returns `true` if `validate()` returns empty.
2531    #[must_use]
2532    pub fn is_valid(&self, source_text: &str) -> bool {
2533        self.validate(source_text).is_empty()
2534    }
2535
2536    /// Validate a batch of entities efficiently.
2537    ///
2538    /// Returns a map of entity index -> validation issues.
2539    /// Only entities with issues are included.
2540    ///
2541    /// # Example
2542    ///
2543    /// ```rust
2544    /// use anno_core::{Entity, EntityType};
2545    ///
2546    /// let text = "John and Jane work at Apple";
2547    /// let entities = vec![
2548    ///     Entity::new("John", EntityType::Person, 0, 4, 0.95),
2549    ///     Entity::new("Wrong", EntityType::Person, 9, 13, 0.8),
2550    /// ];
2551    ///
2552    /// let issues = Entity::validate_batch(&entities, text);
2553    /// assert!(issues.is_empty() || issues.contains_key(&1)); // Second entity might fail
2554    /// ```
2555    #[must_use]
2556    pub fn validate_batch(
2557        entities: &[Entity],
2558        source_text: &str,
2559    ) -> std::collections::HashMap<usize, Vec<ValidationIssue>> {
2560        entities
2561            .iter()
2562            .enumerate()
2563            .filter_map(|(idx, entity)| {
2564                let issues = entity.validate(source_text);
2565                if issues.is_empty() {
2566                    None
2567                } else {
2568                    Some((idx, issues))
2569                }
2570            })
2571            .collect()
2572    }
2573}
2574
/// A single problem detected while validating an entity.
#[derive(Clone, Debug, PartialEq)]
pub enum ValidationIssue {
    /// The span is malformed (its start is not strictly before its end).
    InvalidSpan {
        /// Start offset of the offending span.
        start: usize,
        /// End offset of the offending span.
        end: usize,
        /// Human-readable explanation of what is wrong with the span.
        reason: String,
    },
    /// The span reaches past the end of the text.
    SpanOutOfBounds {
        /// End offset that lies beyond the text.
        end: usize,
        /// Length of the text the span was checked against.
        text_len: usize,
    },
    /// The text stored on the entity differs from the text found at its span.
    TextMismatch {
        /// Text carried by the entity.
        expected: String,
        /// Text actually present at the span in the source.
        actual: String,
        /// Start offset of the span.
        start: usize,
        /// End offset of the span.
        end: usize,
    },
    /// The confidence score is not within [0.0, 1.0].
    InvalidConfidence {
        /// The out-of-range confidence value.
        value: f64,
    },
    /// The entity type is not acceptable.
    InvalidType {
        /// Human-readable explanation of what is wrong with the type.
        reason: String,
    },
}
2616
2617impl std::fmt::Display for ValidationIssue {
2618    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2619        match self {
2620            ValidationIssue::InvalidSpan { start, end, reason } => {
2621                write!(f, "Invalid span [{}, {}): {}", start, end, reason)
2622            }
2623            ValidationIssue::SpanOutOfBounds { end, text_len } => {
2624                write!(f, "Span end {} exceeds text length {}", end, text_len)
2625            }
2626            ValidationIssue::TextMismatch {
2627                expected,
2628                actual,
2629                start,
2630                end,
2631            } => {
2632                write!(
2633                    f,
2634                    "Text mismatch at [{}, {}): expected '{}', got '{}'",
2635                    start, end, expected, actual
2636                )
2637            }
2638            ValidationIssue::InvalidConfidence { value } => {
2639                write!(f, "Confidence {} outside [0.0, 1.0]", value)
2640            }
2641            ValidationIssue::InvalidType { reason } => {
2642                write!(f, "Invalid entity type: {}", reason)
2643            }
2644        }
2645    }
2646}
2647
2648/// Fluent builder for constructing entities with optional fields.
2649///
2650/// # Example
2651///
2652/// ```rust
2653/// use anno_core::{Entity, EntityType, Provenance};
2654///
2655/// let entity = Entity::builder("Marie Curie", EntityType::Person)
2656///     .span(0, 11)
2657///     .confidence(0.95)
2658///     .kb_id("Q7186")
2659///     .provenance(Provenance::ml("bert", 0.95))
2660///     .build();
2661/// ```
2662#[derive(Debug, Clone)]
2663pub struct EntityBuilder {
2664    text: String,
2665    entity_type: EntityType,
2666    start: usize,
2667    end: usize,
2668    confidence: f64,
2669    normalized: Option<String>,
2670    provenance: Option<Provenance>,
2671    kb_id: Option<String>,
2672    canonical_id: Option<super::types::CanonicalId>,
2673    hierarchical_confidence: Option<HierarchicalConfidence>,
2674    visual_span: Option<Span>,
2675    discontinuous_span: Option<DiscontinuousSpan>,
2676    valid_from: Option<chrono::DateTime<chrono::Utc>>,
2677    valid_until: Option<chrono::DateTime<chrono::Utc>>,
2678    viewport: Option<EntityViewport>,
2679}
2680
2681impl EntityBuilder {
2682    /// Create a new builder.
2683    #[must_use]
2684    pub fn new(text: impl Into<String>, entity_type: EntityType) -> Self {
2685        Self {
2686            text: text.into(),
2687            entity_type,
2688            start: 0,
2689            end: 0,
2690            confidence: 1.0,
2691            normalized: None,
2692            provenance: None,
2693            kb_id: None,
2694            canonical_id: None,
2695            hierarchical_confidence: None,
2696            visual_span: None,
2697            discontinuous_span: None,
2698            valid_from: None,
2699            valid_until: None,
2700            viewport: None,
2701        }
2702    }
2703
2704    /// Set span offsets.
2705    #[must_use]
2706    pub const fn span(mut self, start: usize, end: usize) -> Self {
2707        self.start = start;
2708        self.end = end;
2709        self
2710    }
2711
2712    /// Set confidence score.
2713    #[must_use]
2714    pub fn confidence(mut self, confidence: f64) -> Self {
2715        self.confidence = confidence.clamp(0.0, 1.0);
2716        self
2717    }
2718
2719    /// Set hierarchical confidence.
2720    #[must_use]
2721    pub fn hierarchical_confidence(mut self, confidence: HierarchicalConfidence) -> Self {
2722        self.confidence = confidence.as_f64();
2723        self.hierarchical_confidence = Some(confidence);
2724        self
2725    }
2726
2727    /// Set normalized form.
2728    #[must_use]
2729    pub fn normalized(mut self, normalized: impl Into<String>) -> Self {
2730        self.normalized = Some(normalized.into());
2731        self
2732    }
2733
2734    /// Set provenance.
2735    #[must_use]
2736    pub fn provenance(mut self, provenance: Provenance) -> Self {
2737        self.provenance = Some(provenance);
2738        self
2739    }
2740
2741    /// Set knowledge base ID.
2742    #[must_use]
2743    pub fn kb_id(mut self, kb_id: impl Into<String>) -> Self {
2744        self.kb_id = Some(kb_id.into());
2745        self
2746    }
2747
2748    /// Set canonical (coreference) ID.
2749    #[must_use]
2750    pub const fn canonical_id(mut self, canonical_id: u64) -> Self {
2751        self.canonical_id = Some(super::types::CanonicalId::new(canonical_id));
2752        self
2753    }
2754
2755    /// Set visual span.
2756    #[must_use]
2757    pub fn visual_span(mut self, span: Span) -> Self {
2758        self.visual_span = Some(span);
2759        self
2760    }
2761
2762    /// Set discontinuous span for non-contiguous entities.
2763    ///
2764    /// This automatically updates `start` and `end` to the bounding range.
2765    #[must_use]
2766    pub fn discontinuous_span(mut self, span: DiscontinuousSpan) -> Self {
2767        // Update start/end to bounding range
2768        if let Some(bounding) = span.bounding_range() {
2769            self.start = bounding.start;
2770            self.end = bounding.end;
2771        }
2772        self.discontinuous_span = Some(span);
2773        self
2774    }
2775
2776    /// Set temporal validity start (when this entity assertion became true).
2777    ///
2778    /// # Example
2779    /// ```rust
2780    /// use anno_core::{EntityBuilder, EntityType};
2781    /// use chrono::{TimeZone, Utc};
2782    ///
2783    /// let entity = EntityBuilder::new("CEO of Microsoft", EntityType::Person)
2784    ///     .span(0, 12)
2785    ///     .valid_from(Utc.with_ymd_and_hms(2008, 10, 1, 0, 0, 0).unwrap())
2786    ///     .build();
2787    /// assert!(entity.valid_from.is_some());
2788    /// ```
2789    #[must_use]
2790    pub fn valid_from(mut self, dt: chrono::DateTime<chrono::Utc>) -> Self {
2791        self.valid_from = Some(dt);
2792        self
2793    }
2794
2795    /// Set temporal validity end (when this entity assertion stopped being true).
2796    #[must_use]
2797    pub fn valid_until(mut self, dt: chrono::DateTime<chrono::Utc>) -> Self {
2798        self.valid_until = Some(dt);
2799        self
2800    }
2801
2802    /// Set temporal validity range (convenience method).
2803    #[must_use]
2804    pub fn temporal_range(
2805        mut self,
2806        from: chrono::DateTime<chrono::Utc>,
2807        until: chrono::DateTime<chrono::Utc>,
2808    ) -> Self {
2809        self.valid_from = Some(from);
2810        self.valid_until = Some(until);
2811        self
2812    }
2813
2814    /// Set the viewport context for multi-faceted entity representation.
2815    ///
2816    /// # Example
2817    /// ```rust
2818    /// use anno_core::{EntityBuilder, EntityType, EntityViewport};
2819    ///
2820    /// let entity = EntityBuilder::new("Marie Curie", EntityType::Person)
2821    ///     .span(0, 11)
2822    ///     .viewport(EntityViewport::Academic)
2823    ///     .build();
2824    /// assert_eq!(entity.viewport, Some(EntityViewport::Academic));
2825    /// ```
2826    #[must_use]
2827    pub fn viewport(mut self, viewport: EntityViewport) -> Self {
2828        self.viewport = Some(viewport);
2829        self
2830    }
2831
2832    /// Build the entity.
2833    #[must_use]
2834    pub fn build(self) -> Entity {
2835        Entity {
2836            text: self.text,
2837            entity_type: self.entity_type,
2838            start: self.start,
2839            end: self.end,
2840            confidence: self.confidence,
2841            normalized: self.normalized,
2842            provenance: self.provenance,
2843            kb_id: self.kb_id,
2844            canonical_id: self.canonical_id,
2845            hierarchical_confidence: self.hierarchical_confidence,
2846            visual_span: self.visual_span,
2847            discontinuous_span: self.discontinuous_span,
2848            valid_from: self.valid_from,
2849            valid_until: self.valid_until,
2850            viewport: self.viewport,
2851        }
2852    }
2853}
2854
2855// ============================================================================
2856// Relation (for Knowledge Graph Construction)
2857// ============================================================================
2858
2859/// A relation between two entities, forming a knowledge graph triple.
2860///
2861/// In the GLiNER bi-encoder paradigm, relations are detected just like entities:
2862/// the relation trigger text ("CEO of", "located in") is matched against
2863/// relation type labels in the same latent space.
2864///
2865/// # Structure
2866///
2867/// ```text
2868/// Triple: (Head, Relation, Tail)
2869///
2870/// "Marie Curie worked at the Sorbonne"
2871///  ^^^^^^^^^^^ ~~~~~~~~~ ^^^^^^^^
2872///  Head        Rel       Tail
2873///  (Person)  (Employment)  (Organization)
2874/// ```
2875///
2876/// # TPLinker/Joint Extraction
2877///
2878/// For joint extraction, relations are extracted in a single pass with entities.
2879/// The `trigger_span` captures the text that indicates the relation.
2880#[derive(Debug, Clone, Serialize, Deserialize)]
2881pub struct Relation {
2882    /// The source entity (head of the triple)
2883    pub head: Entity,
2884    /// The target entity (tail of the triple)
2885    pub tail: Entity,
2886    /// Relation type label (e.g., "EMPLOYMENT", "LOCATED_IN", "FOUNDED_BY")
2887    pub relation_type: String,
2888    /// Optional trigger span: the text that indicates this relation
2889    /// For "CEO of", this would be the span covering "CEO of"
2890    pub trigger_span: Option<(usize, usize)>,
2891    /// Confidence score for this relation (0.0-1.0)
2892    pub confidence: f64,
2893}
2894
2895impl Relation {
2896    /// Create a new relation between two entities.
2897    #[must_use]
2898    pub fn new(
2899        head: Entity,
2900        tail: Entity,
2901        relation_type: impl Into<String>,
2902        confidence: f64,
2903    ) -> Self {
2904        Self {
2905            head,
2906            tail,
2907            relation_type: relation_type.into(),
2908            trigger_span: None,
2909            confidence: confidence.clamp(0.0, 1.0),
2910        }
2911    }
2912
2913    /// Create a relation with an explicit trigger span.
2914    #[must_use]
2915    pub fn with_trigger(
2916        head: Entity,
2917        tail: Entity,
2918        relation_type: impl Into<String>,
2919        trigger_start: usize,
2920        trigger_end: usize,
2921        confidence: f64,
2922    ) -> Self {
2923        Self {
2924            head,
2925            tail,
2926            relation_type: relation_type.into(),
2927            trigger_span: Some((trigger_start, trigger_end)),
2928            confidence: confidence.clamp(0.0, 1.0),
2929        }
2930    }
2931
2932    /// Convert to a triple string representation (for debugging/display).
2933    #[must_use]
2934    pub fn as_triple(&self) -> String {
2935        format!(
2936            "({}, {}, {})",
2937            self.head.text, self.relation_type, self.tail.text
2938        )
2939    }
2940
2941    /// Check if the head and tail entities are adjacent (within n tokens).
2942    /// Useful for filtering spurious long-distance relations.
2943    #[must_use]
2944    pub fn span_distance(&self) -> usize {
2945        if self.head.end <= self.tail.start {
2946            self.tail.start.saturating_sub(self.head.end)
2947        } else if self.tail.end <= self.head.start {
2948            self.head.start.saturating_sub(self.tail.end)
2949        } else {
2950            0 // Overlapping spans
2951        }
2952    }
2953}
2954
2955#[cfg(test)]
2956mod tests {
2957    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in test code
2958    use super::*;
2959
#[test]
fn test_entity_type_roundtrip() {
    // Converting a type to its label and back must give the same type.
    let builtin_types = [
        EntityType::Person,
        EntityType::Organization,
        EntityType::Location,
        EntityType::Date,
        EntityType::Money,
        EntityType::Percent,
    ];

    for original in builtin_types {
        let reparsed = EntityType::from_label(original.as_label());
        assert_eq!(original, reparsed);
    }
}
2977
#[test]
fn test_entity_overlap() {
    let first_name = Entity::new("John", EntityType::Person, 0, 4, 0.9);
    let last_name = Entity::new("Smith", EntityType::Person, 5, 10, 0.9);
    let full_name = Entity::new("John Smith", EntityType::Person, 0, 10, 0.9);

    // Disjoint spans do not overlap.
    assert!(!first_name.overlaps(&last_name));
    // Containment counts as overlap, whichever side is the container.
    assert!(first_name.overlaps(&full_name));
    assert!(full_name.overlaps(&last_name));
}
2988
#[test]
fn test_confidence_clamping() {
    // Values above 1.0 are pulled down to 1.0.
    let too_high = Entity::new("test", EntityType::Person, 0, 4, 1.5);
    assert!((too_high.confidence - 1.0).abs() < f64::EPSILON);

    // Values below 0.0 are pulled up to 0.0.
    let too_low = Entity::new("test", EntityType::Person, 0, 4, -0.5);
    assert!(too_low.confidence.abs() < f64::EPSILON);
}
2997
#[test]
fn test_entity_categories() {
    // Agent/Org/Place entities require ML
    assert_eq!(EntityType::Person.category(), EntityCategory::Agent);
    assert_eq!(
        EntityType::Organization.category(),
        EntityCategory::Organization
    );
    assert_eq!(EntityType::Location.category(), EntityCategory::Place);
    assert!(EntityType::Person.requires_ml());
    assert!(!EntityType::Person.pattern_detectable());

    // Temporal, numeric and contact types map onto their family's category.
    let expected_categories = [
        (EntityType::Date, EntityCategory::Temporal),
        (EntityType::Time, EntityCategory::Temporal),
        (EntityType::Money, EntityCategory::Numeric),
        (EntityType::Percent, EntityCategory::Numeric),
        (EntityType::Email, EntityCategory::Contact),
        (EntityType::Url, EntityCategory::Contact),
        (EntityType::Phone, EntityCategory::Contact),
    ];
    for (entity_type, category) in expected_categories {
        assert_eq!(entity_type.category(), category);
    }

    // One representative per family is pattern-detectable.
    assert!(EntityType::Date.pattern_detectable());
    assert!(!EntityType::Date.requires_ml());
    assert!(EntityType::Money.pattern_detectable());
    assert!(EntityType::Email.pattern_detectable());
}
3027
#[test]
fn test_new_types_roundtrip() {
    // Same label round trip as the core types, for the pattern-based types.
    let pattern_types = [
        EntityType::Time,
        EntityType::Email,
        EntityType::Url,
        EntityType::Phone,
        EntityType::Quantity,
        EntityType::Cardinal,
        EntityType::Ordinal,
    ];

    for original in pattern_types {
        let label = original.as_label();
        let reparsed = EntityType::from_label(label);
        assert_eq!(original, reparsed, "Roundtrip failed for {}", label);
    }
}
3046
#[test]
fn test_custom_entity_type() {
    // A custom type in the Agent category requires ML.
    let agent_type = EntityType::custom("DISEASE", EntityCategory::Agent);
    assert_eq!(agent_type.as_label(), "DISEASE");
    assert!(agent_type.requires_ml());

    // A custom type in the Misc category needs neither ML nor patterns.
    let misc_type = EntityType::custom("PRODUCT_ID", EntityCategory::Misc);
    assert_eq!(misc_type.as_label(), "PRODUCT_ID");
    assert!(!misc_type.requires_ml());
    assert!(!misc_type.pattern_detectable());
}
3058
#[test]
fn test_entity_normalization() {
    let mut date = Entity::new("Jan 15", EntityType::Date, 0, 6, 0.95);

    // Without a normalized form, the raw text is the fallback.
    assert!(date.normalized.is_none());
    assert_eq!(date.normalized_or_text(), "Jan 15");

    // Once set, the normalized form takes precedence.
    date.set_normalized("2024-01-15");
    assert_eq!(date.normalized.as_deref(), Some("2024-01-15"));
    assert_eq!(date.normalized_or_text(), "2024-01-15");
}
3069
#[test]
fn test_entity_helpers() {
    // A person is a named entity, not a structured one.
    let person = Entity::new("John", EntityType::Person, 0, 4, 0.9);
    assert!(person.is_named());
    assert!(!person.is_structured());
    assert_eq!(person.category(), EntityCategory::Agent);

    // A money amount is structured, not named.
    let amount = Entity::new("$100", EntityType::Money, 0, 4, 0.95);
    assert!(!amount.is_named());
    assert!(amount.is_structured());
    assert_eq!(amount.category(), EntityCategory::Numeric);
}
3082
#[test]
fn test_knowledge_linking() {
    let mut curie = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);

    // A fresh entity has neither a KB link nor a coreference ID.
    assert!(!curie.is_linked());
    assert!(!curie.has_coreference());

    // Linking stores the KB identifier (a Wikidata ID here).
    curie.link_to_kb("Q7186");
    assert!(curie.is_linked());
    assert_eq!(curie.kb_id.as_deref(), Some("Q7186"));

    // Setting a canonical ID marks the entity as having a coreference.
    curie.set_canonical(42);
    assert!(curie.has_coreference());
    assert_eq!(
        curie.canonical_id,
        Some(crate::core::types::CanonicalId::new(42))
    );
}
3100
#[test]
fn test_relation_creation() {
    let person = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
    let employer = Entity::new("Sorbonne", EntityType::Organization, 24, 32, 0.90);

    // Plain constructor: no trigger span is recorded.
    let plain = Relation::new(person.clone(), employer.clone(), "WORKED_AT", 0.85);
    assert_eq!(plain.relation_type, "WORKED_AT");
    assert_eq!(plain.as_triple(), "(Marie Curie, WORKED_AT, Sorbonne)");
    assert!(plain.trigger_span.is_none());

    // Trigger-aware constructor: the span is stored as given.
    let triggered = Relation::with_trigger(person, employer, "EMPLOYMENT", 13, 19, 0.85);
    assert_eq!(triggered.trigger_span, Some((13, 19)));
}
3115
#[test]
fn test_relation_span_distance() {
    // Head covers 0..11 and tail covers 24..32, so the gap is 24 - 11 = 13.
    let relation = Relation::new(
        Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95),
        Entity::new("Sorbonne", EntityType::Organization, 24, 32, 0.90),
        "WORKED_AT",
        0.85,
    );
    assert_eq!(relation.span_distance(), 13);
}
3124
#[test]
fn test_relation_category() {
    // A custom type created in the Relation category reports that category.
    let ceo_of = EntityType::custom("CEO_OF", EntityCategory::Relation);
    let category = ceo_of.category();
    assert_eq!(category, EntityCategory::Relation);
    assert!(ceo_of.category().is_relation());
    // Relations require ML
    assert!(ceo_of.requires_ml());
}
3133
3134    // ========================================================================
3135    // Span Tests
3136    // ========================================================================
3137
#[test]
fn test_span_text() {
    // A text span exposes its offsets and a length of end - start.
    let text_span = Span::text(10, 20);
    assert!(text_span.is_text());
    assert!(!text_span.is_visual());
    assert_eq!(text_span.text_offsets(), Some((10, 20)));
    assert_eq!(text_span.len(), 10);
    assert!(!text_span.is_empty());
}
3147
#[test]
fn test_span_bbox() {
    // A bounding box is visual only: no text offsets and zero text length.
    let visual_span = Span::bbox(0.1, 0.2, 0.3, 0.4);
    assert!(!visual_span.is_text());
    assert!(visual_span.is_visual());
    assert_eq!(visual_span.text_offsets(), None);
    assert_eq!(visual_span.len(), 0);
}
3156
#[test]
fn test_span_bbox_with_page() {
    // The page number passed to the constructor must be stored on the box.
    match Span::bbox_on_page(0.1, 0.2, 0.3, 0.4, 5) {
        Span::BoundingBox { page, .. } => assert_eq!(page, Some(5)),
        _ => panic!("Expected BoundingBox"),
    }
}
3166
#[test]
fn test_span_hybrid() {
    // A hybrid span carries text offsets and a bounding box at once.
    let hybrid_span = Span::Hybrid {
        start: 10,
        end: 20,
        bbox: Box::new(Span::bbox(0.1, 0.2, 0.3, 0.4)),
    };

    assert!(hybrid_span.is_text());
    assert!(hybrid_span.is_visual());
    assert_eq!(hybrid_span.text_offsets(), Some((10, 20)));
    assert_eq!(hybrid_span.len(), 10);
}
3180
3181    // ========================================================================
3182    // Hierarchical Confidence Tests
3183    // ========================================================================
3184
#[test]
fn test_hierarchical_confidence_new() {
    // In-range inputs are stored unchanged.
    let approx_eq = |actual: f32, expected: f32| (actual - expected).abs() < f32::EPSILON;

    let scores = HierarchicalConfidence::new(0.9, 0.8, 0.7);
    assert!(approx_eq(scores.linkage, 0.9));
    assert!(approx_eq(scores.type_score, 0.8));
    assert!(approx_eq(scores.boundary, 0.7));
}
3192
#[test]
fn test_hierarchical_confidence_clamping() {
    // Out-of-range inputs are clamped; in-range inputs pass through.
    let approx_eq = |actual: f32, expected: f32| (actual - expected).abs() < f32::EPSILON;

    let scores = HierarchicalConfidence::new(1.5, -0.5, 0.5);
    assert!(approx_eq(scores.linkage, 1.0));
    assert!(scores.type_score.abs() < f32::EPSILON);
    assert!(approx_eq(scores.boundary, 0.5));
}
3200
#[test]
fn test_hierarchical_confidence_from_single() {
    // A single score is copied into all three components.
    let scores = HierarchicalConfidence::from_single(0.8);
    for component in [scores.linkage, scores.type_score, scores.boundary] {
        assert!((component - 0.8).abs() < f32::EPSILON);
    }
}
3208
#[test]
fn test_hierarchical_confidence_combined() {
    // When all three components are equal, the combined score equals them.
    let all_ones = HierarchicalConfidence::new(1.0, 1.0, 1.0);
    assert!((all_ones.combined() - 1.0).abs() < f32::EPSILON);

    let all_point_eight = HierarchicalConfidence::new(0.8, 0.8, 0.8);
    assert!((all_point_eight.combined() - 0.8).abs() < f32::EPSILON);

    // Geometric mean: (0.5 * 0.5 * 0.5)^(1/3) = 0.5
    let all_halves = HierarchicalConfidence::new(0.5, 0.5, 0.5);
    assert!((all_halves.combined() - 0.5).abs() < 0.001);
}
3221
#[test]
fn test_hierarchical_confidence_threshold() {
    let scores = HierarchicalConfidence::new(0.9, 0.8, 0.7);

    // Passes when every threshold is below, or exactly at, its component.
    assert!(scores.passes_threshold(0.5, 0.5, 0.5));
    assert!(scores.passes_threshold(0.9, 0.8, 0.7));

    // Fails as soon as one threshold exceeds its component.
    assert!(!scores.passes_threshold(0.95, 0.8, 0.7)); // linkage threshold too high
    assert!(!scores.passes_threshold(0.9, 0.85, 0.7)); // type threshold too high
}
3230
#[test]
fn test_hierarchical_confidence_from_f64() {
    // Converting from an `f64` carries the value into the linkage score.
    let converted: HierarchicalConfidence = 0.85_f64.into();
    assert!((converted.linkage - 0.85).abs() < 0.001);
}
3236
3237    // ========================================================================
3238    // RaggedBatch Tests
3239    // ========================================================================
3240
#[test]
fn test_ragged_batch_from_sequences() {
    // Three documents of 3, 2 and 4 tokens.
    let documents = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
    let ragged = RaggedBatch::from_sequences(&documents);

    assert_eq!(ragged.batch_size(), 3);
    assert_eq!(ragged.total_tokens(), 9);
    assert_eq!(ragged.max_seq_len, 4);
    // Offsets are running totals of the document lengths, starting at 0.
    assert_eq!(ragged.cumulative_offsets, vec![0, 3, 5, 9]);
}
3251
#[test]
fn test_ragged_batch_doc_range() {
    let documents = vec![vec![1, 2, 3], vec![4, 5]];
    let ragged = RaggedBatch::from_sequences(&documents);

    // Existing documents map to their token range in the flattened batch.
    assert_eq!(ragged.doc_range(0), Some(0..3));
    assert_eq!(ragged.doc_range(1), Some(3..5));
    // An index past the last document yields nothing.
    assert_eq!(ragged.doc_range(2), None);
}
3261
#[test]
fn test_ragged_batch_doc_tokens() {
    // Each document's tokens can be read back out of the batch.
    let documents = vec![vec![1, 2, 3], vec![4, 5]];
    let ragged = RaggedBatch::from_sequences(&documents);

    assert_eq!(ragged.doc_tokens(0), Some(&[1, 2, 3][..]));
    assert_eq!(ragged.doc_tokens(1), Some(&[4, 5][..]));
}
3270
#[test]
fn test_ragged_batch_padding_savings() {
    // Documents of 3, 2 and 4 tokens: padding to the longest needs
    // 3 * 4 = 12 slots while only 9 are used, so savings = 1 - 9/12 = 0.25.
    let documents = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
    let ragged = RaggedBatch::from_sequences(&documents);

    assert!((ragged.padding_savings() - 0.25).abs() < 0.001);
}
3281
3282    // ========================================================================
3283    // SpanCandidate Tests
3284    // ========================================================================
3285
#[test]
fn test_span_candidate() {
    // Constructor arguments are (doc_idx, start, end); width is end - start.
    let candidate = SpanCandidate::new(0, 5, 10);
    assert_eq!(candidate.doc_idx, 0);
    assert_eq!(candidate.start, 5);
    assert_eq!(candidate.end, 10);
    assert_eq!(candidate.width(), 5);
}
3294
#[test]
fn test_generate_span_candidates() {
    // One document with 3 tokens.
    let documents = vec![vec![1, 2, 3]];
    let ragged = RaggedBatch::from_sequences(&documents);
    let spans = generate_span_candidates(&ragged, 2);

    // With max_width=2 the expected spans are
    // (0,1), (0,2), (1,2), (1,3), (2,3).
    assert_eq!(spans.len(), 5);

    // Every candidate belongs to document 0, stays within the 3 tokens and
    // respects the width limit.
    for candidate in spans.iter() {
        assert_eq!(candidate.doc_idx, 0);
        assert!(candidate.end as usize <= 3);
        assert!(candidate.width() as usize <= 2);
    }
}
3312
#[test]
fn test_generate_filtered_candidates() {
    let documents = vec![vec![1, 2, 3]];
    let ragged = RaggedBatch::from_sequences(&documents);

    // max_width=2 yields 5 candidates; only the first two mask scores
    // clear the 0.5 threshold.
    let scores = vec![0.9, 0.9, 0.1, 0.1, 0.1];
    let kept = generate_filtered_candidates(&ragged, 2, &scores, 0.5);

    assert_eq!(kept.len(), 2);
}
3325
3326    // ========================================================================
3327    // EntityBuilder Tests
3328    // ========================================================================
3329
#[test]
fn test_entity_builder_basic() {
    // Text, type, span and confidence all arrive on the built entity.
    let built = Entity::builder("John", EntityType::Person)
        .span(0, 4)
        .confidence(0.95)
        .build();

    assert_eq!(built.text, "John");
    assert_eq!(built.entity_type, EntityType::Person);
    assert_eq!(built.start, 0);
    assert_eq!(built.end, 4);
    assert!((built.confidence - 0.95).abs() < f64::EPSILON);
}
3343
3344    #[test]
3345    fn test_entity_builder_full() {
3346        let entity = Entity::builder("Marie Curie", EntityType::Person)
3347            .span(0, 11)
3348            .confidence(0.95)
3349            .kb_id("Q7186")
3350            .canonical_id(42)
3351            .normalized("Marie Salomea Skłodowska Curie")
3352            .provenance(Provenance::ml("bert", 0.95))
3353            .build();
3354
3355        assert_eq!(entity.text, "Marie Curie");
3356        assert_eq!(entity.kb_id.as_deref(), Some("Q7186"));
3357        assert_eq!(
3358            entity.canonical_id,
3359            Some(crate::core::types::CanonicalId::new(42))
3360        );
3361        assert_eq!(
3362            entity.normalized.as_deref(),
3363            Some("Marie Salomea Skłodowska Curie")
3364        );
3365        assert!(entity.provenance.is_some());
3366    }
3367
3368    #[test]
3369    fn test_entity_builder_hierarchical() {
3370        let hc = HierarchicalConfidence::new(0.9, 0.8, 0.7);
3371        let entity = Entity::builder("test", EntityType::Person)
3372            .span(0, 4)
3373            .hierarchical_confidence(hc)
3374            .build();
3375
3376        assert!(entity.hierarchical_confidence.is_some());
3377        assert!((entity.linkage_confidence() - 0.9).abs() < 0.001);
3378        assert!((entity.type_confidence() - 0.8).abs() < 0.001);
3379        assert!((entity.boundary_confidence() - 0.7).abs() < 0.001);
3380    }
3381
3382    #[test]
3383    fn test_entity_builder_visual() {
3384        let bbox = Span::bbox(0.1, 0.2, 0.3, 0.4);
3385        let entity = Entity::builder("receipt item", EntityType::Money)
3386            .visual_span(bbox)
3387            .confidence(0.9)
3388            .build();
3389
3390        assert!(entity.is_visual());
3391        assert!(entity.visual_span.is_some());
3392    }
3393
3394    // ========================================================================
3395    // Entity Helper Method Tests
3396    // ========================================================================
3397
3398    #[test]
3399    fn test_entity_hierarchical_confidence_helpers() {
3400        let mut entity = Entity::new("test", EntityType::Person, 0, 4, 0.8);
3401
3402        // Without hierarchical confidence, falls back to main confidence
3403        assert!((entity.linkage_confidence() - 0.8).abs() < 0.001);
3404        assert!((entity.type_confidence() - 0.8).abs() < 0.001);
3405        assert!((entity.boundary_confidence() - 0.8).abs() < 0.001);
3406
3407        // Set hierarchical confidence
3408        entity.set_hierarchical_confidence(HierarchicalConfidence::new(0.95, 0.85, 0.75));
3409        assert!((entity.linkage_confidence() - 0.95).abs() < 0.001);
3410        assert!((entity.type_confidence() - 0.85).abs() < 0.001);
3411        assert!((entity.boundary_confidence() - 0.75).abs() < 0.001);
3412    }
3413
3414    #[test]
3415    fn test_entity_from_visual() {
3416        let entity = Entity::from_visual(
3417            "receipt total",
3418            EntityType::Money,
3419            Span::bbox(0.5, 0.8, 0.2, 0.05),
3420            0.92,
3421        );
3422
3423        assert!(entity.is_visual());
3424        assert_eq!(entity.start, 0);
3425        assert_eq!(entity.end, 0);
3426        assert!((entity.confidence - 0.92).abs() < f64::EPSILON);
3427    }
3428
3429    #[test]
3430    fn test_entity_span_helpers() {
3431        let entity = Entity::new("test", EntityType::Person, 10, 20, 0.9);
3432        assert_eq!(entity.text_span(), (10, 20));
3433        assert_eq!(entity.span_len(), 10);
3434    }
3435
3436    // ========================================================================
3437    // Provenance Tests
3438    // ========================================================================
3439
3440    #[test]
3441    fn test_provenance_pattern() {
3442        let prov = Provenance::pattern("EMAIL");
3443        assert_eq!(prov.method, ExtractionMethod::Pattern);
3444        assert_eq!(prov.pattern.as_deref(), Some("EMAIL"));
3445        assert_eq!(prov.raw_confidence, Some(1.0)); // Patterns are deterministic
3446    }
3447
3448    #[test]
3449    fn test_provenance_ml() {
3450        let prov = Provenance::ml("bert-ner", 0.87);
3451        assert_eq!(prov.method, ExtractionMethod::Neural);
3452        assert_eq!(prov.source.as_ref(), "bert-ner");
3453        assert_eq!(prov.raw_confidence, Some(0.87));
3454    }
3455
3456    #[test]
3457    fn test_provenance_with_version() {
3458        let prov = Provenance::ml("gliner", 0.92).with_version("v2.1.0");
3459
3460        assert_eq!(prov.model_version.as_deref(), Some("v2.1.0"));
3461        assert_eq!(prov.source.as_ref(), "gliner");
3462    }
3463
3464    #[test]
3465    fn test_provenance_with_timestamp() {
3466        let prov = Provenance::pattern("DATE").with_timestamp("2024-01-15T10:30:00Z");
3467
3468        assert_eq!(prov.timestamp.as_deref(), Some("2024-01-15T10:30:00Z"));
3469    }
3470
3471    #[test]
3472    fn test_provenance_builder_chain() {
3473        let prov = Provenance::ml("modernbert-ner", 0.95)
3474            .with_version("v1.0.0")
3475            .with_timestamp("2024-11-27T12:00:00Z");
3476
3477        assert_eq!(prov.method, ExtractionMethod::Neural);
3478        assert_eq!(prov.source.as_ref(), "modernbert-ner");
3479        assert_eq!(prov.raw_confidence, Some(0.95));
3480        assert_eq!(prov.model_version.as_deref(), Some("v1.0.0"));
3481        assert_eq!(prov.timestamp.as_deref(), Some("2024-11-27T12:00:00Z"));
3482    }
3483
3484    #[test]
3485    fn test_provenance_serialization() {
3486        let prov = Provenance::ml("test", 0.9)
3487            .with_version("v1.0")
3488            .with_timestamp("2024-01-01");
3489
3490        let json = serde_json::to_string(&prov).unwrap();
3491        assert!(json.contains("model_version"));
3492        assert!(json.contains("v1.0"));
3493
3494        let restored: Provenance = serde_json::from_str(&json).unwrap();
3495        assert_eq!(restored.model_version.as_deref(), Some("v1.0"));
3496        assert_eq!(restored.timestamp.as_deref(), Some("2024-01-01"));
3497    }
3498}
3499
#[cfg(test)]
mod proptests {
    //! Property-based tests for the core entity types, followed by plain unit
    //! tests for `EntityViewport` and `EntityCategory`.
    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in property tests
    use super::*;
    use proptest::prelude::*;

    proptest! {
        /// Any confidence in `-10.0..10.0` handed to `Entity::new` ends up within `[0, 1]`.
        #[test]
        fn confidence_always_clamped(conf in -10.0f64..10.0) {
            let entity = Entity::new("test", EntityType::Person, 0, 4, conf);
            prop_assert!(entity.confidence >= 0.0);
            prop_assert!(entity.confidence <= 1.0);
        }

        /// Re-parsing a parsed type's label yields the same type or an `Other` variant.
        #[test]
        fn entity_type_roundtrip(label in "[A-Z]{3,10}") {
            let parsed = EntityType::from_label(&label);
            let reparsed = EntityType::from_label(parsed.as_label());
            // Other types may round-trip to themselves or normalize
            prop_assert!(matches!(reparsed, EntityType::Other(_)) || reparsed == parsed);
        }

        /// `overlaps` gives the same answer regardless of argument order.
        #[test]
        fn overlap_is_symmetric(
            s1 in 0usize..100,
            len1 in 1usize..50,
            s2 in 0usize..100,
            len2 in 1usize..50,
        ) {
            let first = Entity::new("a", EntityType::Person, s1, s1 + len1, 1.0);
            let second = Entity::new("b", EntityType::Person, s2, s2 + len2, 1.0);
            prop_assert_eq!(first.overlaps(&second), second.overlaps(&first));
        }

        /// `overlap_ratio` of two non-empty spans always lies in `[0, 1]`.
        #[test]
        fn overlap_ratio_bounded(
            s1 in 0usize..100,
            len1 in 1usize..50,
            s2 in 0usize..100,
            len2 in 1usize..50,
        ) {
            let first = Entity::new("a", EntityType::Person, s1, s1 + len1, 1.0);
            let second = Entity::new("b", EntityType::Person, s2, s2 + len2, 1.0);
            let ratio = first.overlap_ratio(&second);
            prop_assert!(ratio >= 0.0);
            prop_assert!(ratio <= 1.0);
        }

        /// A non-empty entity overlaps itself with ratio 1.
        #[test]
        fn self_overlap_ratio_is_one(s in 0usize..100, len in 1usize..50) {
            let entity = Entity::new("test", EntityType::Person, s, s + len, 1.0);
            let ratio = entity.overlap_ratio(&entity);
            prop_assert!((ratio - 1.0).abs() < 1e-10);
        }

        /// Each component of `HierarchicalConfidence`, and their combination, stays in `[0, 1]`.
        #[test]
        fn hierarchical_confidence_always_clamped(
            linkage in -2.0f32..2.0,
            type_score in -2.0f32..2.0,
            boundary in -2.0f32..2.0,
        ) {
            let hc = HierarchicalConfidence::new(linkage, type_score, boundary);
            prop_assert!(hc.linkage >= 0.0 && hc.linkage <= 1.0);
            prop_assert!(hc.type_score >= 0.0 && hc.type_score <= 1.0);
            prop_assert!(hc.boundary >= 0.0 && hc.boundary <= 1.0);
            prop_assert!(hc.combined() >= 0.0 && hc.combined() <= 1.0);
        }

        /// `SpanCandidate::width` equals `end - start` whenever `end >= start`.
        #[test]
        fn span_candidate_width_consistent(
            doc in 0u32..10,
            start in 0u32..100,
            end in 1u32..100,
        ) {
            // Force end >= start so the candidate is well-formed.
            let actual_end = start.max(end);
            let candidate = SpanCandidate::new(doc, start, actual_end);
            prop_assert_eq!(candidate.width(), actual_end.saturating_sub(start));
        }

        /// Packing sequences into a `RaggedBatch` keeps every document's tokens retrievable.
        #[test]
        fn ragged_batch_preserves_tokens(
            seq_lens in proptest::collection::vec(1usize..10, 1..5),
        ) {
            // Give every token a distinct, sequential id so mix-ups between
            // documents are detectable.
            let mut next_id = 0u32;
            let seqs: Vec<Vec<u32>> = seq_lens
                .iter()
                .map(|&len| {
                    let first = next_id;
                    next_id += len as u32;
                    (first..next_id).collect()
                })
                .collect();

            let batch = RaggedBatch::from_sequences(&seqs);

            // Aggregate properties.
            prop_assert_eq!(batch.batch_size(), seqs.len());
            prop_assert_eq!(batch.total_tokens(), seq_lens.iter().sum::<usize>());

            // Per-document retrieval.
            for (i, seq) in seqs.iter().enumerate() {
                let doc_tokens = batch.doc_tokens(i).unwrap();
                prop_assert_eq!(doc_tokens, seq.as_slice());
            }
        }

        /// A text `Span` reports the offsets and length it was built from.
        #[test]
        fn span_text_offsets_consistent(start in 0usize..100, len in 0usize..50) {
            let end = start + len;
            let span = Span::text(start, end);
            let (s, e) = span.text_offsets().unwrap();
            prop_assert_eq!(s, start);
            prop_assert_eq!(e, end);
            prop_assert_eq!(span.len(), len);
        }

        // =================================================================
        // Property tests for core type invariants
        // =================================================================

        /// Entity with start < end inside the source text never yields
        /// `InvalidSpan` or `SpanOutOfBounds` from `validate()`.
        #[test]
        fn entity_span_validity(
            start in 0usize..10000,
            len in 1usize..500,
            conf in 0.0f64..=1.0,
        ) {
            let end = start + len;
            // Source text of all 'x', exactly long enough to cover the span;
            // the entity text is therefore `len` copies of 'x'.
            let text_content = "x".repeat(end);
            let entity_text = "x".repeat(len);
            let entity = Entity::new(&entity_text, EntityType::Person, start, end, conf);

            // Other issue kinds (e.g. text mismatch) are not this property's concern.
            for issue in &entity.validate(&text_content) {
                prop_assert!(
                    !matches!(issue, ValidationIssue::InvalidSpan { .. }),
                    "start < end should never produce InvalidSpan"
                );
                prop_assert!(
                    !matches!(issue, ValidationIssue::SpanOutOfBounds { .. }),
                    "span within text should never produce SpanOutOfBounds"
                );
            }
        }

        /// EntityType::from_label(et.as_label()) == et for all standard (non-Custom, non-Other) types.
        #[test]
        fn entity_type_label_roundtrip_standard(
            idx in 0usize..13,
        ) {
            // NOTE: the strategy's upper bound must match the length of this array.
            let standard_types = [
                EntityType::Person,
                EntityType::Organization,
                EntityType::Location,
                EntityType::Date,
                EntityType::Time,
                EntityType::Money,
                EntityType::Percent,
                EntityType::Quantity,
                EntityType::Cardinal,
                EntityType::Ordinal,
                EntityType::Email,
                EntityType::Url,
                EntityType::Phone,
            ];
            let et = &standard_types[idx];
            let label = et.as_label();
            let roundtripped = EntityType::from_label(label);
            prop_assert_eq!(&roundtripped, et,
                "from_label(as_label()) must roundtrip for {:?} (label={:?})", et, label);
        }

        /// Span containment: a non-empty span B lying inside span A must overlap A.
        ///
        /// B is constructed so that it always fits inside A; no generated case
        /// is skipped.
        #[test]
        fn span_containment_property(
            a_start in 0usize..5000,
            a_len in 1usize..5000,
            b_offset in 0usize..5000,
            b_len in 1usize..5000,
        ) {
            let a_end = a_start + a_len;
            let b_start = a_start + (b_offset % a_len); // B starts within A

            // Fold the requested length into the room left in A. `room >= 1`
            // because `b_start < a_end`, and `b_len >= 1` so the subtraction
            // cannot underflow.
            let room = a_end - b_start;
            let b_end = b_start + 1 + (b_len - 1) % room;

            // Sanity-check the construction itself.
            prop_assert!(a_start <= b_start && b_start < b_end && b_end <= a_end);

            let ea = Entity::new("a", EntityType::Person, a_start, a_end, 1.0);
            let eb = Entity::new("b", EntityType::Person, b_start, b_end, 1.0);
            prop_assert!(ea.overlaps(&eb),
                "containing span must overlap contained span");
        }

        /// Serde roundtrip preserves the core fields of Entity (text, type, span,
        /// confidence, normalized form and KB id).
        #[test]
        fn entity_serde_roundtrip(
            start in 0usize..10000,
            len in 1usize..500,
            conf in 0.0f64..=1.0,
            type_idx in 0usize..5,
        ) {
            let end = start + len;
            // NOTE: the `type_idx` bound must match the length of this array.
            let types = [
                EntityType::Person,
                EntityType::Organization,
                EntityType::Location,
                EntityType::Date,
                EntityType::Email,
            ];
            let et = types[type_idx].clone();
            let text = format!("entity_{}", start);
            let e = Entity::new(&text, et, start, end, conf);

            let json = serde_json::to_string(&e).unwrap();
            let e2: Entity = serde_json::from_str(&json).unwrap();

            prop_assert_eq!(&e.text, &e2.text);
            prop_assert_eq!(&e.entity_type, &e2.entity_type);
            prop_assert_eq!(e.start, e2.start);
            prop_assert_eq!(e.end, e2.end);
            // f64 roundtrip through JSON: compare with tolerance
            prop_assert!((e.confidence - e2.confidence).abs() < 1e-10,
                "confidence roundtrip: {} vs {}", e.confidence, e2.confidence);
            prop_assert_eq!(&e.normalized, &e2.normalized);
            prop_assert_eq!(&e.kb_id, &e2.kb_id);
        }

        /// DiscontinuousSpan: total_len() == sum of individual segment lengths.
        #[test]
        fn discontinuous_span_total_length(
            segments in proptest::collection::vec(
                (0usize..5000, 1usize..500),
                1..6
            ),
        ) {
            // Each segment is (start, len); its range length is exactly `len`.
            let expected_sum: usize = segments.iter().map(|&(_, len)| len).sum();
            let ranges: Vec<std::ops::Range<usize>> = segments
                .iter()
                .map(|&(start, len)| start..start + len)
                .collect();
            let span = DiscontinuousSpan::new(ranges);
            prop_assert_eq!(span.total_len(), expected_sum,
                "total_len must equal sum of segment lengths");
        }
    }

    // ========================================================================
    // EntityViewport Tests
    // ========================================================================

    /// Every viewport maps to its lowercase name; `Custom` returns its payload.
    #[test]
    fn test_entity_viewport_as_str() {
        let expected = [
            (EntityViewport::Business, "business"),
            (EntityViewport::Legal, "legal"),
            (EntityViewport::Technical, "technical"),
            (EntityViewport::Academic, "academic"),
            (EntityViewport::Personal, "personal"),
            (EntityViewport::Political, "political"),
            (EntityViewport::Media, "media"),
            (EntityViewport::Historical, "historical"),
            (EntityViewport::General, "general"),
            (EntityViewport::Custom("custom".to_string()), "custom"),
        ];
        for (viewport, name) in expected {
            assert_eq!(viewport.as_str(), name);
        }
    }

    /// Business, Legal, Technical, Academic and Political count as professional;
    /// every other viewport (including `Custom`) does not.
    #[test]
    fn test_entity_viewport_is_professional() {
        let professional = [
            EntityViewport::Business,
            EntityViewport::Legal,
            EntityViewport::Technical,
            EntityViewport::Academic,
            EntityViewport::Political,
        ];
        for viewport in professional {
            // Capture the name first so the failure message identifies the case.
            let label = viewport.to_string();
            assert!(viewport.is_professional(), "{} should be professional", label);
        }

        let non_professional = [
            EntityViewport::Personal,
            EntityViewport::Media,
            EntityViewport::Historical,
            EntityViewport::General,
            EntityViewport::Custom("test".to_string()),
        ];
        for viewport in non_professional {
            let label = viewport.to_string();
            assert!(
                !viewport.is_professional(),
                "{} should not be professional",
                label
            );
        }
    }

    /// Canonical names and their aliases parse to the matching viewport; the
    /// empty string parses to `General` and unknown names become `Custom`.
    #[test]
    fn test_entity_viewport_from_str() {
        let cases = [
            ("business", EntityViewport::Business),
            ("financial", EntityViewport::Business),
            ("corporate", EntityViewport::Business),
            ("legal", EntityViewport::Legal),
            ("law", EntityViewport::Legal),
            ("technical", EntityViewport::Technical),
            ("engineering", EntityViewport::Technical),
            ("academic", EntityViewport::Academic),
            ("research", EntityViewport::Academic),
            ("personal", EntityViewport::Personal),
            ("biographical", EntityViewport::Personal),
            ("political", EntityViewport::Political),
            ("policy", EntityViewport::Political),
            ("media", EntityViewport::Media),
            ("press", EntityViewport::Media),
            ("historical", EntityViewport::Historical),
            ("history", EntityViewport::Historical),
            ("general", EntityViewport::General),
            ("generic", EntityViewport::General),
            ("", EntityViewport::General),
            // Custom viewport
            (
                "custom_viewport",
                EntityViewport::Custom("custom_viewport".to_string()),
            ),
        ];
        for (input, expected) in cases {
            assert_eq!(
                input.parse::<EntityViewport>().unwrap(),
                expected,
                "parsing {:?}",
                input
            );
        }
    }

    /// Parsing ignores letter case.
    #[test]
    fn test_entity_viewport_from_str_case_insensitive() {
        for input in ["BUSINESS", "Business", "BuSiNeSs"] {
            assert_eq!(
                input.parse::<EntityViewport>().unwrap(),
                EntityViewport::Business,
                "parsing {:?}",
                input
            );
        }
    }

    /// `Display` prints the lowercase name, or the payload for `Custom`.
    #[test]
    fn test_entity_viewport_display() {
        assert_eq!(EntityViewport::Business.to_string(), "business");
        assert_eq!(EntityViewport::Academic.to_string(), "academic");
        assert_eq!(EntityViewport::Custom("test".to_string()).to_string(), "test");
    }

    /// Viewport accessors on `Entity`: unset entities default to `General` and
    /// match any viewport; once set, only the assigned viewport matches.
    #[test]
    fn test_entity_viewport_methods() {
        let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);

        // Before a viewport is assigned.
        assert!(!entity.has_viewport());
        assert_eq!(entity.viewport_or_default(), EntityViewport::General);
        assert!(entity.matches_viewport(&EntityViewport::Academic)); // No viewport matches any

        // After assigning one.
        entity.set_viewport(EntityViewport::Academic);
        assert!(entity.has_viewport());
        assert_eq!(entity.viewport_or_default(), EntityViewport::Academic);
        assert!(entity.matches_viewport(&EntityViewport::Academic));
        assert!(!entity.matches_viewport(&EntityViewport::Business));
    }

    /// The builder's `viewport` setter is reflected on the built entity.
    #[test]
    fn test_entity_builder_with_viewport() {
        let built = Entity::builder("Marie Curie", EntityType::Person)
            .span(0, 11)
            .viewport(EntityViewport::Academic)
            .build();

        assert_eq!(built.viewport, Some(EntityViewport::Academic));
        assert!(built.has_viewport());
    }

    // ========================================================================
    // EntityCategory Tests
    // ========================================================================

    /// Agent, Organization, Place, Creative and Relation require ML; the rest do not.
    #[test]
    fn test_entity_category_requires_ml() {
        let needs_ml = [
            EntityCategory::Agent,
            EntityCategory::Organization,
            EntityCategory::Place,
            EntityCategory::Creative,
            EntityCategory::Relation,
        ];
        for category in needs_ml {
            let label = category.to_string();
            assert!(category.requires_ml(), "{} should require ML", label);
        }

        let no_ml = [
            EntityCategory::Temporal,
            EntityCategory::Numeric,
            EntityCategory::Contact,
            EntityCategory::Misc,
        ];
        for category in no_ml {
            let label = category.to_string();
            assert!(!category.requires_ml(), "{} should not require ML", label);
        }
    }

    /// Only Temporal, Numeric and Contact are pattern-detectable.
    #[test]
    fn test_entity_category_pattern_detectable() {
        let detectable = [
            EntityCategory::Temporal,
            EntityCategory::Numeric,
            EntityCategory::Contact,
        ];
        for category in detectable {
            let label = category.to_string();
            assert!(
                category.pattern_detectable(),
                "{} should be pattern-detectable",
                label
            );
        }

        let not_detectable = [
            EntityCategory::Agent,
            EntityCategory::Organization,
            EntityCategory::Place,
            EntityCategory::Creative,
            EntityCategory::Relation,
            EntityCategory::Misc,
        ];
        for category in not_detectable {
            let label = category.to_string();
            assert!(
                !category.pattern_detectable(),
                "{} should not be pattern-detectable",
                label
            );
        }
    }

    /// `is_relation` is true for `Relation` only.
    #[test]
    fn test_entity_category_is_relation() {
        assert!(EntityCategory::Relation.is_relation());

        let non_relations = [
            EntityCategory::Agent,
            EntityCategory::Organization,
            EntityCategory::Place,
            EntityCategory::Temporal,
            EntityCategory::Numeric,
            EntityCategory::Contact,
            EntityCategory::Creative,
            EntityCategory::Misc,
        ];
        for category in non_relations {
            let label = category.to_string();
            assert!(!category.is_relation(), "{} should not be a relation", label);
        }
    }

    /// Every category maps to its lowercase name.
    #[test]
    fn test_entity_category_as_str() {
        let expected = [
            (EntityCategory::Agent, "agent"),
            (EntityCategory::Organization, "organization"),
            (EntityCategory::Place, "place"),
            (EntityCategory::Creative, "creative"),
            (EntityCategory::Temporal, "temporal"),
            (EntityCategory::Numeric, "numeric"),
            (EntityCategory::Contact, "contact"),
            (EntityCategory::Relation, "relation"),
            (EntityCategory::Misc, "misc"),
        ];
        for (category, name) in expected {
            assert_eq!(category.as_str(), name);
        }
    }

    /// `Display` prints the lowercase category name.
    #[test]
    fn test_entity_category_display() {
        assert_eq!(EntityCategory::Agent.to_string(), "agent");
        assert_eq!(EntityCategory::Temporal.to_string(), "temporal");
        assert_eq!(EntityCategory::Relation.to_string(), "relation");
    }
}