Skip to main content

anno_core/core/
entity.rs

1//! Entity types and structures for NER.
2//!
3//! # Design Philosophy (Research-Aligned)
4//!
5//! This module implements entity types informed by modern NER research:
6//!
7//! - **GLiNER/Bi-Encoder**: Entity types are *labels to match against*, not fixed classes.
8//!   Relations ("CEO of") are entities too - they're just labels in the same latent space.
9//!
10//! - **TPLinker/Joint Extraction**: Entities and relations can be extracted in a single pass.
11//!   The type system supports relation triggers as first-class mentions.
12//!
13//! - **Knowledge Graphs**: Entities can link to external knowledge bases (`kb_id`) for
14//!   coreference resolution and GraphRAG applications.
15//!
16//! # Type Hierarchy
17//!
18//! ```text
19//! Mention
20//! ├── Entity (single span)
21//! │   ├── Named (ML): Person, Organization, Location
22//! │   ├── Temporal (Pattern): Date, Time
23//! │   ├── Numeric (Pattern): Money, Percent, Quantity, Cardinal, Ordinal
24//! │   └── Contact (Pattern): Email, Url, Phone
25//! │
26//! └── Relation (connects entities)
27//!     └── Trigger text: "CEO of", "located in", "born on"
28//! ```
29//!
30//! # Design Principles
31//!
32//! 1. **Bi-encoder compatible**: Types are semantic labels, not fixed enums
33//! 2. **Joint extraction**: Relations are mentions with trigger spans
34//! 3. **Knowledge linking**: `kb_id` for connecting to external KBs
35//! 4. **Hierarchical confidence**: Coarse (linkage) + fine (type) scores
36//! 5. **Multi-modal ready**: Spans can be text offsets or visual bboxes
37
38use super::confidence::Confidence;
39use super::types::{MentionType, PhiFeatures};
40use serde::{Deserialize, Serialize};
41use std::borrow::Cow;
42
43// ============================================================================
44// Entity Category (OntoNotes-inspired)
45// ============================================================================
46
/// Category of entity based on detection characteristics and semantics.
///
/// Based on OntoNotes 5.0 categories with extensions for:
/// - Structured data (Contact, patterns)
/// - Knowledge graphs (Relation, for TPLinker/GLiNER joint extraction)
///
/// Every category is either ML-required (see `requires_ml()`),
/// pattern-detectable (see `pattern_detectable()`), or neither (`Misc`).
/// The lowercase name of each category is available through `as_str()`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum EntityCategory {
    /// Named entities for people/groups (ML-required).
    /// Types: Person, NORP (nationalities/religious/political groups)
    Agent,
    /// Named entities for organizations/facilities (ML-required).
    /// Types: Organization, Facility
    Organization,
    /// Named entities for places (ML-required).
    /// Types: GPE (geo-political), Location (geographic)
    Place,
    /// Named entities for creative/conceptual (ML-required).
    /// Types: Event, Product, WorkOfArt, Law, Language
    Creative,
    /// Temporal entities (pattern-detectable).
    /// Types: Date, Time
    Temporal,
    /// Numeric entities (pattern-detectable).
    /// Types: Money, Percent, Quantity, Cardinal, Ordinal
    Numeric,
    /// Contact/identifier entities (pattern-detectable).
    /// Types: Email, Url, Phone
    Contact,
    /// Relation triggers for knowledge graph construction (ML-required).
    /// Examples: "CEO of", "located in", "founded by"
    /// In GLiNER bi-encoder, relations are just another label to match.
    Relation,
    /// Miscellaneous/unknown category.
    ///
    /// Neither `requires_ml()` nor `pattern_detectable()` is true for it.
    /// `EntityType::from_label` assigns this category to labels it does not
    /// recognize.
    Misc,
}
83
84impl EntityCategory {
85    /// Returns true if this category requires ML for detection.
86    #[must_use]
87    pub const fn requires_ml(&self) -> bool {
88        matches!(
89            self,
90            EntityCategory::Agent
91                | EntityCategory::Organization
92                | EntityCategory::Place
93                | EntityCategory::Creative
94                | EntityCategory::Relation
95        )
96    }
97
98    /// Returns true if this category can be detected via patterns.
99    #[must_use]
100    pub const fn pattern_detectable(&self) -> bool {
101        matches!(
102            self,
103            EntityCategory::Temporal | EntityCategory::Numeric | EntityCategory::Contact
104        )
105    }
106
107    /// Returns true if this is a relation (for knowledge graph construction).
108    #[must_use]
109    pub const fn is_relation(&self) -> bool {
110        matches!(self, EntityCategory::Relation)
111    }
112
113    /// Returns OntoNotes-compatible category name.
114    #[must_use]
115    pub const fn as_str(&self) -> &'static str {
116        match self {
117            EntityCategory::Agent => "agent",
118            EntityCategory::Organization => "organization",
119            EntityCategory::Place => "place",
120            EntityCategory::Creative => "creative",
121            EntityCategory::Temporal => "temporal",
122            EntityCategory::Numeric => "numeric",
123            EntityCategory::Contact => "contact",
124            EntityCategory::Relation => "relation",
125            EntityCategory::Misc => "misc",
126        }
127    }
128}
129
130impl std::fmt::Display for EntityCategory {
131    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
132        write!(f, "{}", self.as_str())
133    }
134}
135
136// ============================================================================
137// Entity Viewport (Research: Entity Manifolds)
138// ============================================================================
139
/// Viewport context for multi-faceted entity representation.
///
/// # Research Background
///
/// The concept of "Entity Viewports" comes from the observation that
/// real-world entities are not monolithic - they present different
/// facets depending on context:
///
/// - "Marie Curie" in an **Academic** context: physicist, Nobel laureate
/// - "Marie Curie" in a **Technical** context: radioactivity researcher, X-ray pioneer
/// - "Marie Curie" in a **Personal** context: mother, immigrant, educator
/// - "Marie Curie" in a **Medical** context: founder of mobile X-ray units
///
/// Rather than collapsing all information into a single vector,
/// the viewport model preserves these distinctions and enables
/// "projection" at query time.
///
/// # Usage in RAG Systems
///
/// When answering "What were Curie's scientific contributions?", retrieve
/// facts from the `Academic` viewport. When answering "What was Curie's
/// personal life like?", retrieve from `Personal`.
///
/// # Defaults and custom viewports
///
/// `Default::default()` yields [`EntityViewport::General`]. Contexts without
/// a dedicated variant (such as the "Medical" facet above) are represented
/// with [`EntityViewport::Custom`].
///
/// # Example
///
/// ```rust
/// use anno_core::{Entity, EntityType, EntityViewport};
///
/// let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
/// entity.viewport = Some(EntityViewport::Academic);
/// assert!(entity.viewport.as_ref().unwrap().is_professional());
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
#[non_exhaustive]
pub enum EntityViewport {
    /// Business/financial context (CEO, revenue, market cap)
    Business,
    /// Legal context (lawsuits, settlements, compliance)
    Legal,
    /// Technical/engineering context (patents, inventions, code)
    Technical,
    /// Academic/research context (publications, citations, grants)
    Academic,
    /// Personal/biographical context (family, hobbies, background)
    Personal,
    /// Political context (lobbying, donations, policy positions)
    Political,
    /// Media/public relations context (interviews, statements, PR)
    Media,
    /// Historical context (past roles, timeline events)
    Historical,
    /// Generic/unspecified context (the `Default` value)
    #[default]
    General,
    /// Custom viewport with a descriptive label.
    ///
    /// `as_str()` and `Display` return the label unchanged.
    Custom(String),
}
197
198impl EntityViewport {
199    /// Human-readable label for the viewport.
200    #[must_use]
201    pub fn as_str(&self) -> &str {
202        match self {
203            EntityViewport::Business => "business",
204            EntityViewport::Legal => "legal",
205            EntityViewport::Technical => "technical",
206            EntityViewport::Academic => "academic",
207            EntityViewport::Personal => "personal",
208            EntityViewport::Political => "political",
209            EntityViewport::Media => "media",
210            EntityViewport::Historical => "historical",
211            EntityViewport::General => "general",
212            EntityViewport::Custom(s) => s,
213        }
214    }
215
216    /// Is this a professional/work-related viewport?
217    #[must_use]
218    pub const fn is_professional(&self) -> bool {
219        matches!(
220            self,
221            EntityViewport::Business
222                | EntityViewport::Legal
223                | EntityViewport::Technical
224                | EntityViewport::Academic
225                | EntityViewport::Political
226        )
227    }
228}
229
230impl std::str::FromStr for EntityViewport {
231    type Err = std::convert::Infallible;
232
233    fn from_str(s: &str) -> Result<Self, Self::Err> {
234        Ok(match s.to_lowercase().as_str() {
235            "business" | "financial" | "corporate" => EntityViewport::Business,
236            "legal" | "law" | "compliance" => EntityViewport::Legal,
237            "technical" | "engineering" | "tech" => EntityViewport::Technical,
238            "academic" | "research" | "scholarly" => EntityViewport::Academic,
239            "personal" | "biographical" | "private" => EntityViewport::Personal,
240            "political" | "policy" | "government" => EntityViewport::Political,
241            "media" | "press" | "pr" | "public_relations" => EntityViewport::Media,
242            "historical" | "history" | "past" => EntityViewport::Historical,
243            "general" | "generic" | "" => EntityViewport::General,
244            other => EntityViewport::Custom(other.to_string()),
245        })
246    }
247}
248
249impl std::fmt::Display for EntityViewport {
250    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
251        write!(f, "{}", self.as_str())
252    }
253}
254
255// ============================================================================
256// Entity Type
257// ============================================================================
258
/// Entity type classification.
///
/// Organized into categories:
/// - **Named** (ML-required): Person, Organization, Location
/// - **Temporal** (pattern): Date, Time
/// - **Numeric** (pattern): Money, Percent, Quantity, Cardinal, Ordinal
/// - **Contact** (pattern): Email, Url, Phone
///
/// # Serialization
///
/// `Serialize` and `Deserialize` are implemented by hand in this module
/// rather than derived: a value is written as its `as_label()` string, and
/// both that string form and the older tagged-object form are accepted when
/// reading.
///
/// # Examples
///
/// ```
/// use anno_core::EntityType;
///
/// let ty = EntityType::Email;
/// assert!(ty.category().pattern_detectable());
/// assert!(!ty.category().requires_ml());
///
/// let ty = EntityType::Person;
/// assert!(ty.category().requires_ml());
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum EntityType {
    // === Named Entities (ML-required) ===
    /// Person name (PER) - requires ML/context
    Person,
    /// Organization name (ORG) - requires ML/context
    Organization,
    /// Location/Place (LOC/GPE) - requires ML/context
    Location,

    // === Temporal Entities (Pattern-detectable) ===
    /// Date expression (DATE) - pattern-detectable
    Date,
    /// Time expression (TIME) - pattern-detectable
    Time,

    // === Numeric Entities (Pattern-detectable) ===
    /// Monetary value (MONEY) - pattern-detectable
    Money,
    /// Percentage (PERCENT) - pattern-detectable
    Percent,
    /// Quantity with unit (QUANTITY) - pattern-detectable
    Quantity,
    /// Cardinal number (CARDINAL) - pattern-detectable
    Cardinal,
    /// Ordinal number (ORDINAL) - pattern-detectable
    Ordinal,

    // === Contact Entities (Pattern-detectable) ===
    /// Email address - pattern-detectable
    Email,
    /// URL/URI - pattern-detectable
    Url,
    /// Phone number - pattern-detectable
    Phone,

    // === Extensibility ===
    /// Domain-specific custom type with explicit category.
    ///
    /// Prefer building it with `EntityType::custom(name, category)`.
    Custom {
        /// Type name (e.g., "DISEASE", "PRODUCT", "EVENT"); returned verbatim by `as_label()`
        name: String,
        /// Category for this custom type; returned by `category()`
        category: EntityCategory,
    },

    /// Legacy catch-all for unknown types.
    ///
    /// **Deprecated**: use `EntityType::custom(name, category)` instead.
    /// Retained only for serde backward compatibility with existing data.
    /// Deserialization of `{"Other":"X"}` now routes to `Custom { name: "X", category: Misc }`.
    #[deprecated(note = "use EntityType::custom(name, EntityCategory::Misc) instead")]
    Other(String),
}
333
334impl EntityType {
335    /// Get the category of this entity type.
336    #[must_use]
337    pub fn category(&self) -> EntityCategory {
338        match self {
339            // Agent entities (people/groups)
340            EntityType::Person => EntityCategory::Agent,
341            // Organization entities
342            EntityType::Organization => EntityCategory::Organization,
343            // Place entities (locations)
344            EntityType::Location => EntityCategory::Place,
345            // Temporal entities
346            EntityType::Date | EntityType::Time => EntityCategory::Temporal,
347            // Numeric entities
348            EntityType::Money
349            | EntityType::Percent
350            | EntityType::Quantity
351            | EntityType::Cardinal
352            | EntityType::Ordinal => EntityCategory::Numeric,
353            // Contact entities
354            EntityType::Email | EntityType::Url | EntityType::Phone => EntityCategory::Contact,
355            // Custom with explicit category
356            EntityType::Custom { category, .. } => *category,
357            // Legacy Other -- kept for exhaustiveness (variant is #[deprecated])
358            #[allow(deprecated)]
359            EntityType::Other(_) => EntityCategory::Misc,
360        }
361    }
362
363    /// Returns true if this entity type requires ML for detection.
364    #[must_use]
365    pub fn requires_ml(&self) -> bool {
366        self.category().requires_ml()
367    }
368
369    /// Returns true if this entity type can be detected via patterns.
370    #[must_use]
371    pub fn pattern_detectable(&self) -> bool {
372        self.category().pattern_detectable()
373    }
374
375    /// Convert to standard label string (CoNLL/OntoNotes format).
376    ///
377    /// ```
378    /// use anno_core::EntityType;
379    ///
380    /// assert_eq!(EntityType::Person.as_label(), "PER");
381    /// assert_eq!(EntityType::Location.as_label(), "LOC");
382    /// ```
383    #[must_use]
384    pub fn as_label(&self) -> &str {
385        match self {
386            EntityType::Person => "PER",
387            EntityType::Organization => "ORG",
388            EntityType::Location => "LOC",
389            EntityType::Date => "DATE",
390            EntityType::Time => "TIME",
391            EntityType::Money => "MONEY",
392            EntityType::Percent => "PERCENT",
393            EntityType::Quantity => "QUANTITY",
394            EntityType::Cardinal => "CARDINAL",
395            EntityType::Ordinal => "ORDINAL",
396            EntityType::Email => "EMAIL",
397            EntityType::Url => "URL",
398            EntityType::Phone => "PHONE",
399            EntityType::Custom { name, .. } => name.as_str(),
400            #[allow(deprecated)]
401            EntityType::Other(s) => s.as_str(),
402        }
403    }
404
405    /// Parse from standard label string.
406    ///
407    /// Handles various formats: CoNLL (PER), OntoNotes (PERSON), BIO (B-PER).
408    ///
409    /// ```
410    /// use anno_core::EntityType;
411    ///
412    /// assert_eq!(EntityType::from_label("PER"), EntityType::Person);
413    /// assert_eq!(EntityType::from_label("B-ORG"), EntityType::Organization);
414    /// assert_eq!(EntityType::from_label("PERSON"), EntityType::Person);
415    /// ```
416    #[must_use]
417    pub fn from_label(label: &str) -> Self {
418        // Strip BIO prefix if present
419        let label = label
420            .strip_prefix("B-")
421            .or_else(|| label.strip_prefix("I-"))
422            .or_else(|| label.strip_prefix("E-"))
423            .or_else(|| label.strip_prefix("S-"))
424            .unwrap_or(label);
425
426        match label.to_uppercase().as_str() {
427            // Named entities (multiple variations)
428            "PER" | "PERSON" => EntityType::Person,
429            "ORG" | "ORGANIZATION" | "COMPANY" | "CORPORATION" => EntityType::Organization,
430            "LOC" | "LOCATION" | "GPE" | "GEO-LOC" => EntityType::Location,
431            // WNUT / FewNERD specific types (common in social media / Wikipedia)
432            "FACILITY" | "FAC" | "BUILDING" => {
433                EntityType::custom("BUILDING", EntityCategory::Place)
434            }
435            "PRODUCT" | "PROD" => EntityType::custom("PRODUCT", EntityCategory::Misc),
436            "EVENT" => EntityType::custom("EVENT", EntityCategory::Creative),
437            "CREATIVE-WORK" | "WORK_OF_ART" | "ART" => {
438                EntityType::custom("CREATIVE_WORK", EntityCategory::Creative)
439            }
440            "GROUP" | "NORP" => EntityType::custom("GROUP", EntityCategory::Agent),
441            // Temporal
442            "DATE" => EntityType::Date,
443            "TIME" => EntityType::Time,
444            // Numeric
445            "MONEY" | "CURRENCY" => EntityType::Money,
446            "PERCENT" | "PERCENTAGE" => EntityType::Percent,
447            "QUANTITY" => EntityType::Quantity,
448            "CARDINAL" => EntityType::Cardinal,
449            "ORDINAL" => EntityType::Ordinal,
450            // Contact
451            "EMAIL" => EntityType::Email,
452            "URL" | "URI" => EntityType::Url,
453            "PHONE" | "TELEPHONE" => EntityType::Phone,
454            // MISC variations
455            "MISC" | "MISCELLANEOUS" | "OTHER" => EntityType::custom("MISC", EntityCategory::Misc),
456            // Biomedical types
457            "DISEASE" | "DISORDER" => EntityType::custom("DISEASE", EntityCategory::Misc),
458            "CHEMICAL" | "DRUG" => EntityType::custom("CHEMICAL", EntityCategory::Misc),
459            "GENE" => EntityType::custom("GENE", EntityCategory::Misc),
460            "PROTEIN" => EntityType::custom("PROTEIN", EntityCategory::Misc),
461            // Unknown -> Custom with Misc category
462            other => EntityType::custom(other, EntityCategory::Misc),
463        }
464    }
465
466    /// Create a custom domain-specific entity type.
467    ///
468    /// # Examples
469    ///
470    /// ```
471    /// use anno_core::{EntityType, EntityCategory};
472    ///
473    /// let disease = EntityType::custom("DISEASE", EntityCategory::Agent);
474    /// assert!(disease.requires_ml());
475    ///
476    /// let product_id = EntityType::custom("PRODUCT_ID", EntityCategory::Misc);
477    /// assert!(!product_id.requires_ml());
478    /// ```
479    #[must_use]
480    pub fn custom(name: impl Into<String>, category: EntityCategory) -> Self {
481        EntityType::Custom {
482            name: name.into(),
483            category,
484        }
485    }
486}
487
488impl std::fmt::Display for EntityType {
489    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
490        write!(f, "{}", self.as_label())
491    }
492}
493
494impl std::str::FromStr for EntityType {
495    type Err = std::convert::Infallible;
496
497    /// Parse from standard label string. Never fails -- unknown labels become `Custom`.
498    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
499        Ok(Self::from_label(s))
500    }
501}
502
503// Flatten EntityType to its label string for JSON serialization.
504// `Custom { name: "MISC", .. }` -> `"MISC"`, `Person` -> `"PER"`, etc.
505// Deserialization accepts both the flat string (new format) and the
506// tagged-enum object (backward compat with existing serialized data).
507impl Serialize for EntityType {
508    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
509        serializer.serialize_str(self.as_label())
510    }
511}
512
513impl<'de> Deserialize<'de> for EntityType {
514    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
515        struct EntityTypeVisitor;
516
517        impl<'de> serde::de::Visitor<'de> for EntityTypeVisitor {
518            type Value = EntityType;
519
520            fn expecting(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
521                f.write_str("a string label or a tagged enum object")
522            }
523
524            // New flat format: "PER", "ORG", "MISC", etc.
525            fn visit_str<E: serde::de::Error>(self, v: &str) -> Result<EntityType, E> {
526                Ok(EntityType::from_label(v))
527            }
528
529            // Backward-compat: {"Custom":{"name":"MISC","category":"Misc"}}
530            // or {"Other":"foo"} or "Person" (unit variant as map key)
531            fn visit_map<A: serde::de::MapAccess<'de>>(
532                self,
533                mut map: A,
534            ) -> Result<EntityType, A::Error> {
535                let key: String = map
536                    .next_key()?
537                    .ok_or_else(|| serde::de::Error::custom("empty object"))?;
538                match key.as_str() {
539                    "Custom" => {
540                        #[derive(Deserialize)]
541                        struct CustomFields {
542                            name: String,
543                            category: EntityCategory,
544                        }
545                        let fields: CustomFields = map.next_value()?;
546                        Ok(EntityType::Custom {
547                            name: fields.name,
548                            category: fields.category,
549                        })
550                    }
551                    "Other" => {
552                        // Route legacy Other to Custom with Misc category
553                        let val: String = map.next_value()?;
554                        Ok(EntityType::custom(val, EntityCategory::Misc))
555                    }
556                    // Unit variants serialized as {"Person":null} etc.
557                    variant => {
558                        // Consume the value (null or unit)
559                        let _: serde::de::IgnoredAny = map.next_value()?;
560                        Ok(EntityType::from_label(variant))
561                    }
562                }
563            }
564        }
565
566        deserializer.deserialize_any(EntityTypeVisitor)
567    }
568}
569
570// =============================================================================
571// Type Mapping for Domain-Specific Datasets
572// =============================================================================
573
/// Maps domain-specific entity types to standard NER types.
///
/// Source labels are matched case-insensitively: `add` stores the
/// uppercased label and `map` / `contains` uppercase the label they look up.
///
/// # Research Context (Familiarity paper, arXiv:2412.10121)
///
/// Type mapping creates "label overlap" between training and evaluation:
/// - Mapping ACTOR → Person increases overlap
/// - This can inflate zero-shot F1 scores
///
/// Use `LabelShift::from_type_sets()` to quantify how much overlap exists.
/// High overlap (>80%) means the evaluation is NOT truly zero-shot.
///
/// # When to Use TypeMapper
///
/// - Cross-dataset comparison (normalize schemas for fair eval)
/// - Domain adaptation (map new labels to known types)
///
/// # When NOT to Use TypeMapper
///
/// - True zero-shot evaluation (keep labels distinct)
/// - Measuring generalization (overlap hides generalization failures)
///
/// # Example
///
/// ```rust
/// use anno_core::{TypeMapper, EntityType, EntityCategory};
///
/// // MIT Movie dataset mapping
/// let mut mapper = TypeMapper::new();
/// mapper.add("ACTOR", EntityType::Person);
/// mapper.add("DIRECTOR", EntityType::Person);
/// mapper.add("TITLE", EntityType::custom("WORK_OF_ART", EntityCategory::Creative));
///
/// assert_eq!(mapper.map("ACTOR"), Some(&EntityType::Person));
/// assert_eq!(mapper.normalize("DIRECTOR"), EntityType::Person);
/// ```
#[derive(Debug, Clone, Default)]
pub struct TypeMapper {
    // Uppercased source label -> target entity type.
    mappings: std::collections::HashMap<String, EntityType>,
}
613
614impl TypeMapper {
615    /// Create empty mapper.
616    #[must_use]
617    pub fn new() -> Self {
618        Self::default()
619    }
620
621    /// Create mapper for MIT Movie dataset.
622    #[must_use]
623    pub fn mit_movie() -> Self {
624        let mut mapper = Self::new();
625        // Map to standard types where possible
626        mapper.add("ACTOR", EntityType::Person);
627        mapper.add("DIRECTOR", EntityType::Person);
628        mapper.add("CHARACTER", EntityType::Person);
629        mapper.add(
630            "TITLE",
631            EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
632        );
633        mapper.add("GENRE", EntityType::custom("GENRE", EntityCategory::Misc));
634        mapper.add("YEAR", EntityType::Date);
635        mapper.add("RATING", EntityType::custom("RATING", EntityCategory::Misc));
636        mapper.add("PLOT", EntityType::custom("PLOT", EntityCategory::Misc));
637        mapper
638    }
639
640    /// Create mapper for MIT Restaurant dataset.
641    #[must_use]
642    pub fn mit_restaurant() -> Self {
643        let mut mapper = Self::new();
644        mapper.add("RESTAURANT_NAME", EntityType::Organization);
645        mapper.add("LOCATION", EntityType::Location);
646        mapper.add(
647            "CUISINE",
648            EntityType::custom("CUISINE", EntityCategory::Misc),
649        );
650        mapper.add("DISH", EntityType::custom("DISH", EntityCategory::Misc));
651        mapper.add("PRICE", EntityType::Money);
652        mapper.add(
653            "AMENITY",
654            EntityType::custom("AMENITY", EntityCategory::Misc),
655        );
656        mapper.add("HOURS", EntityType::Time);
657        mapper
658    }
659
660    /// Create mapper for biomedical datasets (BC5CDR, NCBI).
661    #[must_use]
662    pub fn biomedical() -> Self {
663        let mut mapper = Self::new();
664        mapper.add(
665            "DISEASE",
666            EntityType::custom("DISEASE", EntityCategory::Agent),
667        );
668        mapper.add(
669            "CHEMICAL",
670            EntityType::custom("CHEMICAL", EntityCategory::Misc),
671        );
672        mapper.add("DRUG", EntityType::custom("DRUG", EntityCategory::Misc));
673        mapper.add("GENE", EntityType::custom("GENE", EntityCategory::Misc));
674        mapper.add(
675            "PROTEIN",
676            EntityType::custom("PROTEIN", EntityCategory::Misc),
677        );
678        // GENIA types
679        mapper.add("DNA", EntityType::custom("DNA", EntityCategory::Misc));
680        mapper.add("RNA", EntityType::custom("RNA", EntityCategory::Misc));
681        mapper.add(
682            "cell_line",
683            EntityType::custom("CELL_LINE", EntityCategory::Misc),
684        );
685        mapper.add(
686            "cell_type",
687            EntityType::custom("CELL_TYPE", EntityCategory::Misc),
688        );
689        mapper
690    }
691
692    /// Create mapper for social media NER datasets (TweetNER7, etc.).
693    #[must_use]
694    pub fn social_media() -> Self {
695        let mut mapper = Self::new();
696        // TweetNER7 types
697        mapper.add("person", EntityType::Person);
698        mapper.add("corporation", EntityType::Organization);
699        mapper.add("location", EntityType::Location);
700        mapper.add("group", EntityType::Organization);
701        mapper.add(
702            "product",
703            EntityType::custom("PRODUCT", EntityCategory::Misc),
704        );
705        mapper.add(
706            "creative_work",
707            EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
708        );
709        mapper.add("event", EntityType::custom("EVENT", EntityCategory::Misc));
710        mapper
711    }
712
713    /// Create mapper for manufacturing domain datasets (FabNER, etc.).
714    #[must_use]
715    pub fn manufacturing() -> Self {
716        let mut mapper = Self::new();
717        // FabNER entity types
718        mapper.add("MATE", EntityType::custom("MATERIAL", EntityCategory::Misc));
719        mapper.add("MANP", EntityType::custom("PROCESS", EntityCategory::Misc));
720        mapper.add("MACEQ", EntityType::custom("MACHINE", EntityCategory::Misc));
721        mapper.add(
722            "APPL",
723            EntityType::custom("APPLICATION", EntityCategory::Misc),
724        );
725        mapper.add("FEAT", EntityType::custom("FEATURE", EntityCategory::Misc));
726        mapper.add(
727            "PARA",
728            EntityType::custom("PARAMETER", EntityCategory::Misc),
729        );
730        mapper.add("PRO", EntityType::custom("PROPERTY", EntityCategory::Misc));
731        mapper.add(
732            "CHAR",
733            EntityType::custom("CHARACTERISTIC", EntityCategory::Misc),
734        );
735        mapper.add(
736            "ENAT",
737            EntityType::custom("ENABLING_TECHNOLOGY", EntityCategory::Misc),
738        );
739        mapper.add(
740            "CONPRI",
741            EntityType::custom("CONCEPT_PRINCIPLE", EntityCategory::Misc),
742        );
743        mapper.add(
744            "BIOP",
745            EntityType::custom("BIO_PROCESS", EntityCategory::Misc),
746        );
747        mapper.add(
748            "MANS",
749            EntityType::custom("MAN_STANDARD", EntityCategory::Misc),
750        );
751        mapper
752    }
753
754    /// Add a mapping from source label to target type.
755    pub fn add(&mut self, source: impl Into<String>, target: EntityType) {
756        self.mappings.insert(source.into().to_uppercase(), target);
757    }
758
759    /// Get mapped type for a label (returns None if not mapped).
760    #[must_use]
761    pub fn map(&self, label: &str) -> Option<&EntityType> {
762        self.mappings.get(&label.to_uppercase())
763    }
764
765    /// Normalize a label to EntityType, using mapping if available.
766    ///
767    /// Falls back to `EntityType::from_label()` if no mapping exists.
768    #[must_use]
769    pub fn normalize(&self, label: &str) -> EntityType {
770        self.map(label)
771            .cloned()
772            .unwrap_or_else(|| EntityType::from_label(label))
773    }
774
775    /// Check if a label is mapped.
776    #[must_use]
777    pub fn contains(&self, label: &str) -> bool {
778        self.mappings.contains_key(&label.to_uppercase())
779    }
780
781    /// Get all source labels.
782    pub fn labels(&self) -> impl Iterator<Item = &String> {
783        self.mappings.keys()
784    }
785}
786
787/// Extraction method used to identify an entity.
788///
789/// # Research Context
790///
791/// Different extraction methods have different strengths:
792///
793/// | Method | Precision | Recall | Generalization | Use Case |
794/// |--------|-----------|--------|----------------|----------|
795/// | Pattern | Very High | Low | N/A (format-based) | Dates, emails, money |
796/// | Neural | High | High | Good | General NER |
797/// | Lexicon | Very High | Low | None | Closed-domain entities |
798/// | SoftLexicon | Medium | High | Good for rare types | Low-resource NER |
799/// | GatedEnsemble | Highest | Highest | Contextual | Short texts, domain shift |
800///
801/// See `docs/` for repo-local notes and entry points.
802#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
803#[non_exhaustive]
804pub enum ExtractionMethod {
805    /// Regex pattern matching (high precision for structured data like dates, money).
806    /// Does not generalize - only detects format-based entities.
807    Pattern,
808
809    /// Neural model inference (BERT, GLiNER, etc.).
810    /// The recommended default for general NER. Generalizes to unseen entities.
811    #[default]
812    Neural,
813
814    /// Exact lexicon/gazetteer lookup (deprecated approach).
815    /// High precision on known entities, zero recall on novel entities.
816    /// Only use for closed domains (stock tickers, medical codes).
817    #[deprecated(since = "0.2.0", note = "Use Neural or GatedEnsemble instead")]
818    Lexicon,
819
820    /// Embedding-based soft lexicon matching.
821    /// Useful for low-resource languages and rare entity types.
822    /// See: Rijhwani et al. (2020) "Soft Gazetteers for Low-Resource NER"
823    SoftLexicon,
824
825    /// Gated ensemble: neural + lexicon with learned weighting.
826    /// Model learns when to trust lexicon vs. context.
827    /// See: Nie et al. (2021) "GEMNET: Effective Gated Gazetteer Representations"
828    GatedEnsemble,
829
830    /// Multiple methods agreed on this entity (high confidence).
831    Consensus,
832
833    /// Heuristic-based extraction (capitalization, word shape, context).
834    /// Used by heuristic backends that don't use neural models.
835    Heuristic,
836
837    /// Unknown or unspecified extraction method.
838    Unknown,
839
840    /// Legacy rule-based extraction (for backward compatibility).
841    #[deprecated(since = "0.2.0", note = "Use Heuristic or Pattern instead")]
842    Rule,
843
844    /// Legacy alias for Neural (for backward compatibility).
845    #[deprecated(since = "0.2.0", note = "Use Neural instead")]
846    ML,
847
848    /// Legacy alias for Consensus (for backward compatibility).
849    #[deprecated(since = "0.2.0", note = "Use Consensus instead")]
850    Ensemble,
851}
852
853impl ExtractionMethod {
854    /// Returns true if this extraction method produces probabilistically calibrated
855    /// confidence scores suitable for calibration analysis (ECE, Brier score, etc.).
856    ///
857    /// # Calibrated Methods
858    ///
859    /// - **Neural**: Softmax outputs are intended to be probabilistic (though may need
860    ///   temperature scaling for true calibration)
861    /// - **GatedEnsemble**: Produces learned probability estimates
862    /// - **SoftLexicon**: Embedding similarity is pseudo-probabilistic
863    ///
864    /// # Uncalibrated Methods
865    ///
866    /// - **Pattern**: Binary (match/no-match); confidence is typically hardcoded
867    /// - **Heuristic**: Arbitrary scores from hand-crafted rules
868    /// - **Lexicon**: Binary exact match
869    /// - **Consensus**: Agreement count, not a probability
870    ///
871    /// # Example
872    ///
873    /// ```rust
874    /// use anno_core::ExtractionMethod;
875    ///
876    /// assert!(ExtractionMethod::Neural.is_calibrated());
877    /// assert!(!ExtractionMethod::Pattern.is_calibrated());
878    /// assert!(!ExtractionMethod::Heuristic.is_calibrated());
879    /// ```
880    #[must_use]
881    pub const fn is_calibrated(&self) -> bool {
882        #[allow(deprecated)]
883        match self {
884            ExtractionMethod::Neural => true,
885            ExtractionMethod::GatedEnsemble => true,
886            ExtractionMethod::SoftLexicon => true,
887            ExtractionMethod::ML => true, // Legacy alias for Neural
888            // Everything else is not calibrated
889            ExtractionMethod::Pattern => false,
890            ExtractionMethod::Lexicon => false,
891            ExtractionMethod::Consensus => false,
892            ExtractionMethod::Heuristic => false,
893            ExtractionMethod::Unknown => false,
894            ExtractionMethod::Rule => false,
895            ExtractionMethod::Ensemble => false,
896        }
897    }
898
899    /// Returns the confidence interpretation for this extraction method.
900    ///
901    /// This helps users understand what the confidence score means:
902    /// - `"probability"`: Score approximates P(correct)
903    /// - `"heuristic_score"`: Score is a non-probabilistic quality measure
904    /// - `"binary"`: Score is 0 or 1 (or a fixed value for matches)
905    #[must_use]
906    pub const fn confidence_interpretation(&self) -> &'static str {
907        #[allow(deprecated)]
908        match self {
909            ExtractionMethod::Neural | ExtractionMethod::ML => "probability",
910            ExtractionMethod::GatedEnsemble | ExtractionMethod::SoftLexicon => "probability",
911            ExtractionMethod::Pattern | ExtractionMethod::Lexicon => "binary",
912            ExtractionMethod::Heuristic | ExtractionMethod::Rule => "heuristic_score",
913            ExtractionMethod::Consensus | ExtractionMethod::Ensemble => "agreement_ratio",
914            ExtractionMethod::Unknown => "unknown",
915        }
916    }
917}
918
919impl std::fmt::Display for ExtractionMethod {
920    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
921        #[allow(deprecated)]
922        match self {
923            ExtractionMethod::Pattern => write!(f, "pattern"),
924            ExtractionMethod::Neural => write!(f, "neural"),
925            ExtractionMethod::Lexicon => write!(f, "lexicon"),
926            ExtractionMethod::SoftLexicon => write!(f, "soft_lexicon"),
927            ExtractionMethod::GatedEnsemble => write!(f, "gated_ensemble"),
928            ExtractionMethod::Consensus => write!(f, "consensus"),
929            ExtractionMethod::Heuristic => write!(f, "heuristic"),
930            ExtractionMethod::Unknown => write!(f, "unknown"),
931            ExtractionMethod::Rule => write!(f, "heuristic"), // Legacy alias
932            ExtractionMethod::ML => write!(f, "neural"),      // Legacy alias
933            ExtractionMethod::Ensemble => write!(f, "consensus"), // Legacy alias
934        }
935    }
936}
937
938// =============================================================================
939// Lexicon Traits
940// =============================================================================
941
942/// Exact-match lexicon/gazetteer for entity lookup.
943///
944/// # Research Context
945///
946/// Gazetteers (lists of known entities) are a classic NER technique. Modern research
947/// suggests they are most valuable when:
948///
949/// 1. **Domain is closed**: Stock tickers, medical codes, known product catalogs
950/// 2. **Text is short**: where context is insufficient
951/// 3. **Used as features**: Input to neural model, not final output (Song et al. 2020)
952///
953/// They're harmful when:
954/// 1. **Domain is open**: Novel entities not in the list get missed
955/// 2. **Used as authority**: Hardcoded lookups inflate test scores but fail in production
956///
957/// # When to Use
958///
959/// ```text
960/// Decision: Should I use a Lexicon?
961///
962/// Is entity type CLOSED (fixed, known list)?
963/// ├─ Yes: Lexicon is appropriate
964/// │       Examples: stock tickers, ICD-10 codes, country names
965/// └─ No:  Use Neural extraction instead
966///         Examples: person names, organization names, products
967/// ```
968///
969/// # Example
970///
971/// ```rust
972/// use anno_core::{Lexicon, EntityType, HashMapLexicon};
973///
974/// // Create a domain-specific lexicon
975/// let mut lexicon = HashMapLexicon::new("stock_tickers");
976/// lexicon.insert("AAPL", EntityType::Organization, 0.99);
977/// lexicon.insert("GOOGL", EntityType::Organization, 0.99);
978///
979/// // Lookup
980/// if let Some((entity_type, confidence)) = lexicon.lookup("AAPL") {
981///     assert_eq!(entity_type, EntityType::Organization);
982///     assert!(confidence > 0.9);
983/// }
984/// ```
985///
986/// # References
987///
988/// - Song et al. (2020). "Improving Neural NER with Gazetteers"
989/// - Nie et al. (2021). "GEMNET: Effective Gated Gazetteer Representations"
990/// - Rijhwani et al. (2020). "Soft Gazetteers for Low-Resource NER"
991pub trait Lexicon: Send + Sync {
992    /// Lookup an exact string, returning entity type and confidence if found.
993    ///
994    /// Returns `None` if the text is not in the lexicon.
995    fn lookup(&self, text: &str) -> Option<(EntityType, Confidence)>;
996
997    /// Check if the lexicon contains this exact string.
998    fn contains(&self, text: &str) -> bool {
999        self.lookup(text).is_some()
1000    }
1001
1002    /// Get the lexicon source identifier (for provenance tracking).
1003    fn source(&self) -> &str;
1004
1005    /// Get approximate number of entries (for debugging/metrics).
1006    fn len(&self) -> usize;
1007
1008    /// Check if lexicon is empty.
1009    fn is_empty(&self) -> bool {
1010        self.len() == 0
1011    }
1012}
1013
1014/// Simple HashMap-based lexicon implementation.
1015///
1016/// Suitable for small to medium lexicons (<100k entries).
1017/// For larger lexicons, consider a trie-based or FST implementation.
1018#[derive(Debug, Clone)]
1019pub struct HashMapLexicon {
1020    entries: std::collections::HashMap<String, (EntityType, Confidence)>,
1021    source: String,
1022}
1023
1024impl HashMapLexicon {
1025    /// Create a new empty lexicon with the given source identifier.
1026    #[must_use]
1027    pub fn new(source: impl Into<String>) -> Self {
1028        Self {
1029            entries: std::collections::HashMap::new(),
1030            source: source.into(),
1031        }
1032    }
1033
1034    /// Insert an entry into the lexicon.
1035    pub fn insert(
1036        &mut self,
1037        text: impl Into<String>,
1038        entity_type: EntityType,
1039        confidence: impl Into<Confidence>,
1040    ) {
1041        self.entries
1042            .insert(text.into(), (entity_type, confidence.into()));
1043    }
1044
1045    /// Create from an iterator of (text, type, confidence) tuples.
1046    pub fn from_iter<I, S, C>(source: impl Into<String>, entries: I) -> Self
1047    where
1048        I: IntoIterator<Item = (S, EntityType, C)>,
1049        S: Into<String>,
1050        C: Into<Confidence>,
1051    {
1052        let mut lexicon = Self::new(source);
1053        for (text, entity_type, conf) in entries {
1054            lexicon.insert(text, entity_type, conf);
1055        }
1056        lexicon
1057    }
1058
1059    /// Get all entries as an iterator (for debugging).
1060    pub fn entries(&self) -> impl Iterator<Item = (&str, &EntityType, Confidence)> {
1061        self.entries.iter().map(|(k, (t, c))| (k.as_str(), t, *c))
1062    }
1063}
1064
1065impl Lexicon for HashMapLexicon {
1066    fn lookup(&self, text: &str) -> Option<(EntityType, Confidence)> {
1067        self.entries.get(text).cloned()
1068    }
1069
1070    fn source(&self) -> &str {
1071        &self.source
1072    }
1073
1074    fn len(&self) -> usize {
1075        self.entries.len()
1076    }
1077}
1078
1079/// Provenance information for an extracted entity.
1080///
1081/// Tracks where an entity came from for debugging, explainability,
1082/// and confidence calibration in hybrid/ensemble systems.
1083#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
1084pub struct Provenance {
1085    /// Name of the backend that produced this entity (e.g., "pattern", "bert-onnx")
1086    pub source: Cow<'static, str>,
1087    /// Extraction method used
1088    pub method: ExtractionMethod,
1089    /// Specific pattern/rule name (for pattern/rule-based extraction)
1090    pub pattern: Option<Cow<'static, str>>,
1091    /// Raw confidence from the source model (before any calibration)
1092    pub raw_confidence: Option<Confidence>,
1093    /// Model version for reproducibility (e.g., "gliner-v2.1", "bert-base-uncased-2024-01")
1094    #[serde(default, skip_serializing_if = "Option::is_none")]
1095    pub model_version: Option<Cow<'static, str>>,
1096    /// Timestamp when extraction occurred (ISO 8601)
1097    #[serde(default, skip_serializing_if = "Option::is_none")]
1098    pub timestamp: Option<String>,
1099}
1100
1101impl Provenance {
1102    /// Create provenance for regex-based extraction.
1103    #[must_use]
1104    pub fn pattern(pattern_name: &'static str) -> Self {
1105        Self {
1106            source: Cow::Borrowed("pattern"),
1107            method: ExtractionMethod::Pattern,
1108            pattern: Some(Cow::Borrowed(pattern_name)),
1109            raw_confidence: Some(Confidence::ONE), // Patterns are deterministic
1110            model_version: None,
1111            timestamp: None,
1112        }
1113    }
1114
1115    /// Create provenance for ML-based extraction.
1116    ///
1117    /// Accepts both static strings and owned strings:
1118    /// ```rust
1119    /// use anno_core::Provenance;
1120    ///
1121    /// // Static string (zero allocation)
1122    /// let p1 = Provenance::ml("gliner", 0.95);
1123    ///
1124    /// // Owned string (dynamic model name)
1125    /// let model_name = "bert-base";
1126    /// let p2 = Provenance::ml(model_name.to_string(), 0.95);
1127    /// ```
1128    #[must_use]
1129    pub fn ml(model_name: impl Into<Cow<'static, str>>, confidence: impl Into<Confidence>) -> Self {
1130        Self {
1131            source: model_name.into(),
1132            method: ExtractionMethod::Neural,
1133            pattern: None,
1134            raw_confidence: Some(confidence.into()),
1135            model_version: None,
1136            timestamp: None,
1137        }
1138    }
1139
1140    /// Deprecated: Use `ml()` instead which now accepts both static and owned strings.
1141    #[deprecated(
1142        since = "0.2.1",
1143        note = "Use ml() instead, it now accepts owned strings"
1144    )]
1145    #[must_use]
1146    pub fn ml_owned(model_name: impl Into<String>, confidence: impl Into<Confidence>) -> Self {
1147        Self::ml(Cow::Owned(model_name.into()), confidence)
1148    }
1149
1150    /// Create provenance for ensemble/hybrid extraction.
1151    #[must_use]
1152    pub fn ensemble(sources: &'static str) -> Self {
1153        Self {
1154            source: Cow::Borrowed(sources),
1155            method: ExtractionMethod::Consensus,
1156            pattern: None,
1157            raw_confidence: None,
1158            model_version: None,
1159            timestamp: None,
1160        }
1161    }
1162
1163    /// Create provenance with model version for reproducibility.
1164    #[must_use]
1165    pub fn with_version(mut self, version: &'static str) -> Self {
1166        self.model_version = Some(Cow::Borrowed(version));
1167        self
1168    }
1169
1170    /// Create provenance with timestamp.
1171    #[must_use]
1172    pub fn with_timestamp(mut self, timestamp: impl Into<String>) -> Self {
1173        self.timestamp = Some(timestamp.into());
1174        self
1175    }
1176}
1177
1178// ============================================================================
1179// Span Types (Multi-Modal Support)
1180// ============================================================================
1181
1182/// A span locator for text and visual modalities.
1183///
1184/// `Span` is a **simplified subset** of [`grounded::Location`] designed for
1185/// the detection layer (`Entity`). It covers the most common cases:
1186///
1187/// - Text offsets (traditional NER)
1188/// - Bounding boxes (visual document understanding)
1189/// - Hybrid (OCR with both text and visual location)
1190///
1191/// # Relationship to `Location`
1192///
1193/// | `Span` variant | `Location` equivalent |
1194/// |----------------|-----------------------|
1195/// | `Text` | `Location::Text` |
1196/// | `BoundingBox` | `Location::BoundingBox` |
1197/// | `Hybrid` | `Location::TextWithBbox` |
1198///
1199/// For modalities not covered by `Span` (temporal, cuboid, genomic, discontinuous),
1200/// use `Location` directly via the canonical `Signal` → `Track` → `Identity` pipeline.
1201///
1202/// # Conversion
1203///
1204/// - `Span → Location`: Always succeeds via `Location::from(&span)`
1205/// - `Location → Span`: Use `location.to_span()`, returns `None` for unsupported variants
1206///
1207/// [`grounded::Location`]: super::grounded::Location
1208#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
1209pub enum Span {
1210    /// Text span with **character offsets** (start, end).
1211    ///
1212    /// Offsets are Unicode scalar value indices (what `text.chars()` counts),
1213    /// consistent with `Entity.start/end` and `grounded::Location::Text`.
1214    Text {
1215        /// Start character offset (inclusive)
1216        start: usize,
1217        /// End character offset (exclusive)
1218        end: usize,
1219    },
1220    /// Visual bounding box (normalized 0.0-1.0 coordinates)
1221    /// For ColPali: image patch locations
1222    BoundingBox {
1223        /// X coordinate (normalized 0.0-1.0)
1224        x: f32,
1225        /// Y coordinate (normalized 0.0-1.0)
1226        y: f32,
1227        /// Width (normalized 0.0-1.0)
1228        width: f32,
1229        /// Height (normalized 0.0-1.0)
1230        height: f32,
1231        /// Optional page number (for multi-page documents)
1232        page: Option<u32>,
1233    },
1234    /// Hybrid: both text and visual location (for OCR-verified extraction)
1235    Hybrid {
1236        /// Start character offset (inclusive)
1237        start: usize,
1238        /// End character offset (exclusive)
1239        end: usize,
1240        /// Bounding box for visual location
1241        bbox: Box<Span>,
1242    },
1243}
1244
1245impl Span {
1246    /// Create a text span.
1247    #[must_use]
1248    pub const fn text(start: usize, end: usize) -> Self {
1249        Self::Text { start, end }
1250    }
1251
1252    /// Create a bounding box span with normalized coordinates.
1253    #[must_use]
1254    pub fn bbox(x: f32, y: f32, width: f32, height: f32) -> Self {
1255        Self::BoundingBox {
1256            x,
1257            y,
1258            width,
1259            height,
1260            page: None,
1261        }
1262    }
1263
1264    /// Create a bounding box with page number.
1265    #[must_use]
1266    pub fn bbox_on_page(x: f32, y: f32, width: f32, height: f32, page: u32) -> Self {
1267        Self::BoundingBox {
1268            x,
1269            y,
1270            width,
1271            height,
1272            page: Some(page),
1273        }
1274    }
1275
1276    /// Check if this is a text span.
1277    #[must_use]
1278    pub const fn is_text(&self) -> bool {
1279        matches!(self, Self::Text { .. } | Self::Hybrid { .. })
1280    }
1281
1282    /// Check if this has visual location.
1283    #[must_use]
1284    pub const fn is_visual(&self) -> bool {
1285        matches!(self, Self::BoundingBox { .. } | Self::Hybrid { .. })
1286    }
1287
1288    /// Get text offsets if available.
1289    #[must_use]
1290    pub const fn text_offsets(&self) -> Option<(usize, usize)> {
1291        match self {
1292            Self::Text { start, end } => Some((*start, *end)),
1293            Self::Hybrid { start, end, .. } => Some((*start, *end)),
1294            Self::BoundingBox { .. } => None,
1295        }
1296    }
1297
1298    /// Calculate span length for text spans.
1299    #[must_use]
1300    pub fn len(&self) -> usize {
1301        match self {
1302            Self::Text { start, end } => end.saturating_sub(*start),
1303            Self::Hybrid { start, end, .. } => end.saturating_sub(*start),
1304            Self::BoundingBox { .. } => 0,
1305        }
1306    }
1307
1308    /// Check if span is empty.
1309    #[must_use]
1310    pub fn is_empty(&self) -> bool {
1311        self.len() == 0
1312    }
1313}
1314
1315// ============================================================================
1316// Discontinuous Spans (W2NER/ACE-style)
1317// ============================================================================
1318
1319/// A discontinuous span representing non-contiguous entity mentions.
1320///
1321/// Some entities span multiple non-adjacent text regions:
1322/// - "severe \[pain\] in the \[abdomen\]" → "severe abdominal pain"
1323/// - "the \[president\] ... \[Obama\]" → coreference
1324///
1325/// This is required for:
1326/// - **Medical NER**: Anatomical modifiers separated from findings
1327/// - **Legal NER**: Parties referenced across clauses
1328/// - **W2NER**: Word-word relation grids that detect discontinuous entities
1329///
1330/// # Offset Unit (CRITICAL)
1331///
1332/// `DiscontinuousSpan` uses **character offsets** (Unicode scalar value indices),
1333/// consistent with [`Entity::start`](super::entity::Entity::start) /
1334/// [`Entity::end`](super::entity::Entity::end) and `anno::core::grounded::Location`.
1335///
1336/// This is intentionally *not* byte offsets. If you have byte offsets (from regex,
1337/// `str::find`, tokenizers, etc.), convert them to character offsets first (see
1338/// `anno::offset::SpanConverter` in the `anno` crate).
1339///
1340/// # Example
1341///
1342/// ```rust
1343/// use anno_core::DiscontinuousSpan;
1344///
1345/// // "severe pain in the abdomen" where "severe" modifies "pain"
1346/// // but they're separated by other words
1347/// let span = DiscontinuousSpan::new(vec![
1348///     0..6,   // "severe"
1349///     12..16, // "pain"
1350/// ]);
1351///
1352/// assert_eq!(span.num_segments(), 2);
1353/// assert!(span.is_discontinuous());
1354/// ```
1355#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1356pub struct DiscontinuousSpan {
1357    /// Non-overlapping segments, sorted by start position.
1358    /// Each `Range<usize>` represents (start_char, end_char).
1359    segments: Vec<std::ops::Range<usize>>,
1360}
1361
1362impl DiscontinuousSpan {
1363    /// Create a new discontinuous span from segments.
1364    ///
1365    /// Segments are sorted and validated (no overlaps).
1366    #[must_use]
1367    pub fn new(mut segments: Vec<std::ops::Range<usize>>) -> Self {
1368        // Sort by start position
1369        segments.sort_by_key(|r| r.start);
1370        Self { segments }
1371    }
1372
1373    /// Create from a single contiguous span.
1374    #[must_use]
1375    #[allow(clippy::single_range_in_vec_init)] // Intentional: contiguous is special case of discontinuous
1376    pub fn contiguous(start: usize, end: usize) -> Self {
1377        Self {
1378            segments: vec![start..end],
1379        }
1380    }
1381
1382    /// Number of segments.
1383    #[must_use]
1384    pub fn num_segments(&self) -> usize {
1385        self.segments.len()
1386    }
1387
1388    /// True if this spans multiple non-adjacent regions.
1389    #[must_use]
1390    pub fn is_discontinuous(&self) -> bool {
1391        self.segments.len() > 1
1392    }
1393
1394    /// True if this is a single contiguous span.
1395    #[must_use]
1396    pub fn is_contiguous(&self) -> bool {
1397        self.segments.len() <= 1
1398    }
1399
1400    /// Get the segments.
1401    #[must_use]
1402    pub fn segments(&self) -> &[std::ops::Range<usize>] {
1403        &self.segments
1404    }
1405
1406    /// Get the overall bounding range (start of first to end of last).
1407    #[must_use]
1408    pub fn bounding_range(&self) -> Option<std::ops::Range<usize>> {
1409        if self.segments.is_empty() {
1410            return None;
1411        }
1412        let start = self.segments.first()?.start;
1413        let end = self.segments.last()?.end;
1414        Some(start..end)
1415    }
1416
1417    /// Total character length (sum of all segments).
1418    ///
1419    #[must_use]
1420    pub fn total_len(&self) -> usize {
1421        self.segments.iter().map(|r| r.end - r.start).sum()
1422    }
1423
1424    /// Extract text from each segment and join with separator.
1425    #[must_use]
1426    pub fn extract_text(&self, text: &str, separator: &str) -> String {
1427        self.segments
1428            .iter()
1429            .map(|r| {
1430                let start = r.start;
1431                let len = r.end.saturating_sub(r.start);
1432                text.chars().skip(start).take(len).collect::<String>()
1433            })
1434            .collect::<Vec<_>>()
1435            .join(separator)
1436    }
1437
1438    /// Check if a character position falls within any segment.
1439    ///
1440    /// # Arguments
1441    ///
1442    /// * `pos` - Character offset to check (Unicode scalar value index)
1443    ///
1444    /// # Returns
1445    ///
1446    /// `true` if the character position falls within any segment of this span.
1447    #[must_use]
1448    pub fn contains(&self, pos: usize) -> bool {
1449        self.segments.iter().any(|r| r.contains(&pos))
1450    }
1451
1452    /// Convert to a regular Span (uses bounding range, loses discontinuity info).
1453    #[must_use]
1454    pub fn to_span(&self) -> Option<Span> {
1455        self.bounding_range().map(|r| Span::Text {
1456            start: r.start,
1457            end: r.end,
1458        })
1459    }
1460}
1461
1462impl From<std::ops::Range<usize>> for DiscontinuousSpan {
1463    fn from(range: std::ops::Range<usize>) -> Self {
1464        Self::contiguous(range.start, range.end)
1465    }
1466}
1467
1468impl Default for Span {
1469    fn default() -> Self {
1470        Self::Text { start: 0, end: 0 }
1471    }
1472}
1473
1474// ============================================================================
1475// Hierarchical Confidence (Coarse-to-Fine)
1476// ============================================================================
1477
1478/// Hierarchical confidence scores for coarse-to-fine extraction.
1479///
1480/// Research (HiNet, InfoHier) shows that extraction benefits from
1481/// decomposed confidence:
1482/// - **Linkage**: "Is there ANY entity here?" (binary, fast filter)
1483/// - **Type**: "What type is it?" (fine-grained classification)
1484/// - **Boundary**: "Where exactly does it start/end?" (span refinement)
1485#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
1486pub struct HierarchicalConfidence {
1487    /// Coarse: probability that this span contains ANY entity (0.0-1.0)
1488    /// Used for early filtering in the TPLinker "handshaking" matrix.
1489    pub linkage: f32,
1490    /// Fine: probability that the type classification is correct (0.0-1.0)
1491    pub type_score: f32,
1492    /// Boundary: confidence in the exact span boundaries (0.0-1.0)
1493    /// Low for entities with fuzzy boundaries (e.g., "the CEO" vs "CEO")
1494    pub boundary: f32,
1495}
1496
1497impl HierarchicalConfidence {
1498    /// Create hierarchical confidence with all scores.
1499    #[must_use]
1500    pub fn new(linkage: f32, type_score: f32, boundary: f32) -> Self {
1501        Self {
1502            linkage: linkage.clamp(0.0, 1.0),
1503            type_score: type_score.clamp(0.0, 1.0),
1504            boundary: boundary.clamp(0.0, 1.0),
1505        }
1506    }
1507
1508    /// Create from a single confidence score (legacy compatibility).
1509    /// Assigns same score to all levels.
1510    #[must_use]
1511    pub fn from_single(confidence: f32) -> Self {
1512        let c = confidence.clamp(0.0, 1.0);
1513        Self {
1514            linkage: c,
1515            type_score: c,
1516            boundary: c,
1517        }
1518    }
1519
1520    /// Calculate combined confidence (geometric mean).
1521    /// Geometric mean penalizes low scores more than arithmetic mean.
1522    #[must_use]
1523    pub fn combined(&self) -> f32 {
1524        (self.linkage * self.type_score * self.boundary).powf(1.0 / 3.0)
1525    }
1526
1527    /// Calculate combined confidence as f64 for legacy compatibility.
1528    #[must_use]
1529    pub fn as_f64(&self) -> f64 {
1530        self.combined() as f64
1531    }
1532
1533    /// Check if passes minimum threshold at all levels.
1534    #[must_use]
1535    pub fn passes_threshold(&self, linkage_min: f32, type_min: f32, boundary_min: f32) -> bool {
1536        self.linkage >= linkage_min && self.type_score >= type_min && self.boundary >= boundary_min
1537    }
1538}
1539
1540impl Default for HierarchicalConfidence {
1541    fn default() -> Self {
1542        Self {
1543            linkage: 1.0,
1544            type_score: 1.0,
1545            boundary: 1.0,
1546        }
1547    }
1548}
1549
1550impl From<f64> for HierarchicalConfidence {
1551    fn from(confidence: f64) -> Self {
1552        Self::from_single(confidence as f32)
1553    }
1554}
1555
1556impl From<f32> for HierarchicalConfidence {
1557    fn from(confidence: f32) -> Self {
1558        Self::from_single(confidence)
1559    }
1560}
1561
1562// ============================================================================
1563// Ragged Batch (ModernBERT Unpadding)
1564// ============================================================================
1565
/// An unpadded ("ragged") batch for efficient ModernBERT inference.
///
/// ModernBERT gets its speed advantage by never materializing padding
/// tokens. Rather than a `[batch, max_seq_len]` matrix, all documents are
/// laid out in one contiguous 1D sequence, and offset indices record where
/// each document begins and ends.
///
/// # Memory Layout
///
/// ```text
/// Traditional (padded):
/// [doc1_tok1, doc1_tok2, PAD, PAD, PAD]  <- wasted compute
/// [doc2_tok1, doc2_tok2, doc2_tok3, PAD, PAD]
///
/// Ragged (unpadded):
/// [doc1_tok1, doc1_tok2, doc2_tok1, doc2_tok2, doc2_tok3]
/// cumulative_offsets: [0, 2, 5]  <- doc1 is [0..2], doc2 is [2..5]
/// ```
#[derive(Debug, Clone)]
pub struct RaggedBatch {
    /// All token IDs, flattened into one contiguous array.
    /// Shape: `[total_tokens]` (1D, no padding)
    pub token_ids: Vec<u32>,
    /// Running total of sequence lengths.
    /// Length: batch_size + 1
    /// Document i spans tokens \[offsets\[i\]..offsets\[i+1\])
    pub cumulative_offsets: Vec<u32>,
    /// Length of the longest sequence in the batch (for kernel bounds).
    pub max_seq_len: usize,
}
1595
1596impl RaggedBatch {
1597    /// Create a new ragged batch from sequences.
1598    pub fn from_sequences(sequences: &[Vec<u32>]) -> Self {
1599        let total_tokens: usize = sequences.iter().map(|s| s.len()).sum();
1600        let mut token_ids = Vec::with_capacity(total_tokens);
1601        let mut cumulative_offsets = Vec::with_capacity(sequences.len() + 1);
1602        let mut max_seq_len = 0;
1603
1604        cumulative_offsets.push(0);
1605        for seq in sequences {
1606            token_ids.extend_from_slice(seq);
1607            // Check for overflow: u32::MAX is 4,294,967,295
1608            // If token_ids.len() exceeds this, we'll truncate (which is a bug)
1609            // but in practice, this is unlikely for reasonable batch sizes
1610            let len = token_ids.len();
1611            if len > u32::MAX as usize {
1612                // This would overflow - use saturating cast to prevent panic
1613                // but log a warning as this indicates a problem
1614                log::warn!(
1615                    "Token count {} exceeds u32::MAX, truncating to {}",
1616                    len,
1617                    u32::MAX
1618                );
1619                cumulative_offsets.push(u32::MAX);
1620            } else {
1621                cumulative_offsets.push(len as u32);
1622            }
1623            max_seq_len = max_seq_len.max(seq.len());
1624        }
1625
1626        Self {
1627            token_ids,
1628            cumulative_offsets,
1629            max_seq_len,
1630        }
1631    }
1632
1633    /// Get the number of documents in this batch.
1634    #[must_use]
1635    pub fn batch_size(&self) -> usize {
1636        self.cumulative_offsets.len().saturating_sub(1)
1637    }
1638
1639    /// Get the total number of tokens (no padding).
1640    #[must_use]
1641    pub fn total_tokens(&self) -> usize {
1642        self.token_ids.len()
1643    }
1644
1645    /// Get token range for a specific document.
1646    #[must_use]
1647    pub fn doc_range(&self, doc_idx: usize) -> Option<std::ops::Range<usize>> {
1648        if doc_idx + 1 < self.cumulative_offsets.len() {
1649            let start = self.cumulative_offsets[doc_idx] as usize;
1650            let end = self.cumulative_offsets[doc_idx + 1] as usize;
1651            Some(start..end)
1652        } else {
1653            None
1654        }
1655    }
1656
1657    /// Get tokens for a specific document.
1658    #[must_use]
1659    pub fn doc_tokens(&self, doc_idx: usize) -> Option<&[u32]> {
1660        self.doc_range(doc_idx).map(|r| &self.token_ids[r])
1661    }
1662
1663    /// Calculate memory saved vs padded batch.
1664    #[must_use]
1665    pub fn padding_savings(&self) -> f64 {
1666        let padded_size = self.batch_size() * self.max_seq_len;
1667        if padded_size == 0 {
1668            return 0.0;
1669        }
1670        1.0 - (self.total_tokens() as f64 / padded_size as f64)
1671    }
1672}
1673
1674// ============================================================================
1675// Span Candidate Generation
1676// ============================================================================
1677
/// A candidate span for entity extraction.
///
/// In GLiNER/bi-encoder systems, we generate all possible spans up to a
/// maximum width and score them against entity type embeddings.
///
/// The span is the half-open token interval `[start, end)` inside the
/// document identified by `doc_idx`. All three fields are `u32`, and the type
/// is `Copy` and hashable.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SpanCandidate {
    /// Document index in the batch
    pub doc_idx: u32,
    /// Start token index (within the document, inclusive)
    pub start: u32,
    /// End token index (within the document, exclusive)
    pub end: u32,
}
1691
impl SpanCandidate {
    /// Create a new span candidate covering tokens `[start, end)` of
    /// document `doc_idx`.
    ///
    /// No validation is performed; `end < start` is accepted as given.
    #[must_use]
    pub const fn new(doc_idx: u32, start: u32, end: u32) -> Self {
        Self {
            doc_idx,
            start,
            end,
        }
    }

    /// Get span width (number of tokens).
    ///
    /// Computed as `end - start`, saturating to `0` when `end < start`.
    #[must_use]
    pub const fn width(&self) -> u32 {
        self.end.saturating_sub(self.start)
    }
}
1709
1710/// Generate all valid span candidates for a ragged batch.
1711///
1712/// This is the "gnarly" operation in GLiNER - efficiently enumerating
1713/// all valid spans without O(N^2) memory allocation.
1714pub fn generate_span_candidates(batch: &RaggedBatch, max_width: usize) -> Vec<SpanCandidate> {
1715    let mut candidates = Vec::new();
1716
1717    for doc_idx in 0..batch.batch_size() {
1718        if let Some(range) = batch.doc_range(doc_idx) {
1719            let doc_len = range.len();
1720            // Generate all spans [i, j) where j - i <= max_width
1721            for start in 0..doc_len {
1722                let max_end = (start + max_width).min(doc_len);
1723                for end in (start + 1)..=max_end {
1724                    candidates.push(SpanCandidate::new(doc_idx as u32, start as u32, end as u32));
1725                }
1726            }
1727        }
1728    }
1729
1730    candidates
1731}
1732
1733/// Generate span candidates with early filtering.
1734///
1735/// Uses a linkage mask to skip low-probability spans (TPLinker optimization).
1736pub fn generate_filtered_candidates(
1737    batch: &RaggedBatch,
1738    max_width: usize,
1739    linkage_mask: &[f32],
1740    threshold: f32,
1741) -> Vec<SpanCandidate> {
1742    let mut candidates = Vec::new();
1743    let mut mask_idx = 0;
1744
1745    for doc_idx in 0..batch.batch_size() {
1746        if let Some(range) = batch.doc_range(doc_idx) {
1747            let doc_len = range.len();
1748            for start in 0..doc_len {
1749                let max_end = (start + max_width).min(doc_len);
1750                for end in (start + 1)..=max_end {
1751                    // Only include if linkage probability exceeds threshold
1752                    if mask_idx < linkage_mask.len() && linkage_mask[mask_idx] >= threshold {
1753                        candidates.push(SpanCandidate::new(
1754                            doc_idx as u32,
1755                            start as u32,
1756                            end as u32,
1757                        ));
1758                    }
1759                    mask_idx += 1;
1760                }
1761            }
1762        }
1763    }
1764
1765    candidates
1766}
1767
1768// ============================================================================
1769// Entity (Extended)
1770// ============================================================================
1771
/// A recognized named entity or relation trigger.
///
/// # Entity Structure
///
/// ```text
/// "Contact John at john@example.com on Jan 15"
///          ^^^^    ^^^^^^^^^^^^^^^^    ^^^^^^
///          PER     EMAIL               DATE
///          |       |                   |
///          Named   Contact             Temporal
///          (ML)    (Pattern)           (Pattern)
/// ```
///
/// # Core Fields (Stable API)
///
/// - `text`, `entity_type`, `start`, `end`, `confidence` — always present
/// - `normalized`, `provenance` — commonly used optional fields
/// - `kb_id`, `canonical_id` — knowledge graph and coreference support
///
/// # Extended Fields (Research/Experimental)
///
/// The following fields support advanced research applications but may evolve:
///
/// | Field | Purpose | Status |
/// |-------|---------|--------|
/// | `visual_span` | Multi-modal (ColPali) extraction | Experimental |
/// | `discontinuous_span` | W2NER non-contiguous entities | Experimental |
/// | `valid_from`, `valid_until` | Temporal knowledge graphs | Research |
/// | `viewport` | Multi-faceted entity representation | Research |
/// | `hierarchical_confidence` | Coarse-to-fine NER | Experimental |
///
/// Every optional field is annotated with
/// `#[serde(default, skip_serializing_if = "Option::is_none")]`: it is omitted
/// from serialized output while `None` and may be absent when deserializing.
///
/// # Knowledge Graph Support
///
/// For GraphRAG and coreference resolution, entities support:
/// - `kb_id`: External knowledge base identifier (e.g., Wikidata Q-ID)
/// - `canonical_id`: Local coreference cluster ID (links "John" and "he")
///
/// # Normalization
///
/// Entities can have a normalized form for downstream processing:
/// - Dates: "Jan 15" → "2024-01-15" (ISO 8601)
/// - Money: "$1.5M" → "1500000 USD"
/// - Locations: "NYC" → "New York City"
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Entity {
    /// Entity text (surface form as it appears in source)
    pub text: String,
    /// Entity type classification
    pub entity_type: EntityType,
    /// Start position (character offset, NOT byte offset).
    ///
    /// For Unicode text, character offsets differ from byte offsets.
    /// Use `anno::offset::bytes_to_chars` to convert if needed.
    pub start: usize,
    /// End position (character offset, exclusive).
    ///
    /// For Unicode text, character offsets differ from byte offsets.
    /// Use `anno::offset::bytes_to_chars` to convert if needed.
    pub end: usize,
    /// Confidence score (0.0-1.0, calibrated).
    ///
    /// Construction via [`Confidence::new`] clamps to `[0.0, 1.0]`.
    /// Use `.value()` or `Into<f64>` to extract the raw score.
    pub confidence: Confidence,
    /// Normalized/canonical form (e.g., "Jan 15" → "2024-01-15")
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub normalized: Option<String>,
    /// Provenance: which backend/method produced this entity
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub provenance: Option<Provenance>,
    /// External knowledge base ID (e.g., "Q7186" for Marie Curie in Wikidata).
    /// Used for entity linking and GraphRAG applications.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub kb_id: Option<String>,
    /// Local coreference cluster ID.
    /// Multiple mentions with the same `canonical_id` refer to the same entity.
    /// Example: "Marie Curie" and "she" might share `canonical_id = CanonicalId(42)`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub canonical_id: Option<super::types::CanonicalId>,
    /// Hierarchical confidence (coarse-to-fine).
    /// Provides linkage, type, and boundary scores separately.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub hierarchical_confidence: Option<HierarchicalConfidence>,
    /// Visual span for multi-modal (ColPali) extraction.
    /// When set, provides bounding box location in addition to text offsets.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub visual_span: Option<Span>,
    /// Discontinuous span for non-contiguous entity mentions (W2NER support).
    /// When set, `total_len` sums the segment lengths instead of using
    /// `start`/`end`.
    /// Example: "New York and LA \[airports\]" where "airports" modifies both.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub discontinuous_span: Option<DiscontinuousSpan>,
    // =========================================================================
    // Temporal Validity (Research: Temporal Knowledge Graphs)
    // =========================================================================
    /// Start of temporal validity interval for this entity assertion.
    ///
    /// Entities are facts that may change over time:
    /// - "Satya Nadella is CEO of Microsoft" is valid from [2014, present]
    /// - "Steve Ballmer was CEO of Microsoft" was valid from [2000, 2014]
    ///
    /// When `None`, either:
    /// - the start of validity is unknown (only `valid_until` is set), or
    /// - the entity is atemporal (both bounds `None`, e.g. "Paris is in France")
    ///
    /// # Example
    /// ```rust
    /// use anno_core::{Entity, EntityType};
    /// use chrono::{TimeZone, Utc};
    ///
    /// let mut entity = Entity::new("CEO of Microsoft", EntityType::Person, 0, 16, 0.9);
    /// entity.valid_from = Some(Utc.with_ymd_and_hms(2008, 10, 1, 0, 0, 0).unwrap());
    /// ```
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub valid_from: Option<chrono::DateTime<chrono::Utc>>,
    /// End of temporal validity interval for this entity assertion.
    ///
    /// When `None` and `valid_from` is set, the fact is currently valid.
    /// When both are `None`, the entity is atemporal.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub valid_until: Option<chrono::DateTime<chrono::Utc>>,
    // =========================================================================
    // Viewport / Context (Research: Entity Manifolds)
    // =========================================================================
    /// Viewport context for multi-faceted entity representation.
    ///
    /// The same real-world entity can have different "faces" in different contexts:
    /// - "Marie Curie" in an academic context: professor, researcher
    /// - "Marie Curie" in a scientific context: physicist, chemist
    /// - "Marie Curie" in a personal context: mother, educator
    ///
    /// This enables "holographic" entity projection at query time:
    /// given a query context, project the entity manifold to the relevant viewport.
    ///
    /// # Example
    /// ```rust
    /// use anno_core::{Entity, EntityType, EntityViewport};
    ///
    /// let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
    /// entity.viewport = Some(EntityViewport::Academic);
    /// ```
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub viewport: Option<EntityViewport>,
    /// Phi-features (person, number, gender) for morphological agreement.
    ///
    /// Used for coreference constraints and zero pronoun resolution.
    /// In pro-drop languages (Arabic, Spanish, Japanese), verb morphology
    /// encodes subject features even when the pronoun is dropped.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub phi_features: Option<PhiFeatures>,
    /// Mention type classification (Proper, Nominal, Pronominal, Zero).
    ///
    /// Classifies the referring expression type for coreference resolution.
    /// Follows the Accessibility Hierarchy (Ariel 1990):
    /// Proper > Nominal > Pronominal > Zero.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub mention_type: Option<MentionType>,
}
1933
1934impl Entity {
1935    /// Create a new entity.
1936    ///
1937    /// ```
1938    /// use anno_core::{Entity, EntityType};
1939    ///
1940    /// let e = Entity::new("Berlin", EntityType::Location, 10, 16, 0.95);
1941    /// assert_eq!(e.text, "Berlin");
1942    /// assert_eq!(e.entity_type, EntityType::Location);
1943    /// assert_eq!((e.start, e.end), (10, 16));
1944    /// ```
1945    #[must_use]
1946    pub fn new(
1947        text: impl Into<String>,
1948        entity_type: EntityType,
1949        start: usize,
1950        end: usize,
1951        confidence: impl Into<Confidence>,
1952    ) -> Self {
1953        Self {
1954            text: text.into(),
1955            entity_type,
1956            start,
1957            end,
1958            confidence: confidence.into(),
1959            normalized: None,
1960            provenance: None,
1961            kb_id: None,
1962            canonical_id: None,
1963            hierarchical_confidence: None,
1964            visual_span: None,
1965            discontinuous_span: None,
1966            valid_from: None,
1967            valid_until: None,
1968            viewport: None,
1969            phi_features: None,
1970            mention_type: None,
1971        }
1972    }
1973
1974    /// Create a new entity with provenance information.
1975    #[must_use]
1976    pub fn with_provenance(
1977        text: impl Into<String>,
1978        entity_type: EntityType,
1979        start: usize,
1980        end: usize,
1981        confidence: impl Into<Confidence>,
1982        provenance: Provenance,
1983    ) -> Self {
1984        Self {
1985            text: text.into(),
1986            entity_type,
1987            start,
1988            end,
1989            confidence: confidence.into(),
1990            normalized: None,
1991            provenance: Some(provenance),
1992            kb_id: None,
1993            canonical_id: None,
1994            hierarchical_confidence: None,
1995            visual_span: None,
1996            discontinuous_span: None,
1997            valid_from: None,
1998            valid_until: None,
1999            viewport: None,
2000            phi_features: None,
2001            mention_type: None,
2002        }
2003    }
2004
2005    /// Create an entity with hierarchical confidence scores.
2006    #[must_use]
2007    pub fn with_hierarchical_confidence(
2008        text: impl Into<String>,
2009        entity_type: EntityType,
2010        start: usize,
2011        end: usize,
2012        confidence: HierarchicalConfidence,
2013    ) -> Self {
2014        Self {
2015            text: text.into(),
2016            entity_type,
2017            start,
2018            end,
2019            confidence: Confidence::new(confidence.as_f64()),
2020            normalized: None,
2021            provenance: None,
2022            kb_id: None,
2023            canonical_id: None,
2024            hierarchical_confidence: Some(confidence),
2025            visual_span: None,
2026            discontinuous_span: None,
2027            valid_from: None,
2028            valid_until: None,
2029            viewport: None,
2030            phi_features: None,
2031            mention_type: None,
2032        }
2033    }
2034
2035    /// Create an entity from a visual bounding box (ColPali multi-modal).
2036    #[must_use]
2037    pub fn from_visual(
2038        text: impl Into<String>,
2039        entity_type: EntityType,
2040        bbox: Span,
2041        confidence: impl Into<Confidence>,
2042    ) -> Self {
2043        Self {
2044            text: text.into(),
2045            entity_type,
2046            start: 0,
2047            end: 0,
2048            confidence: confidence.into(),
2049            normalized: None,
2050            provenance: None,
2051            kb_id: None,
2052            canonical_id: None,
2053            hierarchical_confidence: None,
2054            visual_span: Some(bbox),
2055            discontinuous_span: None,
2056            valid_from: None,
2057            valid_until: None,
2058            viewport: None,
2059            phi_features: None,
2060            mention_type: None,
2061        }
2062    }
2063
    /// Create an entity with default confidence (1.0).
    ///
    /// Shorthand for [`Entity::new`] with the literal `1.0` passed as the
    /// confidence argument; all optional fields are `None`.
    #[must_use]
    pub fn with_type(
        text: impl Into<String>,
        entity_type: EntityType,
        start: usize,
        end: usize,
    ) -> Self {
        Self::new(text, entity_type, start, end, 1.0)
    }
2074
2075    /// Link this entity to an external knowledge base.
2076    ///
2077    /// # Examples
2078    /// ```
2079    /// use anno_core::{Entity, EntityType};
2080    /// let mut e = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
2081    /// e.link_to_kb("Q7186");
2082    /// assert_eq!(e.kb_id.as_deref(), Some("Q7186"));
2083    /// ```
2084    pub fn link_to_kb(&mut self, kb_id: impl Into<String>) {
2085        self.kb_id = Some(kb_id.into());
2086    }
2087
2088    /// Assign this entity to a coreference cluster.
2089    ///
2090    /// Entities with the same `canonical_id` refer to the same real-world entity.
2091    pub fn set_canonical(&mut self, canonical_id: impl Into<super::types::CanonicalId>) {
2092        self.canonical_id = Some(canonical_id.into());
2093    }
2094
2095    /// Builder-style method to set canonical ID.
2096    ///
2097    /// # Example
2098    /// ```
2099    /// use anno_core::{CanonicalId, Entity, EntityType};
2100    /// let entity = Entity::new("John", EntityType::Person, 0, 4, 0.9)
2101    ///     .with_canonical_id(42);
2102    /// assert_eq!(entity.canonical_id, Some(CanonicalId::new(42)));
2103    /// ```
2104    #[must_use]
2105    pub fn with_canonical_id(mut self, canonical_id: impl Into<super::types::CanonicalId>) -> Self {
2106        self.canonical_id = Some(canonical_id.into());
2107        self
2108    }
2109
    /// Check if this entity is linked to a knowledge base.
    ///
    /// Returns `true` exactly when `kb_id` is `Some`; the ID itself is not
    /// inspected.
    #[must_use]
    pub fn is_linked(&self) -> bool {
        self.kb_id.is_some()
    }
2115
    /// Check if this entity has coreference information.
    ///
    /// Returns `true` exactly when `canonical_id` is `Some`.
    #[must_use]
    pub fn has_coreference(&self) -> bool {
        self.canonical_id.is_some()
    }
2121
2122    /// Check if this entity has a discontinuous span.
2123    ///
2124    /// Discontinuous entities span non-contiguous text regions.
2125    /// Example: "New York and LA airports" contains "New York airports"
2126    /// as a discontinuous entity.
2127    #[must_use]
2128    pub fn is_discontinuous(&self) -> bool {
2129        self.discontinuous_span
2130            .as_ref()
2131            .map(|s| s.is_discontinuous())
2132            .unwrap_or(false)
2133    }
2134
2135    /// Get the discontinuous segments if present.
2136    ///
2137    /// Returns `None` if this is a contiguous entity.
2138    #[must_use]
2139    pub fn discontinuous_segments(&self) -> Option<Vec<std::ops::Range<usize>>> {
2140        self.discontinuous_span
2141            .as_ref()
2142            .filter(|s| s.is_discontinuous())
2143            .map(|s| s.segments().to_vec())
2144    }
2145
2146    /// Set a discontinuous span for this entity.
2147    ///
2148    /// This is used by W2NER and similar models that detect non-contiguous mentions.
2149    pub fn set_discontinuous_span(&mut self, span: DiscontinuousSpan) {
2150        // Update start/end to match the bounding range
2151        if let Some(bounding) = span.bounding_range() {
2152            self.start = bounding.start;
2153            self.end = bounding.end;
2154        }
2155        self.discontinuous_span = Some(span);
2156    }
2157
2158    /// Get the total length covered by this entity, in **characters**.
2159    ///
2160    /// - **Contiguous**: `end - start`
2161    /// - **Discontinuous**: sum of segment lengths
2162    ///
2163    /// This is intentionally consistent: all offsets in `anno::core` entity spans
2164    /// are **character offsets** (Unicode scalar values), not byte offsets.
2165    #[must_use]
2166    pub fn total_len(&self) -> usize {
2167        if let Some(ref span) = self.discontinuous_span {
2168            span.segments().iter().map(|r| r.end - r.start).sum()
2169        } else {
2170            self.end.saturating_sub(self.start)
2171        }
2172    }
2173
2174    /// Set the normalized form for this entity.
2175    ///
2176    /// # Examples
2177    ///
2178    /// ```rust
2179    /// use anno_core::{Entity, EntityType};
2180    ///
2181    /// let mut entity = Entity::new("Jan 15", EntityType::Date, 0, 6, 0.95);
2182    /// entity.set_normalized("2024-01-15");
2183    /// assert_eq!(entity.normalized.as_deref(), Some("2024-01-15"));
2184    /// ```
2185    pub fn set_normalized(&mut self, normalized: impl Into<String>) {
2186        self.normalized = Some(normalized.into());
2187    }
2188
2189    /// Get the normalized form, or the original text if not normalized.
2190    #[must_use]
2191    pub fn normalized_or_text(&self) -> &str {
2192        self.normalized.as_deref().unwrap_or(&self.text)
2193    }
2194
2195    /// Get the extraction method, if known.
2196    #[must_use]
2197    pub fn method(&self) -> ExtractionMethod {
2198        self.provenance
2199            .as_ref()
2200            .map_or(ExtractionMethod::Unknown, |p| p.method)
2201    }
2202
2203    /// Get the source backend name, if known.
2204    #[must_use]
2205    pub fn source(&self) -> Option<&str> {
2206        self.provenance.as_ref().map(|p| p.source.as_ref())
2207    }
2208
    /// Get the entity category.
    ///
    /// Delegates to `EntityType::category` on this entity's `entity_type`.
    #[must_use]
    pub fn category(&self) -> EntityCategory {
        self.entity_type.category()
    }
2214
    /// Returns true if this entity's type is pattern-detectable (not ML).
    ///
    /// The answer comes from `entity_type.pattern_detectable()` alone; the
    /// entity's `provenance` (how it was actually extracted) is not consulted.
    #[must_use]
    pub fn is_structured(&self) -> bool {
        self.entity_type.pattern_detectable()
    }
2220
    /// Returns true if this entity's type requires ML for detection.
    ///
    /// The answer comes from `entity_type.requires_ml()` alone; the entity's
    /// `provenance` (how it was actually extracted) is not consulted.
    #[must_use]
    pub fn is_named(&self) -> bool {
        self.entity_type.requires_ml()
    }
2226
2227    /// Check if this entity overlaps with another.
2228    #[must_use]
2229    pub fn overlaps(&self, other: &Entity) -> bool {
2230        !(self.end <= other.start || other.end <= self.start)
2231    }
2232
2233    /// Calculate overlap ratio (IoU) with another entity.
2234    #[must_use]
2235    pub fn overlap_ratio(&self, other: &Entity) -> f64 {
2236        let intersection_start = self.start.max(other.start);
2237        let intersection_end = self.end.min(other.end);
2238
2239        if intersection_start >= intersection_end {
2240            return 0.0;
2241        }
2242
2243        let intersection = (intersection_end - intersection_start) as f64;
2244        let union = ((self.end - self.start) + (other.end - other.start)
2245            - (intersection_end - intersection_start)) as f64;
2246
2247        if union == 0.0 {
2248            return 1.0;
2249        }
2250
2251        intersection / union
2252    }
2253
2254    /// Set hierarchical confidence scores.
2255    pub fn set_hierarchical_confidence(&mut self, confidence: HierarchicalConfidence) {
2256        self.confidence = Confidence::new(confidence.as_f64());
2257        self.hierarchical_confidence = Some(confidence);
2258    }
2259
2260    /// Get the linkage confidence (coarse filter score).
2261    #[must_use]
2262    pub fn linkage_confidence(&self) -> f32 {
2263        self.hierarchical_confidence
2264            .map_or(f32::from(self.confidence), |h| h.linkage)
2265    }
2266
2267    /// Get the type classification confidence.
2268    #[must_use]
2269    pub fn type_confidence(&self) -> f32 {
2270        self.hierarchical_confidence
2271            .map_or(f32::from(self.confidence), |h| h.type_score)
2272    }
2273
2274    /// Get the boundary confidence.
2275    #[must_use]
2276    pub fn boundary_confidence(&self) -> f32 {
2277        self.hierarchical_confidence
2278            .map_or(f32::from(self.confidence), |h| h.boundary)
2279    }
2280
    /// Check if this entity has visual location (multi-modal).
    ///
    /// Returns `true` exactly when `visual_span` is `Some`.
    #[must_use]
    pub fn is_visual(&self) -> bool {
        self.visual_span.is_some()
    }
2286
    /// Get the text span (start, end).
    ///
    /// Returns the stored `start` and `end` fields unchanged, as a tuple.
    #[must_use]
    pub const fn text_span(&self) -> (usize, usize) {
        (self.start, self.end)
    }
2292
    /// Get the span length.
    ///
    /// Computed as `end - start`, saturating to `0` when `end < start`.
    /// Unlike `total_len`, this never consults `discontinuous_span`.
    #[must_use]
    pub const fn span_len(&self) -> usize {
        self.end.saturating_sub(self.start)
    }
2298
    // NOTE: the doc text that used to sit here described a helper producing a
    // unified `TextSpan` (both byte and char offsets) from the original source
    // text. No such method is defined at this point, and the text was being
    // attached to `set_visual_span` by rustdoc. Per that text, the conversion
    // lives in the `anno` crate; use `anno::offset::char_to_byte_offsets()`
    // directly for now, e.g.:
    //
    //     let (byte_start, byte_end) = char_to_byte_offsets(text, entity.start, entity.end);

    /// Set visual span for multi-modal extraction.
    ///
    /// Stores `span` in `visual_span`, replacing any previous value. The text
    /// offsets `start`/`end` are left unchanged.
    pub fn set_visual_span(&mut self, span: Span) {
        self.visual_span = Some(span);
    }
2326
2327    /// Safely extract text from source using character offsets.
2328    ///
2329    /// Entity stores character offsets, not byte offsets. This method
2330    /// correctly extracts text by iterating over characters.
2331    ///
2332    /// # Arguments
2333    /// * `source_text` - The original text from which this entity was extracted
2334    ///
2335    /// # Returns
2336    /// The extracted text, or empty string if offsets are invalid
2337    ///
2338    /// # Example
2339    /// ```rust
2340    /// use anno_core::{Entity, EntityType};
2341    ///
2342    /// let text = "Hello, 日本!";
2343    /// let entity = Entity::new("日本", EntityType::Location, 7, 9, 0.95);
2344    /// assert_eq!(entity.extract_text(text), "日本");
2345    /// ```
2346    #[must_use]
2347    pub fn extract_text(&self, source_text: &str) -> String {
2348        // Performance: Use cached length if available, but fallback to counting
2349        // For single entity extraction, this is fine. For batch operations,
2350        // use extract_text_with_len with pre-computed length.
2351        let char_count = source_text.chars().count();
2352        self.extract_text_with_len(source_text, char_count)
2353    }
2354
2355    /// Extract text with pre-computed text length (performance optimization).
2356    ///
2357    /// Use this when validating/clamping multiple entities from the same text
2358    /// to avoid recalculating `text.chars().count()` for each entity.
2359    ///
2360    /// # Arguments
2361    /// * `source_text` - The original text
2362    /// * `text_char_count` - Pre-computed character count (from `text.chars().count()`)
2363    ///
2364    /// # Returns
2365    /// The extracted text, or empty string if offsets are invalid
2366    #[must_use]
2367    pub fn extract_text_with_len(&self, source_text: &str, text_char_count: usize) -> String {
2368        if self.start >= text_char_count || self.end > text_char_count || self.start >= self.end {
2369            return String::new();
2370        }
2371        source_text
2372            .chars()
2373            .skip(self.start)
2374            .take(self.end - self.start)
2375            .collect()
2376    }
2377
2378    // =========================================================================
2379    // Temporal Validity Methods
2380    // =========================================================================
2381
    /// Set the temporal validity start for this entity assertion.
    ///
    /// Replaces any previous `valid_from`; `valid_until` is left unchanged and
    /// no ordering check between the two bounds is performed.
    ///
    /// # Example
    /// ```rust
    /// use anno_core::{Entity, EntityType};
    /// use chrono::{TimeZone, Utc};
    ///
    /// let mut entity = Entity::new("CEO", EntityType::Person, 0, 3, 0.9);
    /// entity.set_valid_from(Utc.with_ymd_and_hms(2008, 10, 1, 0, 0, 0).unwrap());
    /// assert!(entity.is_temporal());
    /// ```
    pub fn set_valid_from(&mut self, dt: chrono::DateTime<chrono::Utc>) {
        self.valid_from = Some(dt);
    }
2396
    /// Set the temporal validity end for this entity assertion.
    ///
    /// Replaces any previous `valid_until`; `valid_from` is left unchanged and
    /// no ordering check between the two bounds is performed.
    pub fn set_valid_until(&mut self, dt: chrono::DateTime<chrono::Utc>) {
        self.valid_until = Some(dt);
    }
2401
2402    /// Set both temporal bounds at once.
2403    pub fn set_temporal_range(
2404        &mut self,
2405        from: chrono::DateTime<chrono::Utc>,
2406        until: chrono::DateTime<chrono::Utc>,
2407    ) {
2408        self.valid_from = Some(from);
2409        self.valid_until = Some(until);
2410    }
2411
    /// Check if this entity has temporal validity information.
    ///
    /// Returns `true` when at least one of `valid_from` / `valid_until` is set.
    #[must_use]
    pub fn is_temporal(&self) -> bool {
        self.valid_from.is_some() || self.valid_until.is_some()
    }
2417
2418    /// Check if this entity was valid at a specific point in time.
2419    ///
2420    /// Returns `true` if:
2421    /// - No temporal bounds are set (atemporal entity)
2422    /// - The timestamp falls within [valid_from, valid_until]
2423    ///
2424    /// # Example
2425    /// ```rust
2426    /// use anno_core::{Entity, EntityType};
2427    /// use chrono::{TimeZone, Utc};
2428    ///
2429    /// let mut entity = Entity::new("CEO of Microsoft", EntityType::Person, 0, 16, 0.9);
2430    /// entity.set_valid_from(Utc.with_ymd_and_hms(2008, 1, 1, 0, 0, 0).unwrap());
2431    /// entity.set_valid_until(Utc.with_ymd_and_hms(2023, 12, 31, 0, 0, 0).unwrap());
2432    ///
2433    /// let query_2015 = Utc.with_ymd_and_hms(2015, 6, 1, 0, 0, 0).unwrap();
2434    /// let query_2005 = Utc.with_ymd_and_hms(2005, 6, 1, 0, 0, 0).unwrap();
2435    ///
2436    /// assert!(entity.valid_at(&query_2015));
2437    /// assert!(!entity.valid_at(&query_2005));
2438    /// ```
2439    #[must_use]
2440    pub fn valid_at(&self, timestamp: &chrono::DateTime<chrono::Utc>) -> bool {
2441        match (&self.valid_from, &self.valid_until) {
2442            (None, None) => true,                      // Atemporal - always valid
2443            (Some(from), None) => timestamp >= from,   // Started, still valid
2444            (None, Some(until)) => timestamp <= until, // Unknown start, ended
2445            (Some(from), Some(until)) => timestamp >= from && timestamp <= until,
2446        }
2447    }
2448
2449    /// Check if this entity is currently valid (at the current time).
2450    #[must_use]
2451    pub fn is_currently_valid(&self) -> bool {
2452        self.valid_at(&chrono::Utc::now())
2453    }
2454
2455    // =========================================================================
2456    // Viewport/Context Methods
2457    // =========================================================================
2458
2459    /// Set the viewport context for this entity.
2460    ///
2461    /// # Example
2462    /// ```rust
2463    /// use anno_core::{Entity, EntityType, EntityViewport};
2464    ///
2465    /// let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
2466    /// entity.set_viewport(EntityViewport::Academic);
2467    /// assert!(entity.has_viewport());
2468    /// ```
2469    pub fn set_viewport(&mut self, viewport: EntityViewport) {
2470        self.viewport = Some(viewport);
2471    }
2472
2473    /// Check if this entity has a viewport context.
2474    #[must_use]
2475    pub fn has_viewport(&self) -> bool {
2476        self.viewport.is_some()
2477    }
2478
2479    /// Get the viewport, defaulting to General if not set.
2480    #[must_use]
2481    pub fn viewport_or_default(&self) -> EntityViewport {
2482        self.viewport.clone().unwrap_or_default()
2483    }
2484
2485    /// Check if this entity matches a viewport context.
2486    ///
2487    /// Returns true if:
2488    /// - The entity has no viewport (matches any)
2489    /// - The entity's viewport matches the query
2490    #[must_use]
2491    pub fn matches_viewport(&self, query_viewport: &EntityViewport) -> bool {
2492        match &self.viewport {
2493            None => true, // No viewport = matches any
2494            Some(v) => v == query_viewport,
2495        }
2496    }
2497
2498    /// Create a builder for fluent entity construction.
2499    #[must_use]
2500    pub fn builder(text: impl Into<String>, entity_type: EntityType) -> EntityBuilder {
2501        EntityBuilder::new(text, entity_type)
2502    }
2503
2504    // =========================================================================
2505    // Validation Methods (Production Quality)
2506    // =========================================================================
2507
2508    /// Validate this entity against the source text.
2509    ///
2510    /// Returns a list of validation issues. Empty list means the entity is valid.
2511    ///
2512    /// # Checks Performed
2513    ///
2514    /// 1. **Span bounds**: `start < end`, both within text length
2515    /// 2. **Text match**: `text` matches the span in source
2516    /// 3. **Confidence range**: `confidence` in [0.0, 1.0]
2517    /// 4. **Type consistency**: Custom types have non-empty names
2518    /// 5. **Discontinuous consistency**: If present, segments are valid
2519    ///
2520    /// # Example
2521    ///
2522    /// ```rust
2523    /// use anno_core::{Entity, EntityType};
2524    ///
2525    /// let text = "John works at Apple";
2526    /// let entity = Entity::new("John", EntityType::Person, 0, 4, 0.95);
2527    ///
2528    /// let issues = entity.validate(text);
2529    /// assert!(issues.is_empty(), "Entity should be valid");
2530    ///
2531    /// // Invalid entity: span doesn't match text
2532    /// let bad = Entity::new("Jane", EntityType::Person, 0, 4, 0.95);
2533    /// let issues = bad.validate(text);
2534    /// assert!(!issues.is_empty(), "Entity text doesn't match span");
2535    /// ```
2536    #[must_use]
2537    pub fn validate(&self, source_text: &str) -> Vec<ValidationIssue> {
2538        // Performance: Calculate length once, delegate to optimized version
2539        let char_count = source_text.chars().count();
2540        self.validate_with_len(source_text, char_count)
2541    }
2542
2543    /// Validate entity with pre-computed text length (performance optimization).
2544    ///
2545    /// Use this when validating multiple entities from the same text to avoid
2546    /// recalculating `text.chars().count()` for each entity.
2547    ///
2548    /// # Arguments
2549    /// * `source_text` - The original text
2550    /// * `text_char_count` - Pre-computed character count (from `text.chars().count()`)
2551    ///
2552    /// # Returns
2553    /// Vector of validation issues (empty if valid)
2554    #[must_use]
2555    pub fn validate_with_len(
2556        &self,
2557        source_text: &str,
2558        text_char_count: usize,
2559    ) -> Vec<ValidationIssue> {
2560        let mut issues = Vec::new();
2561
2562        // 1. Span bounds
2563        if self.start >= self.end {
2564            issues.push(ValidationIssue::InvalidSpan {
2565                start: self.start,
2566                end: self.end,
2567                reason: "start must be less than end".to_string(),
2568            });
2569        }
2570
2571        if self.end > text_char_count {
2572            issues.push(ValidationIssue::SpanOutOfBounds {
2573                end: self.end,
2574                text_len: text_char_count,
2575            });
2576        }
2577
2578        // 2. Text match (only if span is valid)
2579        if self.start < self.end && self.end <= text_char_count {
2580            let actual = self.extract_text_with_len(source_text, text_char_count);
2581            if actual != self.text {
2582                issues.push(ValidationIssue::TextMismatch {
2583                    expected: self.text.clone(),
2584                    actual,
2585                    start: self.start,
2586                    end: self.end,
2587                });
2588            }
2589        }
2590
2591        // 3. Confidence range (now enforced by the Confidence type, so this is a no-op)
2592
2593        // 4. Type consistency
2594        if let EntityType::Custom { ref name, .. } = self.entity_type {
2595            if name.is_empty() {
2596                issues.push(ValidationIssue::InvalidType {
2597                    reason: "Custom entity type has empty name".to_string(),
2598                });
2599            }
2600        }
2601
2602        // 5. Discontinuous span consistency
2603        if let Some(ref disc_span) = self.discontinuous_span {
2604            for (i, seg) in disc_span.segments().iter().enumerate() {
2605                if seg.start >= seg.end {
2606                    issues.push(ValidationIssue::InvalidSpan {
2607                        start: seg.start,
2608                        end: seg.end,
2609                        reason: format!("discontinuous segment {} is invalid", i),
2610                    });
2611                }
2612                if seg.end > text_char_count {
2613                    issues.push(ValidationIssue::SpanOutOfBounds {
2614                        end: seg.end,
2615                        text_len: text_char_count,
2616                    });
2617                }
2618            }
2619        }
2620
2621        issues
2622    }
2623
2624    /// Check if this entity is valid against the source text.
2625    ///
2626    /// Convenience method that returns `true` if `validate()` returns empty.
2627    #[must_use]
2628    pub fn is_valid(&self, source_text: &str) -> bool {
2629        self.validate(source_text).is_empty()
2630    }
2631
2632    /// Validate a batch of entities efficiently.
2633    ///
2634    /// Returns a map of entity index -> validation issues.
2635    /// Only entities with issues are included.
2636    ///
2637    /// # Example
2638    ///
2639    /// ```rust
2640    /// use anno_core::{Entity, EntityType};
2641    ///
2642    /// let text = "John and Jane work at Apple";
2643    /// let entities = vec![
2644    ///     Entity::new("John", EntityType::Person, 0, 4, 0.95),
2645    ///     Entity::new("Wrong", EntityType::Person, 9, 13, 0.8),
2646    /// ];
2647    ///
2648    /// let issues = Entity::validate_batch(&entities, text);
2649    /// assert!(issues.is_empty() || issues.contains_key(&1)); // Second entity might fail
2650    /// ```
2651    #[must_use]
2652    pub fn validate_batch(
2653        entities: &[Entity],
2654        source_text: &str,
2655    ) -> std::collections::HashMap<usize, Vec<ValidationIssue>> {
2656        entities
2657            .iter()
2658            .enumerate()
2659            .filter_map(|(idx, entity)| {
2660                let issues = entity.validate(source_text);
2661                if issues.is_empty() {
2662                    None
2663                } else {
2664                    Some((idx, issues))
2665                }
2666            })
2667            .collect()
2668    }
2669}
2670
/// A single problem reported while validating an entity against its source text.
///
/// Each variant carries the data needed to render a human-readable message
/// through its `Display` implementation.
#[derive(Debug, Clone, PartialEq)]
pub enum ValidationIssue {
    /// The span is empty or inverted (`start >= end`).
    InvalidSpan {
        /// Start offset of the offending span.
        start: usize,
        /// End offset of the offending span.
        end: usize,
        /// Human-readable explanation of the problem.
        reason: String,
    },
    /// The span ends past the end of the text.
    SpanOutOfBounds {
        /// End offset that exceeds the text length.
        end: usize,
        /// Length of the text the span was checked against.
        text_len: usize,
    },
    /// The entity's stored text differs from what the span covers in the source.
    TextMismatch {
        /// Text stored on the entity.
        expected: String,
        /// Text actually found at the span in the source.
        actual: String,
        /// Start offset of the span.
        start: usize,
        /// End offset of the span.
        end: usize,
    },
    /// A confidence value lies outside `[0.0, 1.0]`.
    ///
    /// `Entity::validate_with_len` does not emit this variant: its confidence
    /// check is a no-op because the `Confidence` type enforces the range.
    InvalidConfidence {
        /// The out-of-range confidence value.
        value: f64,
    },
    /// The entity type itself is malformed.
    InvalidType {
        /// Human-readable explanation of the problem.
        reason: String,
    },
}
2712
2713impl std::fmt::Display for ValidationIssue {
2714    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2715        match self {
2716            ValidationIssue::InvalidSpan { start, end, reason } => {
2717                write!(f, "Invalid span [{}, {}): {}", start, end, reason)
2718            }
2719            ValidationIssue::SpanOutOfBounds { end, text_len } => {
2720                write!(f, "Span end {} exceeds text length {}", end, text_len)
2721            }
2722            ValidationIssue::TextMismatch {
2723                expected,
2724                actual,
2725                start,
2726                end,
2727            } => {
2728                write!(
2729                    f,
2730                    "Text mismatch at [{}, {}): expected '{}', got '{}'",
2731                    start, end, expected, actual
2732                )
2733            }
2734            ValidationIssue::InvalidConfidence { value } => {
2735                write!(f, "Confidence {} outside [0.0, 1.0]", value)
2736            }
2737            ValidationIssue::InvalidType { reason } => {
2738                write!(f, "Invalid entity type: {}", reason)
2739            }
2740        }
2741    }
2742}
2743
2744/// Fluent builder for constructing entities with optional fields.
2745///
2746/// # Example
2747///
2748/// ```rust
2749/// use anno_core::{Entity, EntityType, Provenance};
2750///
2751/// let entity = Entity::builder("Marie Curie", EntityType::Person)
2752///     .span(0, 11)
2753///     .confidence(0.95)
2754///     .kb_id("Q7186")
2755///     .provenance(Provenance::ml("bert", 0.95))
2756///     .build();
2757/// ```
2758#[derive(Debug, Clone)]
2759pub struct EntityBuilder {
2760    text: String,
2761    entity_type: EntityType,
2762    start: usize,
2763    end: usize,
2764    confidence: Confidence,
2765    normalized: Option<String>,
2766    provenance: Option<Provenance>,
2767    kb_id: Option<String>,
2768    canonical_id: Option<super::types::CanonicalId>,
2769    hierarchical_confidence: Option<HierarchicalConfidence>,
2770    visual_span: Option<Span>,
2771    discontinuous_span: Option<DiscontinuousSpan>,
2772    valid_from: Option<chrono::DateTime<chrono::Utc>>,
2773    valid_until: Option<chrono::DateTime<chrono::Utc>>,
2774    viewport: Option<EntityViewport>,
2775    phi_features: Option<PhiFeatures>,
2776    mention_type: Option<MentionType>,
2777}
2778
2779impl EntityBuilder {
2780    /// Create a new builder.
2781    #[must_use]
2782    pub fn new(text: impl Into<String>, entity_type: EntityType) -> Self {
2783        let text = text.into();
2784        let end = text.chars().count();
2785        Self {
2786            text,
2787            entity_type,
2788            start: 0,
2789            end,
2790            confidence: Confidence::ONE,
2791            normalized: None,
2792            provenance: None,
2793            kb_id: None,
2794            canonical_id: None,
2795            hierarchical_confidence: None,
2796            visual_span: None,
2797            discontinuous_span: None,
2798            valid_from: None,
2799            valid_until: None,
2800            viewport: None,
2801            phi_features: None,
2802            mention_type: None,
2803        }
2804    }
2805
2806    /// Set span offsets.
2807    #[must_use]
2808    pub const fn span(mut self, start: usize, end: usize) -> Self {
2809        self.start = start;
2810        self.end = end;
2811        self
2812    }
2813
2814    /// Set confidence score.
2815    #[must_use]
2816    pub fn confidence(mut self, confidence: impl Into<Confidence>) -> Self {
2817        self.confidence = confidence.into();
2818        self
2819    }
2820
2821    /// Set hierarchical confidence.
2822    #[must_use]
2823    pub fn hierarchical_confidence(mut self, confidence: HierarchicalConfidence) -> Self {
2824        self.confidence = Confidence::new(confidence.as_f64());
2825        self.hierarchical_confidence = Some(confidence);
2826        self
2827    }
2828
2829    /// Set normalized form.
2830    #[must_use]
2831    pub fn normalized(mut self, normalized: impl Into<String>) -> Self {
2832        self.normalized = Some(normalized.into());
2833        self
2834    }
2835
2836    /// Set provenance.
2837    #[must_use]
2838    pub fn provenance(mut self, provenance: Provenance) -> Self {
2839        self.provenance = Some(provenance);
2840        self
2841    }
2842
2843    /// Set knowledge base ID.
2844    #[must_use]
2845    pub fn kb_id(mut self, kb_id: impl Into<String>) -> Self {
2846        self.kb_id = Some(kb_id.into());
2847        self
2848    }
2849
2850    /// Set canonical (coreference) ID.
2851    #[must_use]
2852    pub const fn canonical_id(mut self, canonical_id: u64) -> Self {
2853        self.canonical_id = Some(super::types::CanonicalId::new(canonical_id));
2854        self
2855    }
2856
2857    /// Set visual span.
2858    #[must_use]
2859    pub fn visual_span(mut self, span: Span) -> Self {
2860        self.visual_span = Some(span);
2861        self
2862    }
2863
2864    /// Set discontinuous span for non-contiguous entities.
2865    ///
2866    /// This automatically updates `start` and `end` to the bounding range.
2867    #[must_use]
2868    pub fn discontinuous_span(mut self, span: DiscontinuousSpan) -> Self {
2869        // Update start/end to bounding range
2870        if let Some(bounding) = span.bounding_range() {
2871            self.start = bounding.start;
2872            self.end = bounding.end;
2873        }
2874        self.discontinuous_span = Some(span);
2875        self
2876    }
2877
2878    /// Set temporal validity start (when this entity assertion became true).
2879    ///
2880    /// # Example
2881    /// ```rust
2882    /// use anno_core::{EntityBuilder, EntityType};
2883    /// use chrono::{TimeZone, Utc};
2884    ///
2885    /// let entity = EntityBuilder::new("CEO of Microsoft", EntityType::Person)
2886    ///     .span(0, 12)
2887    ///     .valid_from(Utc.with_ymd_and_hms(2008, 10, 1, 0, 0, 0).unwrap())
2888    ///     .build();
2889    /// assert!(entity.valid_from.is_some());
2890    /// ```
2891    #[must_use]
2892    pub fn valid_from(mut self, dt: chrono::DateTime<chrono::Utc>) -> Self {
2893        self.valid_from = Some(dt);
2894        self
2895    }
2896
2897    /// Set temporal validity end (when this entity assertion stopped being true).
2898    #[must_use]
2899    pub fn valid_until(mut self, dt: chrono::DateTime<chrono::Utc>) -> Self {
2900        self.valid_until = Some(dt);
2901        self
2902    }
2903
2904    /// Set temporal validity range (convenience method).
2905    #[must_use]
2906    pub fn temporal_range(
2907        mut self,
2908        from: chrono::DateTime<chrono::Utc>,
2909        until: chrono::DateTime<chrono::Utc>,
2910    ) -> Self {
2911        self.valid_from = Some(from);
2912        self.valid_until = Some(until);
2913        self
2914    }
2915
2916    /// Set the viewport context for multi-faceted entity representation.
2917    ///
2918    /// # Example
2919    /// ```rust
2920    /// use anno_core::{EntityBuilder, EntityType, EntityViewport};
2921    ///
2922    /// let entity = EntityBuilder::new("Marie Curie", EntityType::Person)
2923    ///     .span(0, 11)
2924    ///     .viewport(EntityViewport::Academic)
2925    ///     .build();
2926    /// assert_eq!(entity.viewport, Some(EntityViewport::Academic));
2927    /// ```
2928    #[must_use]
2929    pub fn viewport(mut self, viewport: EntityViewport) -> Self {
2930        self.viewport = Some(viewport);
2931        self
2932    }
2933
2934    /// Set phi-features (person, number, gender) for morphological agreement.
2935    #[must_use]
2936    pub fn phi_features(mut self, phi_features: PhiFeatures) -> Self {
2937        self.phi_features = Some(phi_features);
2938        self
2939    }
2940
2941    /// Set mention type classification.
2942    #[must_use]
2943    pub fn mention_type(mut self, mention_type: MentionType) -> Self {
2944        self.mention_type = Some(mention_type);
2945        self
2946    }
2947
2948    /// Build the entity.
2949    #[must_use]
2950    pub fn build(self) -> Entity {
2951        Entity {
2952            text: self.text,
2953            entity_type: self.entity_type,
2954            start: self.start,
2955            end: self.end,
2956            confidence: self.confidence,
2957            normalized: self.normalized,
2958            provenance: self.provenance,
2959            kb_id: self.kb_id,
2960            canonical_id: self.canonical_id,
2961            hierarchical_confidence: self.hierarchical_confidence,
2962            visual_span: self.visual_span,
2963            discontinuous_span: self.discontinuous_span,
2964            valid_from: self.valid_from,
2965            valid_until: self.valid_until,
2966            viewport: self.viewport,
2967            phi_features: self.phi_features,
2968            mention_type: self.mention_type,
2969        }
2970    }
2971}
2972
2973// ============================================================================
2974// Relation (for Knowledge Graph Construction)
2975// ============================================================================
2976
2977/// A relation between two entities, forming a knowledge graph triple.
2978///
2979/// In the GLiNER bi-encoder paradigm, relations are detected just like entities:
2980/// the relation trigger text ("CEO of", "located in") is matched against
2981/// relation type labels in the same latent space.
2982///
2983/// # Structure
2984///
2985/// ```text
2986/// Triple: (Head, Relation, Tail)
2987///
2988/// "Marie Curie worked at the Sorbonne"
2989///  ^^^^^^^^^^^ ~~~~~~~~~ ^^^^^^^^
2990///  Head        Rel       Tail
2991///  (Person)  (Employment)  (Organization)
2992/// ```
2993///
2994/// # TPLinker/Joint Extraction
2995///
2996/// For joint extraction, relations are extracted in a single pass with entities.
2997/// The `trigger_span` captures the text that indicates the relation.
2998#[derive(Debug, Clone, Serialize, Deserialize)]
2999pub struct Relation {
3000    /// The source entity (head of the triple)
3001    pub head: Entity,
3002    /// The target entity (tail of the triple)
3003    pub tail: Entity,
3004    /// Relation type label (e.g., "EMPLOYMENT", "LOCATED_IN", "FOUNDED_BY")
3005    pub relation_type: String,
3006    /// Optional trigger span: the text that indicates this relation
3007    /// For "CEO of", this would be the span covering "CEO of"
3008    pub trigger_span: Option<(usize, usize)>,
3009    /// Confidence score for this relation (0.0-1.0).
3010    pub confidence: Confidence,
3011}
3012
3013impl Relation {
3014    /// Create a new relation between two entities.
3015    #[must_use]
3016    pub fn new(
3017        head: Entity,
3018        tail: Entity,
3019        relation_type: impl Into<String>,
3020        confidence: impl Into<Confidence>,
3021    ) -> Self {
3022        Self {
3023            head,
3024            tail,
3025            relation_type: relation_type.into(),
3026            trigger_span: None,
3027            confidence: confidence.into(),
3028        }
3029    }
3030
3031    /// Create a relation with an explicit trigger span.
3032    #[must_use]
3033    pub fn with_trigger(
3034        head: Entity,
3035        tail: Entity,
3036        relation_type: impl Into<String>,
3037        trigger_start: usize,
3038        trigger_end: usize,
3039        confidence: impl Into<Confidence>,
3040    ) -> Self {
3041        Self {
3042            head,
3043            tail,
3044            relation_type: relation_type.into(),
3045            trigger_span: Some((trigger_start, trigger_end)),
3046            confidence: confidence.into(),
3047        }
3048    }
3049
3050    /// Convert to a triple string representation (for debugging/display).
3051    #[must_use]
3052    pub fn as_triple(&self) -> String {
3053        format!(
3054            "({}, {}, {})",
3055            self.head.text, self.relation_type, self.tail.text
3056        )
3057    }
3058
3059    /// Check if the head and tail entities are adjacent (within n tokens).
3060    /// Useful for filtering spurious long-distance relations.
3061    #[must_use]
3062    pub fn span_distance(&self) -> usize {
3063        if self.head.end <= self.tail.start {
3064            self.tail.start.saturating_sub(self.head.end)
3065        } else if self.tail.end <= self.head.start {
3066            self.head.start.saturating_sub(self.tail.end)
3067        } else {
3068            0 // Overlapping spans
3069        }
3070    }
3071}
3072
3073#[cfg(test)]
3074mod tests {
3075    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in test code
3076    use super::*;
3077
3078    #[test]
3079    fn test_entity_type_roundtrip() {
3080        let types = [
3081            EntityType::Person,
3082            EntityType::Organization,
3083            EntityType::Location,
3084            EntityType::Date,
3085            EntityType::Money,
3086            EntityType::Percent,
3087        ];
3088
3089        for t in types {
3090            let label = t.as_label();
3091            let parsed = EntityType::from_label(label);
3092            assert_eq!(t, parsed);
3093        }
3094    }
3095
3096    #[test]
3097    fn test_entity_overlap() {
3098        let e1 = Entity::new("John", EntityType::Person, 0, 4, 0.9);
3099        let e2 = Entity::new("Smith", EntityType::Person, 5, 10, 0.9);
3100        let e3 = Entity::new("John Smith", EntityType::Person, 0, 10, 0.9);
3101
3102        assert!(!e1.overlaps(&e2)); // No overlap
3103        assert!(e1.overlaps(&e3)); // e1 is contained in e3
3104        assert!(e3.overlaps(&e2)); // e3 contains e2
3105    }
3106
3107    #[test]
3108    fn test_confidence_clamping() {
3109        let e1 = Entity::new("test", EntityType::Person, 0, 4, 1.5);
3110        assert!((e1.confidence - 1.0).abs() < f64::EPSILON);
3111
3112        let e2 = Entity::new("test", EntityType::Person, 0, 4, -0.5);
3113        assert!(e2.confidence.abs() < f64::EPSILON);
3114    }
3115
3116    #[test]
3117    fn test_entity_categories() {
3118        // Agent/Org/Place entities require ML
3119        assert_eq!(EntityType::Person.category(), EntityCategory::Agent);
3120        assert_eq!(
3121            EntityType::Organization.category(),
3122            EntityCategory::Organization
3123        );
3124        assert_eq!(EntityType::Location.category(), EntityCategory::Place);
3125        assert!(EntityType::Person.requires_ml());
3126        assert!(!EntityType::Person.pattern_detectable());
3127
3128        // Temporal entities are pattern-detectable
3129        assert_eq!(EntityType::Date.category(), EntityCategory::Temporal);
3130        assert_eq!(EntityType::Time.category(), EntityCategory::Temporal);
3131        assert!(EntityType::Date.pattern_detectable());
3132        assert!(!EntityType::Date.requires_ml());
3133
3134        // Numeric entities are pattern-detectable
3135        assert_eq!(EntityType::Money.category(), EntityCategory::Numeric);
3136        assert_eq!(EntityType::Percent.category(), EntityCategory::Numeric);
3137        assert!(EntityType::Money.pattern_detectable());
3138
3139        // Contact entities are pattern-detectable
3140        assert_eq!(EntityType::Email.category(), EntityCategory::Contact);
3141        assert_eq!(EntityType::Url.category(), EntityCategory::Contact);
3142        assert_eq!(EntityType::Phone.category(), EntityCategory::Contact);
3143        assert!(EntityType::Email.pattern_detectable());
3144    }
3145
3146    #[test]
3147    fn test_new_types_roundtrip() {
3148        let types = [
3149            EntityType::Time,
3150            EntityType::Email,
3151            EntityType::Url,
3152            EntityType::Phone,
3153            EntityType::Quantity,
3154            EntityType::Cardinal,
3155            EntityType::Ordinal,
3156        ];
3157
3158        for t in types {
3159            let label = t.as_label();
3160            let parsed = EntityType::from_label(label);
3161            assert_eq!(t, parsed, "Roundtrip failed for {}", label);
3162        }
3163    }
3164
3165    #[test]
3166    fn test_custom_entity_type() {
3167        let disease = EntityType::custom("DISEASE", EntityCategory::Agent);
3168        assert_eq!(disease.as_label(), "DISEASE");
3169        assert!(disease.requires_ml());
3170
3171        let product_id = EntityType::custom("PRODUCT_ID", EntityCategory::Misc);
3172        assert_eq!(product_id.as_label(), "PRODUCT_ID");
3173        assert!(!product_id.requires_ml());
3174        assert!(!product_id.pattern_detectable());
3175    }
3176
3177    #[test]
3178    fn test_entity_normalization() {
3179        let mut e = Entity::new("Jan 15", EntityType::Date, 0, 6, 0.95);
3180        assert!(e.normalized.is_none());
3181        assert_eq!(e.normalized_or_text(), "Jan 15");
3182
3183        e.set_normalized("2024-01-15");
3184        assert_eq!(e.normalized.as_deref(), Some("2024-01-15"));
3185        assert_eq!(e.normalized_or_text(), "2024-01-15");
3186    }
3187
3188    #[test]
3189    fn test_entity_helpers() {
3190        let named = Entity::new("John", EntityType::Person, 0, 4, 0.9);
3191        assert!(named.is_named());
3192        assert!(!named.is_structured());
3193        assert_eq!(named.category(), EntityCategory::Agent);
3194
3195        let structured = Entity::new("$100", EntityType::Money, 0, 4, 0.95);
3196        assert!(!structured.is_named());
3197        assert!(structured.is_structured());
3198        assert_eq!(structured.category(), EntityCategory::Numeric);
3199    }
3200
3201    #[test]
3202    fn test_knowledge_linking() {
3203        let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
3204        assert!(!entity.is_linked());
3205        assert!(!entity.has_coreference());
3206
3207        entity.link_to_kb("Q7186"); // Wikidata ID
3208        assert!(entity.is_linked());
3209        assert_eq!(entity.kb_id.as_deref(), Some("Q7186"));
3210
3211        entity.set_canonical(42);
3212        assert!(entity.has_coreference());
3213        assert_eq!(
3214            entity.canonical_id,
3215            Some(crate::core::types::CanonicalId::new(42))
3216        );
3217    }
3218
3219    #[test]
3220    fn test_relation_creation() {
3221        let head = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
3222        let tail = Entity::new("Sorbonne", EntityType::Organization, 24, 32, 0.90);
3223
3224        let relation = Relation::new(head.clone(), tail.clone(), "WORKED_AT", 0.85);
3225        assert_eq!(relation.relation_type, "WORKED_AT");
3226        assert_eq!(relation.as_triple(), "(Marie Curie, WORKED_AT, Sorbonne)");
3227        assert!(relation.trigger_span.is_none());
3228
3229        // With trigger span
3230        let relation2 = Relation::with_trigger(head, tail, "EMPLOYMENT", 13, 19, 0.85);
3231        assert_eq!(relation2.trigger_span, Some((13, 19)));
3232    }
3233
3234    #[test]
3235    fn test_relation_span_distance() {
3236        // Head at 0-11, tail at 24-32 -> distance is 24-11 = 13
3237        let head = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
3238        let tail = Entity::new("Sorbonne", EntityType::Organization, 24, 32, 0.90);
3239        let relation = Relation::new(head, tail, "WORKED_AT", 0.85);
3240        assert_eq!(relation.span_distance(), 13);
3241    }
3242
3243    #[test]
3244    fn test_relation_category() {
3245        // Relation types should be categorized as Relation
3246        let rel_type = EntityType::custom("CEO_OF", EntityCategory::Relation);
3247        assert_eq!(rel_type.category(), EntityCategory::Relation);
3248        assert!(rel_type.category().is_relation());
3249        assert!(rel_type.requires_ml()); // Relations require ML
3250    }
3251
3252    // ========================================================================
3253    // Span Tests
3254    // ========================================================================
3255
3256    #[test]
3257    fn test_span_text() {
3258        let span = Span::text(10, 20);
3259        assert!(span.is_text());
3260        assert!(!span.is_visual());
3261        assert_eq!(span.text_offsets(), Some((10, 20)));
3262        assert_eq!(span.len(), 10);
3263        assert!(!span.is_empty());
3264    }
3265
3266    #[test]
3267    fn test_span_bbox() {
3268        let span = Span::bbox(0.1, 0.2, 0.3, 0.4);
3269        assert!(!span.is_text());
3270        assert!(span.is_visual());
3271        assert_eq!(span.text_offsets(), None);
3272        assert_eq!(span.len(), 0); // No text length
3273    }
3274
3275    #[test]
3276    fn test_span_bbox_with_page() {
3277        let span = Span::bbox_on_page(0.1, 0.2, 0.3, 0.4, 5);
3278        if let Span::BoundingBox { page, .. } = span {
3279            assert_eq!(page, Some(5));
3280        } else {
3281            panic!("Expected BoundingBox");
3282        }
3283    }
3284
3285    #[test]
3286    fn test_span_hybrid() {
3287        let bbox = Span::bbox(0.1, 0.2, 0.3, 0.4);
3288        let hybrid = Span::Hybrid {
3289            start: 10,
3290            end: 20,
3291            bbox: Box::new(bbox),
3292        };
3293        assert!(hybrid.is_text());
3294        assert!(hybrid.is_visual());
3295        assert_eq!(hybrid.text_offsets(), Some((10, 20)));
3296        assert_eq!(hybrid.len(), 10);
3297    }
3298
3299    // ========================================================================
3300    // Hierarchical Confidence Tests
3301    // ========================================================================
3302
3303    #[test]
3304    fn test_hierarchical_confidence_new() {
3305        let hc = HierarchicalConfidence::new(0.9, 0.8, 0.7);
3306        assert!((hc.linkage - 0.9).abs() < f32::EPSILON);
3307        assert!((hc.type_score - 0.8).abs() < f32::EPSILON);
3308        assert!((hc.boundary - 0.7).abs() < f32::EPSILON);
3309    }
3310
/// Out-of-range inputs are clamped: 1.5 becomes 1.0, -0.5 becomes 0.0, in-range values are kept.
#[test]
fn test_hierarchical_confidence_clamping() {
    let clamped = HierarchicalConfidence::new(1.5, -0.5, 0.5);

    // Above the range: pinned to the upper bound.
    let linkage_err = (clamped.linkage - 1.0).abs();
    assert!(linkage_err < f32::EPSILON);

    // Below the range: pinned to zero.
    let type_magnitude = clamped.type_score.abs();
    assert!(type_magnitude < f32::EPSILON);

    // Already inside the range: untouched.
    let boundary_err = (clamped.boundary - 0.5).abs();
    assert!(boundary_err < f32::EPSILON);
}
3318
/// `from_single` copies one score into linkage, type, and boundary alike.
#[test]
fn test_hierarchical_confidence_from_single() {
    let uniform = HierarchicalConfidence::from_single(0.8);

    let linkage_err = (uniform.linkage - 0.8).abs();
    let type_err = (uniform.type_score - 0.8).abs();
    let boundary_err = (uniform.boundary - 0.8).abs();

    assert!(linkage_err < f32::EPSILON);
    assert!(type_err < f32::EPSILON);
    assert!(boundary_err < f32::EPSILON);
}
3326
/// When all three scores are equal, `combined()` returns that same value.
#[test]
fn test_hierarchical_confidence_combined() {
    // (uniform score, allowed absolute error)
    // Geometric mean of three equal values is the value itself,
    // e.g. (0.5 * 0.5 * 0.5)^(1/3) = 0.5.
    let cases: [(f32, f32); 3] = [
        (1.0, f32::EPSILON),
        (0.8, f32::EPSILON),
        (0.5, 0.001),
    ];

    for &(level, tolerance) in cases.iter() {
        let scores = HierarchicalConfidence::new(level, level, level);
        let error = (scores.combined() - level).abs();
        assert!(error < tolerance);
    }
}
3339
/// `passes_threshold` accepts thresholds at or below the scores and rejects a stricter one.
#[test]
fn test_hierarchical_confidence_threshold() {
    let scores = HierarchicalConfidence::new(0.9, 0.8, 0.7);

    // Thresholds below, and exactly equal to, every score are accepted.
    let passes_loose = scores.passes_threshold(0.5, 0.5, 0.5);
    let passes_exact = scores.passes_threshold(0.9, 0.8, 0.7);
    assert!(passes_loose);
    assert!(passes_exact);

    // Raising a single threshold above its score is enough to reject.
    let passes_strict_linkage = scores.passes_threshold(0.95, 0.8, 0.7); // linkage threshold too high
    let passes_strict_type = scores.passes_threshold(0.9, 0.85, 0.7); // type threshold too high
    assert!(!passes_strict_linkage);
    assert!(!passes_strict_type);
}
3348
/// An `f64` converts into a `HierarchicalConfidence` whose linkage matches the input.
#[test]
fn test_hierarchical_confidence_from_f64() {
    let raw = 0.85_f64;
    let converted: HierarchicalConfidence = raw.into();

    let linkage_err = (converted.linkage - 0.85).abs();
    assert!(linkage_err < 0.001);
}
3354
3355    // ========================================================================
3356    // RaggedBatch Tests
3357    // ========================================================================
3358
/// Building a batch from sequences of lengths 3, 2, and 4 yields the expected bookkeeping.
#[test]
fn test_ragged_batch_from_sequences() {
    let documents = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
    let packed = RaggedBatch::from_sequences(&documents);

    // Offsets are the running totals of the sequence lengths, starting at 0.
    assert_eq!(packed.cumulative_offsets, vec![0, 3, 5, 9]);
    assert_eq!(packed.max_seq_len, 4);
    assert_eq!(packed.total_tokens(), 9);
    assert_eq!(packed.batch_size(), 3);
}
3369
/// `doc_range` returns each document's token range and `None` past the last document.
#[test]
fn test_ragged_batch_doc_range() {
    let documents = vec![vec![1, 2, 3], vec![4, 5]];
    let packed = RaggedBatch::from_sequences(&documents);

    let first = packed.doc_range(0);
    let second = packed.doc_range(1);
    let past_end = packed.doc_range(2);

    assert_eq!(first, Some(0..3));
    assert_eq!(second, Some(3..5));
    assert_eq!(past_end, None);
}
3379
/// `doc_tokens` hands back exactly the tokens each document was built from.
#[test]
fn test_ragged_batch_doc_tokens() {
    let documents = vec![vec![1, 2, 3], vec![4, 5]];
    let packed = RaggedBatch::from_sequences(&documents);

    let first_expected: &[_] = &[1, 2, 3];
    let second_expected: &[_] = &[4, 5];

    assert_eq!(packed.doc_tokens(0), Some(first_expected));
    assert_eq!(packed.doc_tokens(1), Some(second_expected));
}
3388
/// Padding savings for lengths [3, 2, 4] is 25%.
#[test]
fn test_ragged_batch_padding_savings() {
    // Padded layout: 3 docs * max length 4 = 12 slots.
    // Ragged layout: 3 + 2 + 4 = 9 slots.
    // Expected savings: 1 - 9/12 = 0.25.
    let documents = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
    let packed = RaggedBatch::from_sequences(&documents);

    let error = (packed.padding_savings() - 0.25).abs();
    assert!(error < 0.001);
}
3399
3400    // ========================================================================
3401    // SpanCandidate Tests
3402    // ========================================================================
3403
/// `SpanCandidate::new` stores its arguments and `width()` is `end - start`.
#[test]
fn test_span_candidate() {
    let candidate = SpanCandidate::new(0, 5, 10);

    assert_eq!(candidate.width(), 5);
    assert_eq!(candidate.end, 10);
    assert_eq!(candidate.start, 5);
    assert_eq!(candidate.doc_idx, 0);
}
3412
/// A 3-token document with `max_width = 2` yields five in-bounds candidates.
#[test]
fn test_generate_span_candidates() {
    let documents = vec![vec![1, 2, 3]]; // a single document with 3 tokens
    let packed = RaggedBatch::from_sequences(&documents);
    let candidates = generate_span_candidates(&packed, 2);

    // Width 1: (0,1), (1,2), (2,3); width 2: (0,2), (1,3) => five spans in total.
    assert_eq!(candidates.len(), 5);

    // Every candidate belongs to document 0, ends inside it, and respects max_width.
    assert!(candidates.iter().all(|c| c.doc_idx == 0));
    assert!(candidates.iter().all(|c| c.end as usize <= 3));
    assert!(candidates.iter().all(|c| c.width() as usize <= 2));
}
3430
/// Only candidates whose mask score clears the threshold survive filtering.
#[test]
fn test_generate_filtered_candidates() {
    let documents = vec![vec![1, 2, 3]];
    let packed = RaggedBatch::from_sequences(&documents);

    // max_width = 2 gives five candidates; only the first two scores exceed 0.5.
    let scores = vec![0.9, 0.9, 0.1, 0.1, 0.1];
    let kept = generate_filtered_candidates(&packed, 2, &scores, 0.5);

    assert_eq!(kept.len(), 2);
}
3443
3444    // ========================================================================
3445    // EntityBuilder Tests
3446    // ========================================================================
3447
/// The builder carries text, type, span, and confidence through to the built entity.
#[test]
fn test_entity_builder_basic() {
    let builder = Entity::builder("John", EntityType::Person);
    let person = builder.span(0, 4).confidence(0.95).build();

    assert_eq!(person.entity_type, EntityType::Person);
    assert_eq!(person.text, "John");
    assert_eq!((person.start, person.end), (0, 4));

    let confidence_err = (person.confidence - 0.95).abs();
    assert!(confidence_err < f64::EPSILON);
}
3461
/// Optional builder fields (KB id, canonical id, normalized form, provenance) are all retained.
#[test]
fn test_entity_builder_full() {
    let source = Provenance::ml("bert", 0.95);
    let curie = Entity::builder("Marie Curie", EntityType::Person)
        .span(0, 11)
        .confidence(0.95)
        .kb_id("Q7186")
        .canonical_id(42)
        .normalized("Marie Salomea Skłodowska Curie")
        .provenance(source)
        .build();

    assert_eq!(curie.text, "Marie Curie");

    // Knowledge-base linkage.
    assert_eq!(curie.kb_id.as_deref(), Some("Q7186"));
    let expected_canonical = Some(crate::core::types::CanonicalId::new(42));
    assert_eq!(curie.canonical_id, expected_canonical);

    // Normalized surface form and provenance.
    let normalized = curie.normalized.as_deref();
    assert_eq!(normalized, Some("Marie Salomea Skłodowska Curie"));
    assert!(curie.provenance.is_some());
}
3485
/// A hierarchical confidence set on the builder drives the per-aspect accessors.
#[test]
fn test_entity_builder_hierarchical() {
    let entity = Entity::builder("test", EntityType::Person)
        .span(0, 4)
        .hierarchical_confidence(HierarchicalConfidence::new(0.9, 0.8, 0.7))
        .build();

    assert!(entity.hierarchical_confidence.is_some());

    let linkage_err = (entity.linkage_confidence() - 0.9).abs();
    let type_err = (entity.type_confidence() - 0.8).abs();
    let boundary_err = (entity.boundary_confidence() - 0.7).abs();

    assert!(linkage_err < 0.001);
    assert!(type_err < 0.001);
    assert!(boundary_err < 0.001);
}
3499
/// Attaching a visual span through the builder makes the entity visual.
#[test]
fn test_entity_builder_visual() {
    let region = Span::bbox(0.1, 0.2, 0.3, 0.4);
    let builder = Entity::builder("receipt item", EntityType::Money);
    let item = builder.visual_span(region).confidence(0.9).build();

    assert!(item.visual_span.is_some());
    assert!(item.is_visual());
}
3511
3512    // ========================================================================
3513    // Entity Helper Method Tests
3514    // ========================================================================
3515
/// The per-aspect confidence accessors fall back to the main confidence until a
/// hierarchical confidence is set, after which they report its components.
#[test]
fn test_entity_hierarchical_confidence_helpers() {
    let mut subject = Entity::new("test", EntityType::Person, 0, 4, 0.8);

    // No hierarchical confidence yet: every accessor mirrors the main score (0.8).
    let linkage_err = (subject.linkage_confidence() - 0.8).abs();
    let type_err = (subject.type_confidence() - 0.8).abs();
    let boundary_err = (subject.boundary_confidence() - 0.8).abs();
    assert!(linkage_err < 0.001);
    assert!(type_err < 0.001);
    assert!(boundary_err < 0.001);

    // Once set, each accessor reports its own component.
    let detailed = HierarchicalConfidence::new(0.95, 0.85, 0.75);
    subject.set_hierarchical_confidence(detailed);

    let linkage_err = (subject.linkage_confidence() - 0.95).abs();
    let type_err = (subject.type_confidence() - 0.85).abs();
    let boundary_err = (subject.boundary_confidence() - 0.75).abs();
    assert!(linkage_err < 0.001);
    assert!(type_err < 0.001);
    assert!(boundary_err < 0.001);
}
3531
/// `Entity::from_visual` builds a visual entity whose text offsets are both zero.
#[test]
fn test_entity_from_visual() {
    let region = Span::bbox(0.5, 0.8, 0.2, 0.05);
    let total = Entity::from_visual("receipt total", EntityType::Money, region, 0.92);

    assert!(total.is_visual());

    // No text position is available, so both offsets are zero.
    assert_eq!((total.start, total.end), (0, 0));

    let confidence_err = (total.confidence - 0.92).abs();
    assert!(confidence_err < f64::EPSILON);
}
3546
/// `text_span` returns `(start, end)` and `span_len` their difference.
#[test]
fn test_entity_span_helpers() {
    let subject = Entity::new("test", EntityType::Person, 10, 20, 0.9);

    assert_eq!(subject.span_len(), 10);
    assert_eq!(subject.text_span(), (10, 20));
}
3553
3554    // ========================================================================
3555    // Provenance Tests
3556    // ========================================================================
3557
/// Pattern provenance records the pattern name and a raw confidence of 1.0.
#[test]
fn test_provenance_pattern() {
    let origin = Provenance::pattern("EMAIL");

    assert_eq!(origin.pattern.as_deref(), Some("EMAIL"));
    assert_eq!(origin.method, ExtractionMethod::Pattern);

    // Pattern matches are deterministic, hence full confidence.
    let full_confidence = Some(Confidence::new(1.0));
    assert_eq!(origin.raw_confidence, full_confidence);
}
3565
/// ML provenance is tagged `Neural` and keeps the model name and raw confidence.
#[test]
fn test_provenance_ml() {
    let origin = Provenance::ml("bert-ner", 0.87);

    let model_name: &str = origin.source.as_ref();
    assert_eq!(model_name, "bert-ner");
    assert_eq!(origin.method, ExtractionMethod::Neural);
    assert_eq!(origin.raw_confidence, Some(Confidence::new(0.87)));
}
3573
/// `with_version` sets the model version without disturbing the source name.
#[test]
fn test_provenance_with_version() {
    let base = Provenance::ml("gliner", 0.92);
    let versioned = base.with_version("v2.1.0");

    let model_name: &str = versioned.source.as_ref();
    assert_eq!(model_name, "gliner");
    assert_eq!(versioned.model_version.as_deref(), Some("v2.1.0"));
}
3581
/// `with_timestamp` stores the supplied timestamp string verbatim.
#[test]
fn test_provenance_with_timestamp() {
    let base = Provenance::pattern("DATE");
    let stamped = base.with_timestamp("2024-01-15T10:30:00Z");

    let recorded = stamped.timestamp.as_deref();
    assert_eq!(recorded, Some("2024-01-15T10:30:00Z"));
}
3588
/// Chaining `with_version` and `with_timestamp` keeps the original ML fields intact.
#[test]
fn test_provenance_builder_chain() {
    let origin = Provenance::ml("modernbert-ner", 0.95)
        .with_version("v1.0.0")
        .with_timestamp("2024-11-27T12:00:00Z");

    // Fields added by the chained calls.
    assert_eq!(origin.timestamp.as_deref(), Some("2024-11-27T12:00:00Z"));
    assert_eq!(origin.model_version.as_deref(), Some("v1.0.0"));

    // Fields set by the `ml` constructor.
    let model_name: &str = origin.source.as_ref();
    assert_eq!(model_name, "modernbert-ner");
    assert_eq!(origin.method, ExtractionMethod::Neural);
    assert_eq!(origin.raw_confidence, Some(Confidence::new(0.95)));
}
3601
/// Provenance survives a JSON round trip, including version and timestamp.
#[test]
fn test_provenance_serialization() {
    let original = Provenance::ml("test", 0.9)
        .with_version("v1.0")
        .with_timestamp("2024-01-01");

    // The serialized form names the version field and carries its value.
    let encoded = serde_json::to_string(&original).unwrap();
    for needle in ["model_version", "v1.0"].iter() {
        assert!(encoded.contains(needle));
    }

    let decoded: Provenance = serde_json::from_str(&encoded).unwrap();
    assert_eq!(decoded.timestamp.as_deref(), Some("2024-01-01"));
    assert_eq!(decoded.model_version.as_deref(), Some("v1.0"));
}
3616}
3617
3618#[cfg(test)]
3619mod proptests {
3620    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in property tests
3621    use super::*;
3622    use proptest::prelude::*;
3623
3624    proptest! {
3625        #[test]
3626        fn confidence_always_clamped(conf in -10.0f64..10.0) {
3627            let e = Entity::new("test", EntityType::Person, 0, 4, conf);
3628            prop_assert!(e.confidence >= 0.0);
3629            prop_assert!(e.confidence <= 1.0);
3630        }
3631
3632        #[test]
3633        fn entity_type_roundtrip(label in "[A-Z]{3,10}") {
3634            let et = EntityType::from_label(&label);
3635            let back = EntityType::from_label(et.as_label());
3636            // Custom types may round-trip to themselves or normalize
3637            let is_custom = matches!(back, EntityType::Custom { .. });
3638            prop_assert!(is_custom || back == et);
3639        }
3640
3641        #[test]
3642        fn overlap_is_symmetric(
3643            s1 in 0usize..100,
3644            len1 in 1usize..50,
3645            s2 in 0usize..100,
3646            len2 in 1usize..50,
3647        ) {
3648            let e1 = Entity::new("a", EntityType::Person, s1, s1 + len1, 1.0);
3649            let e2 = Entity::new("b", EntityType::Person, s2, s2 + len2, 1.0);
3650            prop_assert_eq!(e1.overlaps(&e2), e2.overlaps(&e1));
3651        }
3652
3653        #[test]
3654        fn overlap_ratio_bounded(
3655            s1 in 0usize..100,
3656            len1 in 1usize..50,
3657            s2 in 0usize..100,
3658            len2 in 1usize..50,
3659        ) {
3660            let e1 = Entity::new("a", EntityType::Person, s1, s1 + len1, 1.0);
3661            let e2 = Entity::new("b", EntityType::Person, s2, s2 + len2, 1.0);
3662            let ratio = e1.overlap_ratio(&e2);
3663            prop_assert!(ratio >= 0.0);
3664            prop_assert!(ratio <= 1.0);
3665        }
3666
3667        #[test]
3668        fn self_overlap_ratio_is_one(s in 0usize..100, len in 1usize..50) {
3669            let e = Entity::new("test", EntityType::Person, s, s + len, 1.0);
3670            let ratio = e.overlap_ratio(&e);
3671            prop_assert!((ratio - 1.0).abs() < 1e-10);
3672        }
3673
3674        #[test]
3675        fn hierarchical_confidence_always_clamped(
3676            linkage in -2.0f32..2.0,
3677            type_score in -2.0f32..2.0,
3678            boundary in -2.0f32..2.0,
3679        ) {
3680            let hc = HierarchicalConfidence::new(linkage, type_score, boundary);
3681            prop_assert!(hc.linkage >= 0.0 && hc.linkage <= 1.0);
3682            prop_assert!(hc.type_score >= 0.0 && hc.type_score <= 1.0);
3683            prop_assert!(hc.boundary >= 0.0 && hc.boundary <= 1.0);
3684            prop_assert!(hc.combined() >= 0.0 && hc.combined() <= 1.0);
3685        }
3686
3687        #[test]
3688        fn span_candidate_width_consistent(
3689            doc in 0u32..10,
3690            start in 0u32..100,
3691            end in 1u32..100,
3692        ) {
3693            let actual_end = start.max(end);
3694            let sc = SpanCandidate::new(doc, start, actual_end);
3695            prop_assert_eq!(sc.width(), actual_end.saturating_sub(start));
3696        }
3697
3698        #[test]
3699        fn ragged_batch_preserves_tokens(
3700            seq_lens in proptest::collection::vec(1usize..10, 1..5),
3701        ) {
3702            // Create sequences with sequential token IDs
3703            let mut counter = 0u32;
3704            let seqs: Vec<Vec<u32>> = seq_lens.iter().map(|&len| {
3705                let seq: Vec<u32> = (counter..counter + len as u32).collect();
3706                counter += len as u32;
3707                seq
3708            }).collect();
3709
3710            let batch = RaggedBatch::from_sequences(&seqs);
3711
3712            // Verify batch properties
3713            prop_assert_eq!(batch.batch_size(), seqs.len());
3714            prop_assert_eq!(batch.total_tokens(), seq_lens.iter().sum::<usize>());
3715
3716            // Verify each doc can be retrieved correctly
3717            for (i, seq) in seqs.iter().enumerate() {
3718                let doc_tokens = batch.doc_tokens(i).unwrap();
3719                prop_assert_eq!(doc_tokens, seq.as_slice());
3720            }
3721        }
3722
3723        #[test]
3724        fn span_text_offsets_consistent(start in 0usize..100, len in 0usize..50) {
3725            let end = start + len;
3726            let span = Span::text(start, end);
3727            let (s, e) = span.text_offsets().unwrap();
3728            prop_assert_eq!(s, start);
3729            prop_assert_eq!(e, end);
3730            prop_assert_eq!(span.len(), len);
3731        }
3732
3733        // =================================================================
3734        // Property tests for core type invariants
3735        // =================================================================
3736
3737        /// Entity with start < end always passes the span validity check in validate().
3738        #[test]
3739        fn entity_span_validity(
3740            start in 0usize..10000,
3741            len in 1usize..500,
3742            conf in 0.0f64..=1.0,
3743        ) {
3744            let end = start + len;
3745            // Build a source text long enough to cover the span
3746            let text_content: String = "x".repeat(end);
3747            let entity_text: String = text_content.chars().skip(start).take(len).collect();
3748            let e = Entity::new(&entity_text, EntityType::Person, start, end, conf);
3749            let issues = e.validate(&text_content);
3750            // No InvalidSpan or SpanOutOfBounds issues
3751            for issue in &issues {
3752                match issue {
3753                    ValidationIssue::InvalidSpan { .. } => {
3754                        prop_assert!(false, "start < end should never produce InvalidSpan");
3755                    }
3756                    ValidationIssue::SpanOutOfBounds { .. } => {
3757                        prop_assert!(false, "span within text should never produce SpanOutOfBounds");
3758                    }
3759                    _ => {} // TextMismatch or others are fine to check separately
3760                }
3761            }
3762        }
3763
3764        /// EntityType::from_label(et.as_label()) == et for all standard (non-Custom, non-Other) types.
3765        #[test]
3766        fn entity_type_label_roundtrip_standard(
3767            idx in 0usize..13,
3768        ) {
3769            let standard_types = [
3770                EntityType::Person,
3771                EntityType::Organization,
3772                EntityType::Location,
3773                EntityType::Date,
3774                EntityType::Time,
3775                EntityType::Money,
3776                EntityType::Percent,
3777                EntityType::Quantity,
3778                EntityType::Cardinal,
3779                EntityType::Ordinal,
3780                EntityType::Email,
3781                EntityType::Url,
3782                EntityType::Phone,
3783            ];
3784            let et = &standard_types[idx];
3785            let label = et.as_label();
3786            let roundtripped = EntityType::from_label(label);
3787            prop_assert_eq!(&roundtripped, et,
3788                "from_label(as_label()) must roundtrip for {:?} (label={:?})", et, label);
3789        }
3790
3791        /// Span containment: if span A contains span B, then A.start <= B.start && A.end >= B.end.
3792        #[test]
3793        fn span_containment_property(
3794            a_start in 0usize..5000,
3795            a_len in 1usize..5000,
3796            b_offset in 0usize..5000,
3797            b_len in 1usize..5000,
3798        ) {
3799            let a_end = a_start + a_len;
3800            let b_start = a_start + (b_offset % a_len); // B starts within A
3801            let b_end_candidate = b_start + b_len;
3802
3803            // Only test the containment invariant when B is actually inside A
3804            if b_start >= a_start && b_end_candidate <= a_end {
3805                // B is contained in A
3806                prop_assert!(a_start <= b_start);
3807                prop_assert!(a_end >= b_end_candidate);
3808
3809                // Also verify via Entity overlap: A must overlap B if A contains B
3810                let ea = Entity::new("a", EntityType::Person, a_start, a_end, 1.0);
3811                let eb = Entity::new("b", EntityType::Person, b_start, b_end_candidate, 1.0);
3812                prop_assert!(ea.overlaps(&eb),
3813                    "containing span must overlap contained span");
3814            }
3815        }
3816
3817        /// Serde roundtrip preserves all fields of Entity.
3818        #[test]
3819        fn entity_serde_roundtrip(
3820            start in 0usize..10000,
3821            len in 1usize..500,
3822            conf in 0.0f64..=1.0,
3823            type_idx in 0usize..5,
3824        ) {
3825            let end = start + len;
3826            let types = [
3827                EntityType::Person,
3828                EntityType::Organization,
3829                EntityType::Location,
3830                EntityType::Date,
3831                EntityType::Email,
3832            ];
3833            let et = types[type_idx].clone();
3834            let text = format!("entity_{}", start);
3835            let e = Entity::new(&text, et, start, end, conf);
3836
3837            let json = serde_json::to_string(&e).unwrap();
3838            let e2: Entity = serde_json::from_str(&json).unwrap();
3839
3840            prop_assert_eq!(&e.text, &e2.text);
3841            prop_assert_eq!(&e.entity_type, &e2.entity_type);
3842            prop_assert_eq!(e.start, e2.start);
3843            prop_assert_eq!(e.end, e2.end);
3844            // f64 roundtrip through JSON: compare with tolerance
3845            prop_assert!((e.confidence - e2.confidence).abs() < 1e-10,
3846                "confidence roundtrip: {} vs {}", e.confidence, e2.confidence);
3847            prop_assert_eq!(&e.normalized, &e2.normalized);
3848            prop_assert_eq!(&e.kb_id, &e2.kb_id);
3849        }
3850
3851        /// DiscontinuousSpan: total_len() == sum of individual segment lengths.
3852        #[test]
3853        fn discontinuous_span_total_length(
3854            segments in proptest::collection::vec(
3855                (0usize..5000, 1usize..500),
3856                1..6
3857            ),
3858        ) {
3859            let ranges: Vec<std::ops::Range<usize>> = segments.iter()
3860                .map(|&(start, len)| start..start + len)
3861                .collect();
3862            let expected_sum: usize = ranges.iter().map(|r| r.end - r.start).sum();
3863            let span = DiscontinuousSpan::new(ranges);
3864            prop_assert_eq!(span.total_len(), expected_sum,
3865                "total_len must equal sum of segment lengths");
3866        }
3867    }
3868
3869    // ========================================================================
3870    // EntityViewport Tests
3871    // ========================================================================
3872
/// `as_str` yields the lowercase name of each built-in viewport and the inner string of `Custom`.
#[test]
fn test_entity_viewport_as_str() {
    let cases = vec![
        (EntityViewport::Business, "business"),
        (EntityViewport::Legal, "legal"),
        (EntityViewport::Technical, "technical"),
        (EntityViewport::Academic, "academic"),
        (EntityViewport::Personal, "personal"),
        (EntityViewport::Political, "political"),
        (EntityViewport::Media, "media"),
        (EntityViewport::Historical, "historical"),
        (EntityViewport::General, "general"),
        (EntityViewport::Custom("custom".to_string()), "custom"),
    ];

    for (viewport, expected) in cases {
        assert_eq!(viewport.as_str(), expected);
    }
}
3889
/// Business, Legal, Technical, Academic, and Political are professional viewports;
/// every other viewport, including `Custom`, is not.
#[test]
fn test_entity_viewport_is_professional() {
    let professional = vec![
        EntityViewport::Business,
        EntityViewport::Legal,
        EntityViewport::Technical,
        EntityViewport::Academic,
        EntityViewport::Political,
    ];
    for viewport in professional {
        let shown = format!("{:?}", viewport);
        assert!(viewport.is_professional(), "{} should be professional", shown);
    }

    let non_professional = vec![
        EntityViewport::Personal,
        EntityViewport::Media,
        EntityViewport::Historical,
        EntityViewport::General,
        EntityViewport::Custom("test".to_string()),
    ];
    for viewport in non_professional {
        let shown = format!("{:?}", viewport);
        assert!(!viewport.is_professional(), "{} should not be professional", shown);
    }
}
3904
/// Parsing recognises each viewport's canonical name and its aliases, maps the empty
/// string to `General`, and wraps anything else in `Custom`.
#[test]
fn test_entity_viewport_from_str() {
    let cases = vec![
        // Business and its aliases
        ("business", EntityViewport::Business),
        ("financial", EntityViewport::Business),
        ("corporate", EntityViewport::Business),
        // Legal
        ("legal", EntityViewport::Legal),
        ("law", EntityViewport::Legal),
        // Technical
        ("technical", EntityViewport::Technical),
        ("engineering", EntityViewport::Technical),
        // Academic
        ("academic", EntityViewport::Academic),
        ("research", EntityViewport::Academic),
        // Personal
        ("personal", EntityViewport::Personal),
        ("biographical", EntityViewport::Personal),
        // Political
        ("political", EntityViewport::Political),
        ("policy", EntityViewport::Political),
        // Media
        ("media", EntityViewport::Media),
        ("press", EntityViewport::Media),
        // Historical
        ("historical", EntityViewport::Historical),
        ("history", EntityViewport::Historical),
        // General, including the empty string
        ("general", EntityViewport::General),
        ("generic", EntityViewport::General),
        ("", EntityViewport::General),
        // Custom viewport
        (
            "custom_viewport",
            EntityViewport::Custom("custom_viewport".to_string()),
        ),
    ];

    for (raw, expected) in cases {
        let parsed = raw.parse::<EntityViewport>().unwrap();
        assert_eq!(parsed, expected, "input: {:?}", raw);
    }
}
4002
/// Viewport parsing ignores letter case.
#[test]
fn test_entity_viewport_from_str_case_insensitive() {
    let spellings = ["BUSINESS", "Business", "BuSiNeSs"];

    for raw in spellings.iter() {
        let parsed = raw.parse::<EntityViewport>().unwrap();
        assert_eq!(parsed, EntityViewport::Business, "input: {:?}", raw);
    }
}
4018
/// `Display` renders built-in viewports by lowercase name and `Custom` by its inner string.
#[test]
fn test_entity_viewport_display() {
    let business = EntityViewport::Business.to_string();
    let academic = EntityViewport::Academic.to_string();
    let custom = EntityViewport::Custom("test".to_string()).to_string();

    assert_eq!(business, "business");
    assert_eq!(academic, "academic");
    assert_eq!(custom, "test");
}
4028
/// Viewport accessors on `Entity`: an entity without a viewport defaults to `General`
/// and matches a queried viewport; after one is set, only that viewport matches.
#[test]
fn test_entity_viewport_methods() {
    let mut curie = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
    let academic = EntityViewport::Academic;
    let business = EntityViewport::Business;

    // Before a viewport is assigned.
    assert!(!curie.has_viewport());
    assert_eq!(curie.viewport_or_default(), EntityViewport::General);
    // An entity with no viewport still matches the queried one.
    assert!(curie.matches_viewport(&academic));

    // After assigning the Academic viewport.
    curie.set_viewport(EntityViewport::Academic);
    assert!(curie.has_viewport());
    assert_eq!(curie.viewport_or_default(), EntityViewport::Academic);
    assert!(curie.matches_viewport(&academic));
    assert!(!curie.matches_viewport(&business));
}
4045
/// A viewport set on the builder ends up on the built entity.
#[test]
fn test_entity_builder_with_viewport() {
    let builder = Entity::builder("Marie Curie", EntityType::Person);
    let curie = builder
        .span(0, 11)
        .viewport(EntityViewport::Academic)
        .build();

    assert!(curie.has_viewport());
    assert_eq!(curie.viewport, Some(EntityViewport::Academic));
}
4056
4057    // ========================================================================
4058    // EntityCategory Tests
4059    // ========================================================================
4060
/// Agent, Organization, Place, Creative, and Relation require ML; the remaining categories do not.
#[test]
fn test_entity_category_requires_ml() {
    let ml_categories = vec![
        EntityCategory::Agent,
        EntityCategory::Organization,
        EntityCategory::Place,
        EntityCategory::Creative,
        EntityCategory::Relation,
    ];
    for category in ml_categories {
        let label = category.to_string();
        assert!(category.requires_ml(), "{} should require ML", label);
    }

    let non_ml_categories = vec![
        EntityCategory::Temporal,
        EntityCategory::Numeric,
        EntityCategory::Contact,
        EntityCategory::Misc,
    ];
    for category in non_ml_categories {
        let label = category.to_string();
        assert!(!category.requires_ml(), "{} should not require ML", label);
    }
}
4074
/// Only Temporal, Numeric, and Contact are pattern-detectable.
#[test]
fn test_entity_category_pattern_detectable() {
    let detectable = vec![
        EntityCategory::Temporal,
        EntityCategory::Numeric,
        EntityCategory::Contact,
    ];
    for category in detectable {
        let label = category.to_string();
        assert!(category.pattern_detectable(), "{} should be pattern-detectable", label);
    }

    let not_detectable = vec![
        EntityCategory::Agent,
        EntityCategory::Organization,
        EntityCategory::Place,
        EntityCategory::Creative,
        EntityCategory::Relation,
        EntityCategory::Misc,
    ];
    for category in not_detectable {
        let label = category.to_string();
        assert!(!category.pattern_detectable(), "{} should not be pattern-detectable", label);
    }
}
4088
/// `is_relation` is true for `Relation` and false for each of the other categories checked.
#[test]
fn test_entity_category_is_relation() {
    assert!(EntityCategory::Relation.is_relation());

    let others = vec![
        EntityCategory::Agent,
        EntityCategory::Organization,
        EntityCategory::Place,
        EntityCategory::Temporal,
        EntityCategory::Numeric,
        EntityCategory::Contact,
        EntityCategory::Creative,
        EntityCategory::Misc,
    ];
    for category in others {
        let label = category.to_string();
        assert!(!category.is_relation(), "{} should not be a relation", label);
    }
}
4102
4103    #[test]
4104    fn test_entity_category_as_str() {
4105        assert_eq!(EntityCategory::Agent.as_str(), "agent");
4106        assert_eq!(EntityCategory::Organization.as_str(), "organization");
4107        assert_eq!(EntityCategory::Place.as_str(), "place");
4108        assert_eq!(EntityCategory::Creative.as_str(), "creative");
4109        assert_eq!(EntityCategory::Temporal.as_str(), "temporal");
4110        assert_eq!(EntityCategory::Numeric.as_str(), "numeric");
4111        assert_eq!(EntityCategory::Contact.as_str(), "contact");
4112        assert_eq!(EntityCategory::Relation.as_str(), "relation");
4113        assert_eq!(EntityCategory::Misc.as_str(), "misc");
4114    }
4115
4116    #[test]
4117    fn test_entity_category_display() {
4118        assert_eq!(format!("{}", EntityCategory::Agent), "agent");
4119        assert_eq!(format!("{}", EntityCategory::Temporal), "temporal");
4120        assert_eq!(format!("{}", EntityCategory::Relation), "relation");
4121    }
4122
4123    // ========================================================================
4124    // EntityType serde tests (N20: flat string serialization)
4125    // ========================================================================
4126
4127    #[test]
4128    fn test_entity_type_serializes_to_flat_string() {
4129        assert_eq!(
4130            serde_json::to_string(&EntityType::Person).unwrap(),
4131            r#""PER""#
4132        );
4133        assert_eq!(
4134            serde_json::to_string(&EntityType::Organization).unwrap(),
4135            r#""ORG""#
4136        );
4137        assert_eq!(
4138            serde_json::to_string(&EntityType::Location).unwrap(),
4139            r#""LOC""#
4140        );
4141        assert_eq!(
4142            serde_json::to_string(&EntityType::Date).unwrap(),
4143            r#""DATE""#
4144        );
4145        assert_eq!(
4146            serde_json::to_string(&EntityType::Money).unwrap(),
4147            r#""MONEY""#
4148        );
4149    }
4150
4151    #[test]
4152    fn test_custom_entity_type_serializes_flat() {
4153        let misc = EntityType::custom("MISC", EntityCategory::Misc);
4154        assert_eq!(serde_json::to_string(&misc).unwrap(), r#""MISC""#);
4155
4156        let disease = EntityType::custom("DISEASE", EntityCategory::Agent);
4157        assert_eq!(serde_json::to_string(&disease).unwrap(), r#""DISEASE""#);
4158    }
4159
4160    #[test]
4161    fn test_entity_type_deserializes_from_flat_string() {
4162        let per: EntityType = serde_json::from_str(r#""PER""#).unwrap();
4163        assert_eq!(per, EntityType::Person);
4164
4165        let org: EntityType = serde_json::from_str(r#""ORG""#).unwrap();
4166        assert_eq!(org, EntityType::Organization);
4167
4168        let misc: EntityType = serde_json::from_str(r#""MISC""#).unwrap();
4169        assert_eq!(misc, EntityType::custom("MISC", EntityCategory::Misc));
4170    }
4171
4172    #[test]
4173    fn test_entity_type_deserializes_backward_compat_custom() {
4174        // Old format: {"Custom":{"name":"MISC","category":"Misc"}}
4175        let json = r#"{"Custom":{"name":"MISC","category":"Misc"}}"#;
4176        let et: EntityType = serde_json::from_str(json).unwrap();
4177        assert_eq!(et, EntityType::custom("MISC", EntityCategory::Misc));
4178    }
4179
4180    #[test]
4181    fn test_entity_type_deserializes_backward_compat_other() {
4182        // Old format: {"Other":"foo"} -- now routes to Custom with Misc category
4183        let json = r#"{"Other":"foo"}"#;
4184        let et: EntityType = serde_json::from_str(json).unwrap();
4185        assert_eq!(et, EntityType::custom("foo", EntityCategory::Misc));
4186    }
4187
4188    #[test]
4189    fn test_entity_type_serde_roundtrip() {
4190        let types = vec![
4191            EntityType::Person,
4192            EntityType::Organization,
4193            EntityType::Location,
4194            EntityType::Date,
4195            EntityType::Time,
4196            EntityType::Money,
4197            EntityType::Percent,
4198            EntityType::Quantity,
4199            EntityType::Cardinal,
4200            EntityType::Ordinal,
4201            EntityType::Email,
4202            EntityType::Url,
4203            EntityType::Phone,
4204            EntityType::custom("MISC", EntityCategory::Misc),
4205            EntityType::custom("DISEASE", EntityCategory::Agent),
4206        ];
4207
4208        for t in &types {
4209            let json = serde_json::to_string(t).unwrap();
4210            let back: EntityType = serde_json::from_str(&json).unwrap();
4211            // All variants roundtrip through from_label, so Custom types
4212            // survive as Custom (not as a built-in variant).
4213            assert_eq!(
4214                t.as_label(),
4215                back.as_label(),
4216                "roundtrip failed for {:?}",
4217                t
4218            );
4219        }
4220    }
4221}