Skip to main content

anno_core/core/
entity.rs

1//! Entity types and structures for NER.
2//!
3//! # Design Philosophy (Research-Aligned)
4//!
5//! This module implements entity types informed by modern NER research:
6//!
7//! - **GLiNER/Bi-Encoder**: Entity types are *labels to match against*, not fixed classes.
8//!   Relations ("CEO of") are entities too - they're just labels in the same latent space.
9//!
10//! - **TPLinker/Joint Extraction**: Entities and relations can be extracted in a single pass.
11//!   The type system supports relation triggers as first-class mentions.
12//!
13//! - **Knowledge Graphs**: Entities can link to external knowledge bases (`kb_id`) for
14//!   coreference resolution and GraphRAG applications.
15//!
16//! # Type Hierarchy
17//!
18//! ```text
19//! Mention
20//! ├── Entity (single span)
21//! │   ├── Named (ML): Person, Organization, Location
22//! │   ├── Temporal (Pattern): Date, Time
23//! │   ├── Numeric (Pattern): Money, Percent, Quantity, Cardinal, Ordinal
24//! │   └── Contact (Pattern): Email, Url, Phone
25//! │
26//! └── Relation (connects entities)
27//!     └── Trigger text: "CEO of", "located in", "born on"
28//! ```
29//!
30//! # Design Principles
31//!
32//! 1. **Bi-encoder compatible**: Types are semantic labels, not fixed enums
33//! 2. **Joint extraction**: Relations are mentions with trigger spans
34//! 3. **Knowledge linking**: `kb_id` for connecting to external KBs
35//! 4. **Hierarchical confidence**: Coarse (linkage) + fine (type) scores
36//! 5. **Multi-modal ready**: Spans can be text offsets or visual bboxes
37
38use super::confidence::Confidence;
39use super::types::{MentionType, PhiFeatures};
40use serde::{Deserialize, Serialize};
41use std::borrow::Cow;
42
43// ============================================================================
44// Entity Category (OntoNotes-inspired)
45// ============================================================================
46
47/// Category of entity based on detection characteristics and semantics.
48///
49/// Based on OntoNotes 5.0 categories with extensions for:
50/// - Structured data (Contact, patterns)
51/// - Knowledge graphs (Relation, for TPLinker/GLiNER joint extraction)
52#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
53#[non_exhaustive]
54pub enum EntityCategory {
55    /// Named entities for people/groups (ML-required).
56    /// Types: Person, NORP (nationalities/religious/political groups)
57    Agent,
58    /// Named entities for organizations/facilities (ML-required).
59    /// Types: Organization, Facility
60    Organization,
61    /// Named entities for places (ML-required).
62    /// Types: GPE (geo-political), Location (geographic)
63    Place,
64    /// Named entities for creative/conceptual (ML-required).
65    /// Types: Event, Product, WorkOfArt, Law, Language
66    Creative,
67    /// Temporal entities (pattern-detectable).
68    /// Types: Date, Time
69    Temporal,
70    /// Numeric entities (pattern-detectable).
71    /// Types: Money, Percent, Quantity, Cardinal, Ordinal
72    Numeric,
73    /// Contact/identifier entities (pattern-detectable).
74    /// Types: Email, Url, Phone
75    Contact,
76    /// Relation triggers for knowledge graph construction (ML-required).
77    /// Examples: "CEO of", "located in", "founded by"
78    /// In GLiNER bi-encoder, relations are just another label to match.
79    Relation,
80    /// Miscellaneous/unknown category
81    Misc,
82}
83
84impl EntityCategory {
85    /// Returns true if this category requires ML for detection.
86    #[must_use]
87    pub const fn requires_ml(&self) -> bool {
88        matches!(
89            self,
90            EntityCategory::Agent
91                | EntityCategory::Organization
92                | EntityCategory::Place
93                | EntityCategory::Creative
94                | EntityCategory::Relation
95        )
96    }
97
98    /// Returns true if this category can be detected via patterns.
99    #[must_use]
100    pub const fn pattern_detectable(&self) -> bool {
101        matches!(
102            self,
103            EntityCategory::Temporal | EntityCategory::Numeric | EntityCategory::Contact
104        )
105    }
106
107    /// Returns true if this is a relation (for knowledge graph construction).
108    #[must_use]
109    pub const fn is_relation(&self) -> bool {
110        matches!(self, EntityCategory::Relation)
111    }
112
113    /// Returns OntoNotes-compatible category name.
114    #[must_use]
115    pub const fn as_str(&self) -> &'static str {
116        match self {
117            EntityCategory::Agent => "agent",
118            EntityCategory::Organization => "organization",
119            EntityCategory::Place => "place",
120            EntityCategory::Creative => "creative",
121            EntityCategory::Temporal => "temporal",
122            EntityCategory::Numeric => "numeric",
123            EntityCategory::Contact => "contact",
124            EntityCategory::Relation => "relation",
125            EntityCategory::Misc => "misc",
126        }
127    }
128}
129
130impl std::fmt::Display for EntityCategory {
131    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
132        write!(f, "{}", self.as_str())
133    }
134}
135
136// ============================================================================
137// Entity Viewport (Research: Entity Manifolds)
138// ============================================================================
139
140/// Viewport context for multi-faceted entity representation.
141///
142/// # Research Background
143///
144/// The concept of "Entity Viewports" comes from the observation that
145/// real-world entities are not monolithic - they present different
146/// facets depending on context:
147///
148/// - "Marie Curie" in an **Academic** context: physicist, Nobel laureate
149/// - "Marie Curie" in a **Technical** context: radioactivity researcher, X-ray pioneer
150/// - "Marie Curie" in a **Personal** context: mother, immigrant, educator
151/// - "Marie Curie" in a **Medical** context: founder of mobile X-ray units
152///
153/// Rather than collapsing all information into a single vector,
154/// the viewport model preserves these distinctions and enables
155/// "projection" at query time.
156///
157/// # Usage in RAG Systems
158///
159/// When answering "What were Curie's scientific contributions?", retrieve
160/// facts from the `Academic` viewport. When answering "What was Curie's
161/// personal life like?", retrieve from `Personal`.
162///
163/// # Example
164///
165/// ```rust
166/// use anno_core::{Entity, EntityType, EntityViewport};
167///
168/// let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
169/// entity.viewport = Some(EntityViewport::Academic);
170/// assert!(entity.viewport.as_ref().unwrap().is_professional());
171/// ```
172#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
173#[non_exhaustive]
174pub enum EntityViewport {
175    /// Business/financial context (CEO, revenue, market cap)
176    Business,
177    /// Legal context (lawsuits, settlements, compliance)
178    Legal,
179    /// Technical/engineering context (patents, inventions, code)
180    Technical,
181    /// Academic/research context (publications, citations, grants)
182    Academic,
183    /// Personal/biographical context (family, hobbies, background)
184    Personal,
185    /// Political context (lobbying, donations, policy positions)
186    Political,
187    /// Media/public relations context (interviews, statements, PR)
188    Media,
189    /// Historical context (past roles, timeline events)
190    Historical,
191    /// Generic/unspecified context
192    #[default]
193    General,
194    /// Custom viewport with a descriptive label
195    Custom(String),
196}
197
198impl EntityViewport {
199    /// Human-readable label for the viewport.
200    #[must_use]
201    pub fn as_str(&self) -> &str {
202        match self {
203            EntityViewport::Business => "business",
204            EntityViewport::Legal => "legal",
205            EntityViewport::Technical => "technical",
206            EntityViewport::Academic => "academic",
207            EntityViewport::Personal => "personal",
208            EntityViewport::Political => "political",
209            EntityViewport::Media => "media",
210            EntityViewport::Historical => "historical",
211            EntityViewport::General => "general",
212            EntityViewport::Custom(s) => s,
213        }
214    }
215
216    /// Is this a professional/work-related viewport?
217    #[must_use]
218    pub const fn is_professional(&self) -> bool {
219        matches!(
220            self,
221            EntityViewport::Business
222                | EntityViewport::Legal
223                | EntityViewport::Technical
224                | EntityViewport::Academic
225                | EntityViewport::Political
226        )
227    }
228}
229
230impl std::str::FromStr for EntityViewport {
231    type Err = std::convert::Infallible;
232
233    fn from_str(s: &str) -> Result<Self, Self::Err> {
234        Ok(match s.to_lowercase().as_str() {
235            "business" | "financial" | "corporate" => EntityViewport::Business,
236            "legal" | "law" | "compliance" => EntityViewport::Legal,
237            "technical" | "engineering" | "tech" => EntityViewport::Technical,
238            "academic" | "research" | "scholarly" => EntityViewport::Academic,
239            "personal" | "biographical" | "private" => EntityViewport::Personal,
240            "political" | "policy" | "government" => EntityViewport::Political,
241            "media" | "press" | "pr" | "public_relations" => EntityViewport::Media,
242            "historical" | "history" | "past" => EntityViewport::Historical,
243            "general" | "generic" | "" => EntityViewport::General,
244            other => EntityViewport::Custom(other.to_string()),
245        })
246    }
247}
248
249impl std::fmt::Display for EntityViewport {
250    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
251        write!(f, "{}", self.as_str())
252    }
253}
254
255// ============================================================================
256// Entity Type
257// ============================================================================
258
259/// Entity type classification.
260///
261/// Organized into categories:
262/// - **Named** (ML-required): Person, Organization, Location
263/// - **Temporal** (pattern): Date, Time
264/// - **Numeric** (pattern): Money, Percent, Quantity, Cardinal, Ordinal
265/// - **Contact** (pattern): Email, Url, Phone
266///
267/// # Examples
268///
269/// ```
270/// use anno_core::EntityType;
271///
272/// let ty = EntityType::Email;
273/// assert!(ty.category().pattern_detectable());
274/// assert!(!ty.category().requires_ml());
275///
276/// let ty = EntityType::Person;
277/// assert!(ty.category().requires_ml());
278/// ```
279#[derive(Debug, Clone, PartialEq, Eq, Hash)]
280#[non_exhaustive]
281pub enum EntityType {
282    // === Named Entities (ML-required) ===
283    /// Person name (PER) - requires ML/context
284    Person,
285    /// Organization name (ORG) - requires ML/context
286    Organization,
287    /// Location/Place (LOC/GPE) - requires ML/context
288    Location,
289
290    // === Temporal Entities (Pattern-detectable) ===
291    /// Date expression (DATE) - pattern-detectable
292    Date,
293    /// Time expression (TIME) - pattern-detectable
294    Time,
295
296    // === Numeric Entities (Pattern-detectable) ===
297    /// Monetary value (MONEY) - pattern-detectable
298    Money,
299    /// Percentage (PERCENT) - pattern-detectable
300    Percent,
301    /// Quantity with unit (QUANTITY) - pattern-detectable
302    Quantity,
303    /// Cardinal number (CARDINAL) - pattern-detectable
304    Cardinal,
305    /// Ordinal number (ORDINAL) - pattern-detectable
306    Ordinal,
307
308    // === Contact Entities (Pattern-detectable) ===
309    /// Email address - pattern-detectable
310    Email,
311    /// URL/URI - pattern-detectable
312    Url,
313    /// Phone number - pattern-detectable
314    Phone,
315
316    // === Extensibility ===
317    /// Domain-specific custom type with explicit category
318    Custom {
319        /// Type name (e.g., "DISEASE", "PRODUCT", "EVENT")
320        name: String,
321        /// Category for this custom type
322        category: EntityCategory,
323    },
324
325    /// Legacy catch-all for unknown types.
326    ///
327    /// **Deprecated**: use `EntityType::custom(name, category)` instead.
328    /// Retained only for serde backward compatibility with existing data.
329    /// Deserialization of `{"Other":"X"}` now routes to `Custom { name: "X", category: Misc }`.
330    #[deprecated(note = "use EntityType::custom(name, EntityCategory::Misc) instead")]
331    Other(String),
332}
333
334impl EntityType {
335    /// Get the category of this entity type.
336    #[must_use]
337    pub fn category(&self) -> EntityCategory {
338        match self {
339            // Agent entities (people/groups)
340            EntityType::Person => EntityCategory::Agent,
341            // Organization entities
342            EntityType::Organization => EntityCategory::Organization,
343            // Place entities (locations)
344            EntityType::Location => EntityCategory::Place,
345            // Temporal entities
346            EntityType::Date | EntityType::Time => EntityCategory::Temporal,
347            // Numeric entities
348            EntityType::Money
349            | EntityType::Percent
350            | EntityType::Quantity
351            | EntityType::Cardinal
352            | EntityType::Ordinal => EntityCategory::Numeric,
353            // Contact entities
354            EntityType::Email | EntityType::Url | EntityType::Phone => EntityCategory::Contact,
355            // Custom with explicit category
356            EntityType::Custom { category, .. } => *category,
357            // Legacy Other -- kept for exhaustiveness (variant is #[deprecated])
358            #[allow(deprecated)]
359            EntityType::Other(_) => EntityCategory::Misc,
360        }
361    }
362
363    /// Returns true if this entity type requires ML for detection.
364    #[must_use]
365    pub fn requires_ml(&self) -> bool {
366        self.category().requires_ml()
367    }
368
369    /// Returns true if this entity type can be detected via patterns.
370    #[must_use]
371    pub fn pattern_detectable(&self) -> bool {
372        self.category().pattern_detectable()
373    }
374
375    /// Convert to standard label string (CoNLL/OntoNotes format).
376    ///
377    /// ```
378    /// use anno_core::EntityType;
379    ///
380    /// assert_eq!(EntityType::Person.as_label(), "PER");
381    /// assert_eq!(EntityType::Location.as_label(), "LOC");
382    /// ```
383    #[must_use]
384    pub fn as_label(&self) -> &str {
385        match self {
386            EntityType::Person => "PER",
387            EntityType::Organization => "ORG",
388            EntityType::Location => "LOC",
389            EntityType::Date => "DATE",
390            EntityType::Time => "TIME",
391            EntityType::Money => "MONEY",
392            EntityType::Percent => "PERCENT",
393            EntityType::Quantity => "QUANTITY",
394            EntityType::Cardinal => "CARDINAL",
395            EntityType::Ordinal => "ORDINAL",
396            EntityType::Email => "EMAIL",
397            EntityType::Url => "URL",
398            EntityType::Phone => "PHONE",
399            EntityType::Custom { name, .. } => name.as_str(),
400            #[allow(deprecated)]
401            EntityType::Other(s) => s.as_str(),
402        }
403    }
404
405    /// Parse from standard label string.
406    ///
407    /// Handles various formats: CoNLL (PER), OntoNotes (PERSON), BIO (B-PER).
408    ///
409    /// ```
410    /// use anno_core::EntityType;
411    ///
412    /// assert_eq!(EntityType::from_label("PER"), EntityType::Person);
413    /// assert_eq!(EntityType::from_label("B-ORG"), EntityType::Organization);
414    /// assert_eq!(EntityType::from_label("PERSON"), EntityType::Person);
415    /// ```
416    #[must_use]
417    pub fn from_label(label: &str) -> Self {
418        // Strip BIO prefix if present
419        let label = label
420            .strip_prefix("B-")
421            .or_else(|| label.strip_prefix("I-"))
422            .or_else(|| label.strip_prefix("E-"))
423            .or_else(|| label.strip_prefix("S-"))
424            .unwrap_or(label);
425
426        match label.to_uppercase().as_str() {
427            // Named entities (multiple variations)
428            "PER" | "PERSON" => EntityType::Person,
429            "ORG" | "ORGANIZATION" | "COMPANY" | "CORPORATION" => EntityType::Organization,
430            "LOC" | "LOCATION" | "GPE" | "GEO-LOC" => EntityType::Location,
431            // WNUT / FewNERD specific types (common in social media / Wikipedia)
432            "FACILITY" | "FAC" | "BUILDING" => {
433                EntityType::custom("BUILDING", EntityCategory::Place)
434            }
435            "PRODUCT" | "PROD" => EntityType::custom("PRODUCT", EntityCategory::Misc),
436            "EVENT" => EntityType::custom("EVENT", EntityCategory::Creative),
437            "CREATIVE-WORK" | "WORK_OF_ART" | "ART" => {
438                EntityType::custom("CREATIVE_WORK", EntityCategory::Creative)
439            }
440            "GROUP" | "NORP" => EntityType::custom("GROUP", EntityCategory::Agent),
441            // Temporal
442            "DATE" => EntityType::Date,
443            "TIME" => EntityType::Time,
444            // Numeric
445            "MONEY" | "CURRENCY" => EntityType::Money,
446            "PERCENT" | "PERCENTAGE" => EntityType::Percent,
447            "QUANTITY" => EntityType::Quantity,
448            "CARDINAL" => EntityType::Cardinal,
449            "ORDINAL" => EntityType::Ordinal,
450            // Contact
451            "EMAIL" => EntityType::Email,
452            "URL" | "URI" => EntityType::Url,
453            "PHONE" | "TELEPHONE" => EntityType::Phone,
454            // MISC variations
455            "MISC" | "MISCELLANEOUS" | "OTHER" => EntityType::custom("MISC", EntityCategory::Misc),
456            // Biomedical types
457            "DISEASE" | "DISORDER" => EntityType::custom("DISEASE", EntityCategory::Misc),
458            "CHEMICAL" | "DRUG" => EntityType::custom("CHEMICAL", EntityCategory::Misc),
459            "GENE" => EntityType::custom("GENE", EntityCategory::Misc),
460            "PROTEIN" => EntityType::custom("PROTEIN", EntityCategory::Misc),
461            // Unknown -> Custom with Misc category
462            other => EntityType::custom(other, EntityCategory::Misc),
463        }
464    }
465
466    /// Create a custom domain-specific entity type.
467    ///
468    /// # Examples
469    ///
470    /// ```
471    /// use anno_core::{EntityType, EntityCategory};
472    ///
473    /// let disease = EntityType::custom("DISEASE", EntityCategory::Agent);
474    /// assert!(disease.requires_ml());
475    ///
476    /// let product_id = EntityType::custom("PRODUCT_ID", EntityCategory::Misc);
477    /// assert!(!product_id.requires_ml());
478    /// ```
479    #[must_use]
480    pub fn custom(name: impl Into<String>, category: EntityCategory) -> Self {
481        EntityType::Custom {
482            name: name.into(),
483            category,
484        }
485    }
486}
487
488impl std::fmt::Display for EntityType {
489    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
490        write!(f, "{}", self.as_label())
491    }
492}
493
494impl std::str::FromStr for EntityType {
495    type Err = std::convert::Infallible;
496
497    /// Parse from standard label string. Never fails -- unknown labels become `Custom`.
498    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
499        Ok(Self::from_label(s))
500    }
501}
502
503// Flatten EntityType to its label string for JSON serialization.
504// `Custom { name: "MISC", .. }` -> `"MISC"`, `Person` -> `"PER"`, etc.
505// Deserialization accepts both the flat string (new format) and the
506// tagged-enum object (backward compat with existing serialized data).
507impl Serialize for EntityType {
508    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
509        serializer.serialize_str(self.as_label())
510    }
511}
512
513impl<'de> Deserialize<'de> for EntityType {
514    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
515        struct EntityTypeVisitor;
516
517        impl<'de> serde::de::Visitor<'de> for EntityTypeVisitor {
518            type Value = EntityType;
519
520            fn expecting(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
521                f.write_str("a string label or a tagged enum object")
522            }
523
524            // New flat format: "PER", "ORG", "MISC", etc.
525            fn visit_str<E: serde::de::Error>(self, v: &str) -> Result<EntityType, E> {
526                Ok(EntityType::from_label(v))
527            }
528
529            // Backward-compat: {"Custom":{"name":"MISC","category":"Misc"}}
530            // or {"Other":"foo"} or "Person" (unit variant as map key)
531            fn visit_map<A: serde::de::MapAccess<'de>>(
532                self,
533                mut map: A,
534            ) -> Result<EntityType, A::Error> {
535                let key: String = map
536                    .next_key()?
537                    .ok_or_else(|| serde::de::Error::custom("empty object"))?;
538                match key.as_str() {
539                    "Custom" => {
540                        #[derive(Deserialize)]
541                        struct CustomFields {
542                            name: String,
543                            category: EntityCategory,
544                        }
545                        let fields: CustomFields = map.next_value()?;
546                        Ok(EntityType::Custom {
547                            name: fields.name,
548                            category: fields.category,
549                        })
550                    }
551                    "Other" => {
552                        // Route legacy Other to Custom with Misc category
553                        let val: String = map.next_value()?;
554                        Ok(EntityType::custom(val, EntityCategory::Misc))
555                    }
556                    // Unit variants serialized as {"Person":null} etc.
557                    variant => {
558                        // Consume the value (null or unit)
559                        let _: serde::de::IgnoredAny = map.next_value()?;
560                        Ok(EntityType::from_label(variant))
561                    }
562                }
563            }
564        }
565
566        deserializer.deserialize_any(EntityTypeVisitor)
567    }
568}
569
570// =============================================================================
571// Type Mapping for Domain-Specific Datasets
572// =============================================================================
573
574/// Maps domain-specific entity types to standard NER types.
575///
576/// # Research Context (Familiarity paper, arXiv:2412.10121)
577///
578/// Type mapping creates "label overlap" between training and evaluation:
579/// - Mapping ACTOR → Person increases overlap
580/// - This can inflate zero-shot F1 scores
581///
582/// Use `LabelShift::from_type_sets()` to quantify how much overlap exists.
583/// High overlap (>80%) means the evaluation is NOT truly zero-shot.
584///
585/// # When to Use TypeMapper
586///
587/// - Cross-dataset comparison (normalize schemas for fair eval)
588/// - Domain adaptation (map new labels to known types)
589///
590/// # When NOT to Use TypeMapper
591///
592/// - True zero-shot evaluation (keep labels distinct)
593/// - Measuring generalization (overlap hides generalization failures)
594///
595/// # Example
596///
597/// ```rust
598/// use anno_core::{TypeMapper, EntityType, EntityCategory};
599///
600/// // MIT Movie dataset mapping
601/// let mut mapper = TypeMapper::new();
602/// mapper.add("ACTOR", EntityType::Person);
603/// mapper.add("DIRECTOR", EntityType::Person);
604/// mapper.add("TITLE", EntityType::custom("WORK_OF_ART", EntityCategory::Creative));
605///
606/// assert_eq!(mapper.map("ACTOR"), Some(&EntityType::Person));
607/// assert_eq!(mapper.normalize("DIRECTOR"), EntityType::Person);
608/// ```
609#[derive(Debug, Clone, Default)]
610pub struct TypeMapper {
611    mappings: std::collections::HashMap<String, EntityType>,
612}
613
614impl TypeMapper {
615    /// Create empty mapper.
616    #[must_use]
617    pub fn new() -> Self {
618        Self::default()
619    }
620
621    /// Create mapper for MIT Movie dataset.
622    #[must_use]
623    pub fn mit_movie() -> Self {
624        let mut mapper = Self::new();
625        // Map to standard types where possible
626        mapper.add("ACTOR", EntityType::Person);
627        mapper.add("DIRECTOR", EntityType::Person);
628        mapper.add("CHARACTER", EntityType::Person);
629        mapper.add(
630            "TITLE",
631            EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
632        );
633        mapper.add("GENRE", EntityType::custom("GENRE", EntityCategory::Misc));
634        mapper.add("YEAR", EntityType::Date);
635        mapper.add("RATING", EntityType::custom("RATING", EntityCategory::Misc));
636        mapper.add("PLOT", EntityType::custom("PLOT", EntityCategory::Misc));
637        mapper
638    }
639
640    /// Create mapper for MIT Restaurant dataset.
641    #[must_use]
642    pub fn mit_restaurant() -> Self {
643        let mut mapper = Self::new();
644        mapper.add("RESTAURANT_NAME", EntityType::Organization);
645        mapper.add("LOCATION", EntityType::Location);
646        mapper.add(
647            "CUISINE",
648            EntityType::custom("CUISINE", EntityCategory::Misc),
649        );
650        mapper.add("DISH", EntityType::custom("DISH", EntityCategory::Misc));
651        mapper.add("PRICE", EntityType::Money);
652        mapper.add(
653            "AMENITY",
654            EntityType::custom("AMENITY", EntityCategory::Misc),
655        );
656        mapper.add("HOURS", EntityType::Time);
657        mapper
658    }
659
660    /// Create mapper for biomedical datasets (BC5CDR, NCBI).
661    #[must_use]
662    pub fn biomedical() -> Self {
663        let mut mapper = Self::new();
664        mapper.add(
665            "DISEASE",
666            EntityType::custom("DISEASE", EntityCategory::Agent),
667        );
668        mapper.add(
669            "CHEMICAL",
670            EntityType::custom("CHEMICAL", EntityCategory::Misc),
671        );
672        mapper.add("DRUG", EntityType::custom("DRUG", EntityCategory::Misc));
673        mapper.add("GENE", EntityType::custom("GENE", EntityCategory::Misc));
674        mapper.add(
675            "PROTEIN",
676            EntityType::custom("PROTEIN", EntityCategory::Misc),
677        );
678        // GENIA types
679        mapper.add("DNA", EntityType::custom("DNA", EntityCategory::Misc));
680        mapper.add("RNA", EntityType::custom("RNA", EntityCategory::Misc));
681        mapper.add(
682            "cell_line",
683            EntityType::custom("CELL_LINE", EntityCategory::Misc),
684        );
685        mapper.add(
686            "cell_type",
687            EntityType::custom("CELL_TYPE", EntityCategory::Misc),
688        );
689        mapper
690    }
691
692    /// Create mapper for social media NER datasets (TweetNER7, etc.).
693    #[must_use]
694    pub fn social_media() -> Self {
695        let mut mapper = Self::new();
696        // TweetNER7 types
697        mapper.add("person", EntityType::Person);
698        mapper.add("corporation", EntityType::Organization);
699        mapper.add("location", EntityType::Location);
700        mapper.add("group", EntityType::Organization);
701        mapper.add(
702            "product",
703            EntityType::custom("PRODUCT", EntityCategory::Misc),
704        );
705        mapper.add(
706            "creative_work",
707            EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
708        );
709        mapper.add("event", EntityType::custom("EVENT", EntityCategory::Misc));
710        mapper
711    }
712
713    /// Create mapper for manufacturing domain datasets (FabNER, etc.).
714    #[must_use]
715    pub fn manufacturing() -> Self {
716        let mut mapper = Self::new();
717        // FabNER entity types
718        mapper.add("MATE", EntityType::custom("MATERIAL", EntityCategory::Misc));
719        mapper.add("MANP", EntityType::custom("PROCESS", EntityCategory::Misc));
720        mapper.add("MACEQ", EntityType::custom("MACHINE", EntityCategory::Misc));
721        mapper.add(
722            "APPL",
723            EntityType::custom("APPLICATION", EntityCategory::Misc),
724        );
725        mapper.add("FEAT", EntityType::custom("FEATURE", EntityCategory::Misc));
726        mapper.add(
727            "PARA",
728            EntityType::custom("PARAMETER", EntityCategory::Misc),
729        );
730        mapper.add("PRO", EntityType::custom("PROPERTY", EntityCategory::Misc));
731        mapper.add(
732            "CHAR",
733            EntityType::custom("CHARACTERISTIC", EntityCategory::Misc),
734        );
735        mapper.add(
736            "ENAT",
737            EntityType::custom("ENABLING_TECHNOLOGY", EntityCategory::Misc),
738        );
739        mapper.add(
740            "CONPRI",
741            EntityType::custom("CONCEPT_PRINCIPLE", EntityCategory::Misc),
742        );
743        mapper.add(
744            "BIOP",
745            EntityType::custom("BIO_PROCESS", EntityCategory::Misc),
746        );
747        mapper.add(
748            "MANS",
749            EntityType::custom("MAN_STANDARD", EntityCategory::Misc),
750        );
751        mapper
752    }
753
754    /// Add a mapping from source label to target type.
755    pub fn add(&mut self, source: impl Into<String>, target: EntityType) {
756        self.mappings.insert(source.into().to_uppercase(), target);
757    }
758
759    /// Get mapped type for a label (returns None if not mapped).
760    #[must_use]
761    pub fn map(&self, label: &str) -> Option<&EntityType> {
762        self.mappings.get(&label.to_uppercase())
763    }
764
765    /// Normalize a label to EntityType, using mapping if available.
766    ///
767    /// Falls back to `EntityType::from_label()` if no mapping exists.
768    #[must_use]
769    pub fn normalize(&self, label: &str) -> EntityType {
770        self.map(label)
771            .cloned()
772            .unwrap_or_else(|| EntityType::from_label(label))
773    }
774
775    /// Check if a label is mapped.
776    #[must_use]
777    pub fn contains(&self, label: &str) -> bool {
778        self.mappings.contains_key(&label.to_uppercase())
779    }
780
781    /// Get all source labels.
782    pub fn labels(&self) -> impl Iterator<Item = &String> {
783        self.mappings.keys()
784    }
785}
786
/// Extraction method used to identify an entity.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must keep a
/// wildcard arm when matching on it. `Default` yields [`ExtractionMethod::Neural`].
///
/// # Research Context
///
/// Different extraction methods have different strengths:
///
/// | Method | Precision | Recall | Generalization | Use Case |
/// |--------|-----------|--------|----------------|----------|
/// | Pattern | Very High | Low | N/A (format-based) | Dates, emails, money |
/// | Neural | High | High | Good | General NER |
/// | Lexicon | Very High | Low | None | Closed-domain entities |
/// | SoftLexicon | Medium | High | Good for rare types | Low-resource NER |
/// | GatedEnsemble | Highest | Highest | Contextual | Short texts, domain shift |
///
/// # Legacy Variants
///
/// `Rule`, `ML` and `Ensemble` are deprecated aliases kept for backward
/// compatibility. They remain distinct variants (they do not compare equal to
/// their replacements), but `Display` renders them with the replacement's name.
///
/// See `docs/` for repo-local notes and entry points.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
#[non_exhaustive]
pub enum ExtractionMethod {
    /// Regex pattern matching (high precision for structured data like dates, money).
    /// Does not generalize: it only detects format-based entities.
    Pattern,

    /// Neural model inference (BERT, GLiNER, etc.).
    /// The recommended default for general NER; generalizes to unseen entities.
    /// This is the variant returned by `ExtractionMethod::default()`.
    #[default]
    Neural,

    /// Exact lexicon/gazetteer lookup (deprecated approach).
    /// High precision on known entities, zero recall on novel entities.
    /// Only use for closed domains (stock tickers, medical codes).
    #[deprecated(since = "0.2.0", note = "Use Neural or GatedEnsemble instead")]
    Lexicon,

    /// Embedding-based soft lexicon matching.
    /// Useful for low-resource languages and rare entity types.
    /// See: Rijhwani et al. (2020) "Soft Gazetteers for Low-Resource NER"
    SoftLexicon,

    /// Gated ensemble: neural + lexicon with learned weighting.
    /// The model learns when to trust the lexicon vs. the context.
    /// See: Nie et al. (2021) "GEMNET: Effective Gated Gazetteer Representations"
    GatedEnsemble,

    /// Multiple methods agreed on this entity (high confidence).
    Consensus,

    /// Heuristic-based extraction (capitalization, word shape, context).
    /// Used by heuristic backends that don't use neural models.
    Heuristic,

    /// Unknown or unspecified extraction method.
    Unknown,

    /// Legacy rule-based extraction (for backward compatibility).
    /// Displayed as `"heuristic"`.
    #[deprecated(since = "0.2.0", note = "Use Heuristic or Pattern instead")]
    Rule,

    /// Legacy alias for Neural (for backward compatibility).
    /// Displayed as `"neural"`.
    #[deprecated(since = "0.2.0", note = "Use Neural instead")]
    ML,

    /// Legacy alias for Consensus (for backward compatibility).
    /// Displayed as `"consensus"`.
    #[deprecated(since = "0.2.0", note = "Use Consensus instead")]
    Ensemble,
}
852
853impl ExtractionMethod {
854    /// Returns true if this extraction method produces probabilistically calibrated
855    /// confidence scores suitable for calibration analysis (ECE, Brier score, etc.).
856    ///
857    /// # Calibrated Methods
858    ///
859    /// - **Neural**: Softmax outputs are intended to be probabilistic (though may need
860    ///   temperature scaling for true calibration)
861    /// - **GatedEnsemble**: Produces learned probability estimates
862    /// - **SoftLexicon**: Embedding similarity is pseudo-probabilistic
863    ///
864    /// # Uncalibrated Methods
865    ///
866    /// - **Pattern**: Binary (match/no-match); confidence is typically hardcoded
867    /// - **Heuristic**: Arbitrary scores from hand-crafted rules
868    /// - **Lexicon**: Binary exact match
869    /// - **Consensus**: Agreement count, not a probability
870    ///
871    /// # Example
872    ///
873    /// ```rust
874    /// use anno_core::ExtractionMethod;
875    ///
876    /// assert!(ExtractionMethod::Neural.is_calibrated());
877    /// assert!(!ExtractionMethod::Pattern.is_calibrated());
878    /// assert!(!ExtractionMethod::Heuristic.is_calibrated());
879    /// ```
880    #[must_use]
881    pub const fn is_calibrated(&self) -> bool {
882        #[allow(deprecated)]
883        match self {
884            ExtractionMethod::Neural => true,
885            ExtractionMethod::GatedEnsemble => true,
886            ExtractionMethod::SoftLexicon => true,
887            ExtractionMethod::ML => true, // Legacy alias for Neural
888            // Everything else is not calibrated
889            ExtractionMethod::Pattern => false,
890            ExtractionMethod::Lexicon => false,
891            ExtractionMethod::Consensus => false,
892            ExtractionMethod::Heuristic => false,
893            ExtractionMethod::Unknown => false,
894            ExtractionMethod::Rule => false,
895            ExtractionMethod::Ensemble => false,
896        }
897    }
898
899    /// Returns the confidence interpretation for this extraction method.
900    ///
901    /// This helps users understand what the confidence score means:
902    /// - `"probability"`: Score approximates P(correct)
903    /// - `"heuristic_score"`: Score is a non-probabilistic quality measure
904    /// - `"binary"`: Score is 0 or 1 (or a fixed value for matches)
905    #[must_use]
906    pub const fn confidence_interpretation(&self) -> &'static str {
907        #[allow(deprecated)]
908        match self {
909            ExtractionMethod::Neural | ExtractionMethod::ML => "probability",
910            ExtractionMethod::GatedEnsemble | ExtractionMethod::SoftLexicon => "probability",
911            ExtractionMethod::Pattern | ExtractionMethod::Lexicon => "binary",
912            ExtractionMethod::Heuristic | ExtractionMethod::Rule => "heuristic_score",
913            ExtractionMethod::Consensus | ExtractionMethod::Ensemble => "agreement_ratio",
914            ExtractionMethod::Unknown => "unknown",
915        }
916    }
917}
918
919impl std::fmt::Display for ExtractionMethod {
920    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
921        #[allow(deprecated)]
922        match self {
923            ExtractionMethod::Pattern => write!(f, "pattern"),
924            ExtractionMethod::Neural => write!(f, "neural"),
925            ExtractionMethod::Lexicon => write!(f, "lexicon"),
926            ExtractionMethod::SoftLexicon => write!(f, "soft_lexicon"),
927            ExtractionMethod::GatedEnsemble => write!(f, "gated_ensemble"),
928            ExtractionMethod::Consensus => write!(f, "consensus"),
929            ExtractionMethod::Heuristic => write!(f, "heuristic"),
930            ExtractionMethod::Unknown => write!(f, "unknown"),
931            ExtractionMethod::Rule => write!(f, "heuristic"), // Legacy alias
932            ExtractionMethod::ML => write!(f, "neural"),      // Legacy alias
933            ExtractionMethod::Ensemble => write!(f, "consensus"), // Legacy alias
934        }
935    }
936}
937
938// =============================================================================
939// Lexicon Traits
940// =============================================================================
941
942/// Exact-match lexicon/gazetteer for entity lookup.
943///
944/// # Research Context
945///
946/// Gazetteers (lists of known entities) are a classic NER technique. Modern research
947/// suggests they are most valuable when:
948///
949/// 1. **Domain is closed**: Stock tickers, medical codes, known product catalogs
950/// 2. **Text is short**: where context is insufficient
951/// 3. **Used as features**: Input to neural model, not final output (Song et al. 2020)
952///
953/// They're harmful when:
954/// 1. **Domain is open**: Novel entities not in the list get missed
955/// 2. **Used as authority**: Hardcoded lookups inflate test scores but fail in production
956///
957/// # When to Use
958///
959/// ```text
960/// Decision: Should I use a Lexicon?
961///
962/// Is entity type CLOSED (fixed, known list)?
963/// ├─ Yes: Lexicon is appropriate
964/// │       Examples: stock tickers, ICD-10 codes, country names
965/// └─ No:  Use Neural extraction instead
966///         Examples: person names, organization names, products
967/// ```
968///
969/// # Example
970///
971/// ```rust
972/// use anno_core::{Lexicon, EntityType, HashMapLexicon};
973///
974/// // Create a domain-specific lexicon
975/// let mut lexicon = HashMapLexicon::new("stock_tickers");
976/// lexicon.insert("AAPL", EntityType::Organization, 0.99);
977/// lexicon.insert("GOOGL", EntityType::Organization, 0.99);
978///
979/// // Lookup
980/// if let Some((entity_type, confidence)) = lexicon.lookup("AAPL") {
981///     assert_eq!(entity_type, EntityType::Organization);
982///     assert!(confidence > 0.9);
983/// }
984/// ```
985///
986/// # References
987///
988/// - Song et al. (2020). "Improving Neural NER with Gazetteers"
989/// - Nie et al. (2021). "GEMNET: Effective Gated Gazetteer Representations"
990/// - Rijhwani et al. (2020). "Soft Gazetteers for Low-Resource NER"
991pub trait Lexicon: Send + Sync {
992    /// Lookup an exact string, returning entity type and confidence if found.
993    ///
994    /// Returns `None` if the text is not in the lexicon.
995    fn lookup(&self, text: &str) -> Option<(EntityType, Confidence)>;
996
997    /// Check if the lexicon contains this exact string.
998    fn contains(&self, text: &str) -> bool {
999        self.lookup(text).is_some()
1000    }
1001
1002    /// Get the lexicon source identifier (for provenance tracking).
1003    fn source(&self) -> &str;
1004
1005    /// Get approximate number of entries (for debugging/metrics).
1006    fn len(&self) -> usize;
1007
1008    /// Check if lexicon is empty.
1009    fn is_empty(&self) -> bool {
1010        self.len() == 0
1011    }
1012}
1013
/// Simple HashMap-based lexicon implementation.
///
/// Entries are keyed by the exact text that was inserted; lookups perform no
/// case folding or other normalization.
///
/// Suitable for small to medium lexicons (<100k entries).
/// For larger lexicons, consider a trie-based or FST implementation.
#[derive(Debug, Clone)]
pub struct HashMapLexicon {
    // Entry text -> (entity type, confidence) reported on a match.
    entries: std::collections::HashMap<String, (EntityType, Confidence)>,
    // Source identifier returned by `Lexicon::source` (provenance tracking).
    source: String,
}
1023
1024impl HashMapLexicon {
1025    /// Create a new empty lexicon with the given source identifier.
1026    #[must_use]
1027    pub fn new(source: impl Into<String>) -> Self {
1028        Self {
1029            entries: std::collections::HashMap::new(),
1030            source: source.into(),
1031        }
1032    }
1033
1034    /// Insert an entry into the lexicon.
1035    pub fn insert(
1036        &mut self,
1037        text: impl Into<String>,
1038        entity_type: EntityType,
1039        confidence: impl Into<Confidence>,
1040    ) {
1041        self.entries
1042            .insert(text.into(), (entity_type, confidence.into()));
1043    }
1044
1045    /// Create from an iterator of (text, type, confidence) tuples.
1046    pub fn from_iter<I, S, C>(source: impl Into<String>, entries: I) -> Self
1047    where
1048        I: IntoIterator<Item = (S, EntityType, C)>,
1049        S: Into<String>,
1050        C: Into<Confidence>,
1051    {
1052        let mut lexicon = Self::new(source);
1053        for (text, entity_type, conf) in entries {
1054            lexicon.insert(text, entity_type, conf);
1055        }
1056        lexicon
1057    }
1058
1059    /// Get all entries as an iterator (for debugging).
1060    pub fn entries(&self) -> impl Iterator<Item = (&str, &EntityType, Confidence)> {
1061        self.entries.iter().map(|(k, (t, c))| (k.as_str(), t, *c))
1062    }
1063}
1064
1065impl Lexicon for HashMapLexicon {
1066    fn lookup(&self, text: &str) -> Option<(EntityType, Confidence)> {
1067        self.entries.get(text).cloned()
1068    }
1069
1070    fn source(&self) -> &str {
1071        &self.source
1072    }
1073
1074    fn len(&self) -> usize {
1075        self.entries.len()
1076    }
1077}
1078
/// Provenance information for an extracted entity.
///
/// Tracks where an entity came from for debugging, explainability,
/// and confidence calibration in hybrid/ensemble systems.
///
/// String fields use `Cow<'static, str>` so that static names can be stored
/// without allocating while owned (runtime) names are still accepted.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub struct Provenance {
    /// Name of the backend that produced this entity (e.g., "pattern", "bert-onnx")
    pub source: Cow<'static, str>,
    /// Extraction method used
    pub method: ExtractionMethod,
    /// Specific pattern/rule name (for pattern/rule-based extraction)
    pub pattern: Option<Cow<'static, str>>,
    /// Raw confidence from the source model (before any calibration)
    pub raw_confidence: Option<Confidence>,
    /// Model version for reproducibility (e.g., "gliner-v2.1", "bert-base-uncased-2024-01").
    /// Omitted from serialized output when `None`; defaults to `None` when absent on input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model_version: Option<Cow<'static, str>>,
    /// Timestamp when extraction occurred (ISO 8601).
    /// Omitted from serialized output when `None`; defaults to `None` when absent on input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub timestamp: Option<String>,
}
1100
1101impl Provenance {
1102    /// Create provenance for regex-based extraction.
1103    #[must_use]
1104    pub fn pattern(pattern_name: &'static str) -> Self {
1105        Self {
1106            source: Cow::Borrowed("pattern"),
1107            method: ExtractionMethod::Pattern,
1108            pattern: Some(Cow::Borrowed(pattern_name)),
1109            raw_confidence: Some(Confidence::ONE), // Patterns are deterministic
1110            model_version: None,
1111            timestamp: None,
1112        }
1113    }
1114
1115    /// Create provenance for ML-based extraction.
1116    ///
1117    /// Accepts both static strings and owned strings:
1118    /// ```rust
1119    /// use anno_core::Provenance;
1120    ///
1121    /// // Static string (zero allocation)
1122    /// let p1 = Provenance::ml("gliner", 0.95);
1123    ///
1124    /// // Owned string (dynamic model name)
1125    /// let model_name = "bert-base";
1126    /// let p2 = Provenance::ml(model_name.to_string(), 0.95);
1127    /// ```
1128    #[must_use]
1129    pub fn ml(model_name: impl Into<Cow<'static, str>>, confidence: impl Into<Confidence>) -> Self {
1130        Self {
1131            source: model_name.into(),
1132            method: ExtractionMethod::Neural,
1133            pattern: None,
1134            raw_confidence: Some(confidence.into()),
1135            model_version: None,
1136            timestamp: None,
1137        }
1138    }
1139
1140    /// Deprecated: Use `ml()` instead which now accepts both static and owned strings.
1141    #[deprecated(
1142        since = "0.2.1",
1143        note = "Use ml() instead, it now accepts owned strings"
1144    )]
1145    #[must_use]
1146    pub fn ml_owned(model_name: impl Into<String>, confidence: impl Into<Confidence>) -> Self {
1147        Self::ml(Cow::Owned(model_name.into()), confidence)
1148    }
1149
1150    /// Create provenance for ensemble/hybrid extraction.
1151    #[must_use]
1152    pub fn ensemble(sources: &'static str) -> Self {
1153        Self {
1154            source: Cow::Borrowed(sources),
1155            method: ExtractionMethod::Consensus,
1156            pattern: None,
1157            raw_confidence: None,
1158            model_version: None,
1159            timestamp: None,
1160        }
1161    }
1162
1163    /// Create provenance with model version for reproducibility.
1164    #[must_use]
1165    pub fn with_version(mut self, version: &'static str) -> Self {
1166        self.model_version = Some(Cow::Borrowed(version));
1167        self
1168    }
1169
1170    /// Create provenance with timestamp.
1171    #[must_use]
1172    pub fn with_timestamp(mut self, timestamp: impl Into<String>) -> Self {
1173        self.timestamp = Some(timestamp.into());
1174        self
1175    }
1176}
1177
1178// ============================================================================
1179// Span Types (Multi-Modal Support)
1180// ============================================================================
1181
/// A span locator for text and visual modalities.
///
/// `Span` is a **simplified subset** of [`grounded::Location`] designed for
/// the detection layer (`Entity`). It covers the most common cases:
///
/// - Text offsets (traditional NER)
/// - Bounding boxes (visual document understanding)
/// - Hybrid (OCR with both text and visual location)
///
/// # Relationship to `Location`
///
/// | `Span` variant | `Location` equivalent |
/// |----------------|-----------------------|
/// | `Text` | `Location::Text` |
/// | `BoundingBox` | `Location::BoundingBox` |
/// | `Hybrid` | `Location::TextWithBbox` |
///
/// For modalities not covered by `Span` (temporal, cuboid, genomic, discontinuous),
/// use `Location` directly via the canonical `Signal` → `Track` → `Identity` pipeline.
///
/// # Conversion
///
/// - `Span → Location`: Always succeeds via `Location::from(&span)`
/// - `Location → Span`: Use `location.to_span()`, returns `None` for unsupported variants
///
/// # Default
///
/// `Span::default()` is the empty text span `Text { start: 0, end: 0 }`.
///
/// [`grounded::Location`]: super::grounded::Location
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum Span {
    /// Text span with **character offsets** (start, end).
    ///
    /// Offsets are Unicode scalar value indices (what `text.chars()` counts),
    /// consistent with `Entity.start/end` and `grounded::Location::Text`.
    Text {
        /// Start character offset (inclusive)
        start: usize,
        /// End character offset (exclusive)
        end: usize,
    },
    /// Visual bounding box (normalized 0.0-1.0 coordinates).
    /// For ColPali: image patch locations.
    ///
    /// The coordinate range is a convention; the constructors do not clamp
    /// or validate the values.
    BoundingBox {
        /// X coordinate (normalized 0.0-1.0)
        x: f32,
        /// Y coordinate (normalized 0.0-1.0)
        y: f32,
        /// Width (normalized 0.0-1.0)
        width: f32,
        /// Height (normalized 0.0-1.0)
        height: f32,
        /// Optional page number (for multi-page documents)
        page: Option<u32>,
    },
    /// Hybrid: both text and visual location (for OCR-verified extraction)
    Hybrid {
        /// Start character offset (inclusive)
        start: usize,
        /// End character offset (exclusive)
        end: usize,
        /// Bounding box for visual location.
        /// Presumably always a `Span::BoundingBox`; the type itself does not
        /// enforce this (any `Span` can be boxed here).
        bbox: Box<Span>,
    },
}
1244
1245impl Span {
1246    /// Create a text span.
1247    #[must_use]
1248    pub const fn text(start: usize, end: usize) -> Self {
1249        Self::Text { start, end }
1250    }
1251
1252    /// Create a bounding box span with normalized coordinates.
1253    #[must_use]
1254    pub fn bbox(x: f32, y: f32, width: f32, height: f32) -> Self {
1255        Self::BoundingBox {
1256            x,
1257            y,
1258            width,
1259            height,
1260            page: None,
1261        }
1262    }
1263
1264    /// Create a bounding box with page number.
1265    #[must_use]
1266    pub fn bbox_on_page(x: f32, y: f32, width: f32, height: f32, page: u32) -> Self {
1267        Self::BoundingBox {
1268            x,
1269            y,
1270            width,
1271            height,
1272            page: Some(page),
1273        }
1274    }
1275
1276    /// Check if this is a text span.
1277    #[must_use]
1278    pub const fn is_text(&self) -> bool {
1279        matches!(self, Self::Text { .. } | Self::Hybrid { .. })
1280    }
1281
1282    /// Check if this has visual location.
1283    #[must_use]
1284    pub const fn is_visual(&self) -> bool {
1285        matches!(self, Self::BoundingBox { .. } | Self::Hybrid { .. })
1286    }
1287
1288    /// Get text offsets if available.
1289    #[must_use]
1290    pub const fn text_offsets(&self) -> Option<(usize, usize)> {
1291        match self {
1292            Self::Text { start, end } => Some((*start, *end)),
1293            Self::Hybrid { start, end, .. } => Some((*start, *end)),
1294            Self::BoundingBox { .. } => None,
1295        }
1296    }
1297
1298    /// Calculate span length for text spans.
1299    #[must_use]
1300    pub fn len(&self) -> usize {
1301        match self {
1302            Self::Text { start, end } => end.saturating_sub(*start),
1303            Self::Hybrid { start, end, .. } => end.saturating_sub(*start),
1304            Self::BoundingBox { .. } => 0,
1305        }
1306    }
1307
1308    /// Check if span is empty.
1309    #[must_use]
1310    pub fn is_empty(&self) -> bool {
1311        self.len() == 0
1312    }
1313}
1314
1315// ============================================================================
1316// Discontinuous Spans (W2NER/ACE-style)
1317// ============================================================================
1318
/// A discontinuous span representing non-contiguous entity mentions.
///
/// Some entities span multiple non-adjacent text regions:
/// - "severe \[pain\] in the \[abdomen\]" → "severe abdominal pain"
/// - "the \[president\] ... \[Obama\]" → coreference
///
/// This is required for:
/// - **Medical NER**: Anatomical modifiers separated from findings
/// - **Legal NER**: Parties referenced across clauses
/// - **W2NER**: Word-word relation grids that detect discontinuous entities
///
/// # Offset Unit (CRITICAL)
///
/// `DiscontinuousSpan` uses **character offsets** (Unicode scalar value indices),
/// consistent with [`Entity::start`](super::entity::Entity::start) /
/// [`Entity::end`](super::entity::Entity::end) and `anno::core::grounded::Location`.
///
/// This is intentionally *not* byte offsets. If you have byte offsets (from regex,
/// `str::find`, tokenizers, etc.), convert them to character offsets first (see
/// `anno::offset::SpanConverter` in the `anno` crate).
///
/// # Example
///
/// ```rust
/// use anno_core::DiscontinuousSpan;
///
/// // "severe pain in the abdomen": the finding "pain" and the anatomical
/// // site "abdomen" belong together but are separated by other words.
/// let span = DiscontinuousSpan::new(vec![
///     7..11,  // "pain"
///     19..26, // "abdomen"
/// ]);
///
/// assert_eq!(span.num_segments(), 2);
/// assert!(span.is_discontinuous());
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DiscontinuousSpan {
    /// Segments, intended to be non-overlapping and sorted by start position.
    /// Each `Range<usize>` represents (start_char, end_char).
    ///
    /// `DiscontinuousSpan::new` sorts by start but does not check for
    /// overlaps, and deserialized values bypass `new` entirely.
    segments: Vec<std::ops::Range<usize>>,
}
1361
1362impl DiscontinuousSpan {
1363    /// Create a new discontinuous span from segments.
1364    ///
1365    /// Segments are sorted and validated (no overlaps).
1366    #[must_use]
1367    pub fn new(mut segments: Vec<std::ops::Range<usize>>) -> Self {
1368        // Sort by start position
1369        segments.sort_by_key(|r| r.start);
1370        Self { segments }
1371    }
1372
1373    /// Create from a single contiguous span.
1374    #[must_use]
1375    #[allow(clippy::single_range_in_vec_init)] // Intentional: contiguous is special case of discontinuous
1376    pub fn contiguous(start: usize, end: usize) -> Self {
1377        Self {
1378            segments: vec![start..end],
1379        }
1380    }
1381
1382    /// Number of segments.
1383    #[must_use]
1384    pub fn num_segments(&self) -> usize {
1385        self.segments.len()
1386    }
1387
1388    /// True if this spans multiple non-adjacent regions.
1389    #[must_use]
1390    pub fn is_discontinuous(&self) -> bool {
1391        self.segments.len() > 1
1392    }
1393
1394    /// True if this is a single contiguous span.
1395    #[must_use]
1396    pub fn is_contiguous(&self) -> bool {
1397        self.segments.len() <= 1
1398    }
1399
1400    /// Get the segments.
1401    #[must_use]
1402    pub fn segments(&self) -> &[std::ops::Range<usize>] {
1403        &self.segments
1404    }
1405
1406    /// Get the overall bounding range (start of first to end of last).
1407    #[must_use]
1408    pub fn bounding_range(&self) -> Option<std::ops::Range<usize>> {
1409        if self.segments.is_empty() {
1410            return None;
1411        }
1412        let start = self.segments.first()?.start;
1413        let end = self.segments.last()?.end;
1414        Some(start..end)
1415    }
1416
1417    /// Total character length (sum of all segments).
1418    ///
1419    #[must_use]
1420    pub fn total_len(&self) -> usize {
1421        self.segments.iter().map(|r| r.end - r.start).sum()
1422    }
1423
1424    /// Extract text from each segment and join with separator.
1425    #[must_use]
1426    pub fn extract_text(&self, text: &str, separator: &str) -> String {
1427        self.segments
1428            .iter()
1429            .map(|r| {
1430                let start = r.start;
1431                let len = r.end.saturating_sub(r.start);
1432                text.chars().skip(start).take(len).collect::<String>()
1433            })
1434            .collect::<Vec<_>>()
1435            .join(separator)
1436    }
1437
1438    /// Check if a character position falls within any segment.
1439    ///
1440    /// # Arguments
1441    ///
1442    /// * `pos` - Character offset to check (Unicode scalar value index)
1443    ///
1444    /// # Returns
1445    ///
1446    /// `true` if the character position falls within any segment of this span.
1447    #[must_use]
1448    pub fn contains(&self, pos: usize) -> bool {
1449        self.segments.iter().any(|r| r.contains(&pos))
1450    }
1451
1452    /// Convert to a regular Span (uses bounding range, loses discontinuity info).
1453    #[must_use]
1454    pub fn to_span(&self) -> Option<Span> {
1455        self.bounding_range().map(|r| Span::Text {
1456            start: r.start,
1457            end: r.end,
1458        })
1459    }
1460}
1461
1462impl From<std::ops::Range<usize>> for DiscontinuousSpan {
1463    fn from(range: std::ops::Range<usize>) -> Self {
1464        Self::contiguous(range.start, range.end)
1465    }
1466}
1467
1468impl Default for Span {
1469    fn default() -> Self {
1470        Self::Text { start: 0, end: 0 }
1471    }
1472}
1473
1474// ============================================================================
1475// Hierarchical Confidence (Coarse-to-Fine)
1476// ============================================================================
1477
/// Hierarchical confidence scores for coarse-to-fine extraction.
///
/// Research (HiNet, InfoHier) shows that extraction benefits from
/// decomposed confidence:
/// - **Linkage**: "Is there ANY entity here?" (binary, fast filter)
/// - **Type**: "What type is it?" (fine-grained classification)
/// - **Boundary**: "Where exactly does it start/end?" (span refinement)
///
/// The three scores are stored independently; `combined()` folds them into a
/// single value using the geometric mean. The `Default` value sets all three
/// to `Confidence::ONE`.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct HierarchicalConfidence {
    /// Coarse: probability that this span contains ANY entity (0.0-1.0)
    /// Used for early filtering in the TPLinker "handshaking" matrix.
    pub linkage: Confidence,
    /// Fine: probability that the type classification is correct (0.0-1.0)
    pub type_score: Confidence,
    /// Boundary: confidence in the exact span boundaries (0.0-1.0)
    /// Low for entities with fuzzy boundaries (e.g., "the CEO" vs "CEO")
    pub boundary: Confidence,
}
1496
1497impl HierarchicalConfidence {
1498    /// Create hierarchical confidence with all scores.
1499    ///
1500    /// Accepts any type convertible to `Confidence` (f32, f64, Confidence).
1501    /// Out-of-range values are clamped to [0.0, 1.0].
1502    #[must_use]
1503    pub fn new(
1504        linkage: impl Into<Confidence>,
1505        type_score: impl Into<Confidence>,
1506        boundary: impl Into<Confidence>,
1507    ) -> Self {
1508        Self {
1509            linkage: linkage.into(),
1510            type_score: type_score.into(),
1511            boundary: boundary.into(),
1512        }
1513    }
1514
1515    /// Create from a single confidence score (legacy compatibility).
1516    /// Assigns same score to all levels.
1517    #[must_use]
1518    pub fn from_single(confidence: impl Into<Confidence>) -> Self {
1519        let c = confidence.into();
1520        Self {
1521            linkage: c,
1522            type_score: c,
1523            boundary: c,
1524        }
1525    }
1526
1527    /// Calculate combined confidence (geometric mean).
1528    /// Geometric mean penalizes low scores more than arithmetic mean.
1529    #[must_use]
1530    pub fn combined(&self) -> Confidence {
1531        let product = self.linkage.value() * self.type_score.value() * self.boundary.value();
1532        Confidence::new(product.powf(1.0 / 3.0))
1533    }
1534
1535    /// Calculate combined confidence as f64 for legacy compatibility.
1536    #[must_use]
1537    pub fn as_f64(&self) -> f64 {
1538        self.combined().value()
1539    }
1540
1541    /// Check if passes minimum threshold at all levels.
1542    #[must_use]
1543    pub fn passes_threshold(&self, linkage_min: f64, type_min: f64, boundary_min: f64) -> bool {
1544        self.linkage >= linkage_min && self.type_score >= type_min && self.boundary >= boundary_min
1545    }
1546}
1547
1548impl Default for HierarchicalConfidence {
1549    fn default() -> Self {
1550        Self {
1551            linkage: Confidence::ONE,
1552            type_score: Confidence::ONE,
1553            boundary: Confidence::ONE,
1554        }
1555    }
1556}
1557
1558impl From<f64> for HierarchicalConfidence {
1559    fn from(confidence: f64) -> Self {
1560        Self::from_single(confidence)
1561    }
1562}
1563
1564impl From<f32> for HierarchicalConfidence {
1565    fn from(confidence: f32) -> Self {
1566        Self::from_single(confidence)
1567    }
1568}
1569
1570impl From<Confidence> for HierarchicalConfidence {
1571    fn from(confidence: Confidence) -> Self {
1572        Self::from_single(confidence)
1573    }
1574}
1575
1576// ============================================================================
1577// Ragged Batch (ModernBERT Unpadding)
1578// ============================================================================
1579
/// A ragged (unpadded) batch for efficient ModernBERT inference.
///
/// ModernBERT achieves its speed advantage by avoiding padding tokens entirely.
/// Instead of `[batch, max_seq_len]`, it uses a single contiguous 1D sequence
/// with offset indices to track document boundaries.
///
/// # Memory Layout
///
/// ```text
/// Traditional (padded):
/// [doc1_tok1, doc1_tok2, PAD, PAD, PAD]  <- wasted compute
/// [doc2_tok1, doc2_tok2, doc2_tok3, PAD, PAD]
///
/// Ragged (unpadded):
/// [doc1_tok1, doc1_tok2, doc2_tok1, doc2_tok2, doc2_tok3]
/// cumulative_offsets: [0, 2, 5]  <- doc1 is [0..2], doc2 is [2..5]
/// ```
///
/// # Invariants
///
/// All fields are public, so the relationships described on them hold for
/// values built by `RaggedBatch::from_sequences` but are not enforced for
/// hand-constructed values.
#[derive(Debug, Clone)]
pub struct RaggedBatch {
    /// Token IDs flattened into a single contiguous array.
    /// Shape: `[total_tokens]` (1D, no padding)
    pub token_ids: Vec<u32>,
    /// Cumulative sequence lengths.
    /// Length: batch_size + 1
    /// Document i spans tokens \[offsets\[i\]..offsets\[i+1\])
    ///
    /// Stored as `u32`; `from_sequences` caps an offset at `u32::MAX` (and logs
    /// a warning) if the running token count no longer fits.
    pub cumulative_offsets: Vec<u32>,
    /// Maximum sequence length in this batch (for kernel bounds).
    pub max_seq_len: usize,
}
1609
1610impl RaggedBatch {
1611    /// Create a new ragged batch from sequences.
1612    pub fn from_sequences(sequences: &[Vec<u32>]) -> Self {
1613        let total_tokens: usize = sequences.iter().map(|s| s.len()).sum();
1614        let mut token_ids = Vec::with_capacity(total_tokens);
1615        let mut cumulative_offsets = Vec::with_capacity(sequences.len() + 1);
1616        let mut max_seq_len = 0;
1617
1618        cumulative_offsets.push(0);
1619        for seq in sequences {
1620            token_ids.extend_from_slice(seq);
1621            // Check for overflow: u32::MAX is 4,294,967,295
1622            // If token_ids.len() exceeds this, we'll truncate (which is a bug)
1623            // but in practice, this is unlikely for reasonable batch sizes
1624            let len = token_ids.len();
1625            if len > u32::MAX as usize {
1626                // This would overflow - use saturating cast to prevent panic
1627                // but log a warning as this indicates a problem
1628                log::warn!(
1629                    "Token count {} exceeds u32::MAX, truncating to {}",
1630                    len,
1631                    u32::MAX
1632                );
1633                cumulative_offsets.push(u32::MAX);
1634            } else {
1635                cumulative_offsets.push(len as u32);
1636            }
1637            max_seq_len = max_seq_len.max(seq.len());
1638        }
1639
1640        Self {
1641            token_ids,
1642            cumulative_offsets,
1643            max_seq_len,
1644        }
1645    }
1646
1647    /// Get the number of documents in this batch.
1648    #[must_use]
1649    pub fn batch_size(&self) -> usize {
1650        self.cumulative_offsets.len().saturating_sub(1)
1651    }
1652
1653    /// Get the total number of tokens (no padding).
1654    #[must_use]
1655    pub fn total_tokens(&self) -> usize {
1656        self.token_ids.len()
1657    }
1658
1659    /// Get token range for a specific document.
1660    #[must_use]
1661    pub fn doc_range(&self, doc_idx: usize) -> Option<std::ops::Range<usize>> {
1662        if doc_idx + 1 < self.cumulative_offsets.len() {
1663            let start = self.cumulative_offsets[doc_idx] as usize;
1664            let end = self.cumulative_offsets[doc_idx + 1] as usize;
1665            Some(start..end)
1666        } else {
1667            None
1668        }
1669    }
1670
1671    /// Get tokens for a specific document.
1672    #[must_use]
1673    pub fn doc_tokens(&self, doc_idx: usize) -> Option<&[u32]> {
1674        self.doc_range(doc_idx).map(|r| &self.token_ids[r])
1675    }
1676
1677    /// Calculate memory saved vs padded batch.
1678    #[must_use]
1679    pub fn padding_savings(&self) -> f64 {
1680        let padded_size = self.batch_size() * self.max_seq_len;
1681        if padded_size == 0 {
1682            return 0.0;
1683        }
1684        1.0 - (self.total_tokens() as f64 / padded_size as f64)
1685    }
1686}
1687
1688// ============================================================================
1689// Span Candidate Generation
1690// ============================================================================
1691
/// One token span proposed as a possible entity mention.
///
/// GLiNER-style bi-encoder pipelines enumerate every span up to a maximum
/// width and then score each one against the entity type embeddings.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SpanCandidate {
    /// Index of the owning document within the batch.
    pub doc_idx: u32,
    /// First token of the span, relative to the start of its document.
    pub start: u32,
    /// One past the last token of the span (exclusive bound).
    pub end: u32,
}

impl SpanCandidate {
    /// Build a candidate from a document index and token bounds.
    #[must_use]
    pub const fn new(doc_idx: u32, start: u32, end: u32) -> Self {
        SpanCandidate { doc_idx, start, end }
    }

    /// Number of tokens covered; an inverted span (`end < start`) counts as zero.
    #[must_use]
    pub const fn width(&self) -> u32 {
        if self.end > self.start {
            self.end - self.start
        } else {
            0
        }
    }
}
1723
1724/// Generate all valid span candidates for a ragged batch.
1725///
1726/// This is the "gnarly" operation in GLiNER - efficiently enumerating
1727/// all valid spans without O(N^2) memory allocation.
1728pub fn generate_span_candidates(batch: &RaggedBatch, max_width: usize) -> Vec<SpanCandidate> {
1729    let mut candidates = Vec::new();
1730
1731    for doc_idx in 0..batch.batch_size() {
1732        if let Some(range) = batch.doc_range(doc_idx) {
1733            let doc_len = range.len();
1734            // Generate all spans [i, j) where j - i <= max_width
1735            for start in 0..doc_len {
1736                let max_end = (start + max_width).min(doc_len);
1737                for end in (start + 1)..=max_end {
1738                    candidates.push(SpanCandidate::new(doc_idx as u32, start as u32, end as u32));
1739                }
1740            }
1741        }
1742    }
1743
1744    candidates
1745}
1746
1747/// Generate span candidates with early filtering.
1748///
1749/// Uses a linkage mask to skip low-probability spans (TPLinker optimization).
1750pub fn generate_filtered_candidates(
1751    batch: &RaggedBatch,
1752    max_width: usize,
1753    linkage_mask: &[f32],
1754    threshold: f32,
1755) -> Vec<SpanCandidate> {
1756    let mut candidates = Vec::new();
1757    let mut mask_idx = 0;
1758
1759    for doc_idx in 0..batch.batch_size() {
1760        if let Some(range) = batch.doc_range(doc_idx) {
1761            let doc_len = range.len();
1762            for start in 0..doc_len {
1763                let max_end = (start + max_width).min(doc_len);
1764                for end in (start + 1)..=max_end {
1765                    // Only include if linkage probability exceeds threshold
1766                    if mask_idx < linkage_mask.len() && linkage_mask[mask_idx] >= threshold {
1767                        candidates.push(SpanCandidate::new(
1768                            doc_idx as u32,
1769                            start as u32,
1770                            end as u32,
1771                        ));
1772                    }
1773                    mask_idx += 1;
1774                }
1775            }
1776        }
1777    }
1778
1779    candidates
1780}
1781
1782// ============================================================================
1783// Entity (Extended)
1784// ============================================================================
1785
/// A recognized named entity or relation trigger.
///
/// # Entity Structure
///
/// ```text
/// "Contact John at john@example.com on Jan 15"
///          ^^^^    ^^^^^^^^^^^^^^^^    ^^^^^^
///          PER     EMAIL               DATE
///          |       |                   |
///          Named   Contact             Temporal
///          (ML)    (Pattern)           (Pattern)
/// ```
///
/// # Core Fields (Stable API)
///
/// - `text`, `entity_type`, `start`, `end`, `confidence` — always present
/// - `normalized`, `provenance` — commonly used optional fields
/// - `kb_id`, `canonical_id` — knowledge graph and coreference support
///
/// # Extended Fields (Research/Experimental)
///
/// The following fields support advanced research applications but may evolve:
///
/// | Field | Purpose | Status |
/// |-------|---------|--------|
/// | `visual_span` | Multi-modal (ColPali) extraction | Experimental |
/// | `discontinuous_span` | W2NER non-contiguous entities | Experimental |
/// | `valid_from`, `valid_until` | Temporal knowledge graphs | Research |
/// | `viewport` | Multi-faceted entity representation | Research |
/// | `hierarchical_confidence` | Coarse-to-fine NER | Experimental |
///
/// `phi_features` and `mention_type` are further optional annotations used
/// for coreference resolution.
///
/// All optional fields are `#[serde(default, skip_serializing_if = "Option::is_none")]`:
/// they are omitted from serialized output when unset and default to `None`
/// when absent from the input.
///
/// # Knowledge Graph Support
///
/// For GraphRAG and coreference resolution, entities support:
/// - `kb_id`: External knowledge base identifier (e.g., Wikidata Q-ID)
/// - `canonical_id`: Local coreference cluster ID (links "John" and "he")
///
/// # Normalization
///
/// Entities can have a normalized form for downstream processing:
/// - Dates: "Jan 15" → "2024-01-15" (ISO 8601)
/// - Money: "$1.5M" → "1500000 USD"
/// - Locations: "NYC" → "New York City"
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Entity {
    /// Entity text (surface form as it appears in source)
    pub text: String,
    /// Entity type classification
    pub entity_type: EntityType,
    /// Start position (character offset, NOT byte offset).
    ///
    /// For Unicode text, character offsets differ from byte offsets.
    /// Use `anno::offset::bytes_to_chars` to convert if needed.
    pub start: usize,
    /// End position (character offset, exclusive).
    ///
    /// For Unicode text, character offsets differ from byte offsets.
    /// Use `anno::offset::bytes_to_chars` to convert if needed.
    pub end: usize,
    /// Confidence score (0.0-1.0, calibrated).
    ///
    /// Construction via [`Confidence::new`] clamps to `[0.0, 1.0]`.
    /// Use `.value()` or `Into<f64>` to extract the raw score.
    pub confidence: Confidence,
    /// Normalized/canonical form (e.g., "Jan 15" → "2024-01-15")
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub normalized: Option<String>,
    /// Provenance: which backend/method produced this entity
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub provenance: Option<Provenance>,
    /// External knowledge base ID (e.g., "Q7186" for Marie Curie in Wikidata).
    /// Used for entity linking and GraphRAG applications.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub kb_id: Option<String>,
    /// Local coreference cluster ID.
    /// Multiple mentions with the same `canonical_id` refer to the same entity.
    /// Example: "Marie Curie" and "she" might share `canonical_id = CanonicalId(42)`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub canonical_id: Option<super::types::CanonicalId>,
    /// Hierarchical confidence (coarse-to-fine).
    /// Provides linkage, type, and boundary scores separately.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub hierarchical_confidence: Option<HierarchicalConfidence>,
    /// Visual span for multi-modal (ColPali) extraction.
    /// When set, provides bounding box location in addition to text offsets.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub visual_span: Option<Span>,
    /// Discontinuous span for non-contiguous entity mentions (W2NER support).
    /// When set, overrides `start`/`end` for length calculations.
    /// Example: "New York and LA \[airports\]" where "airports" modifies both.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub discontinuous_span: Option<DiscontinuousSpan>,
    // =========================================================================
    // Temporal Validity (Research: Temporal Knowledge Graphs)
    // =========================================================================
    /// Start of temporal validity interval for this entity assertion.
    ///
    /// Entities are facts that may change over time:
    /// - "Satya Nadella is CEO of Microsoft" is valid from [2014, present]
    /// - "Steve Ballmer was CEO of Microsoft" was valid from [2000, 2014]
    ///
    /// When `None`, either:
    /// - the start of validity is unknown (only `valid_until` is set), or
    /// - the entity is atemporal (both bounds are `None`; a timeless fact
    ///   like "Paris is in France").
    ///
    /// Whether the assertion is still in force is expressed by
    /// `valid_until`, not by this field.
    ///
    /// # Example
    /// ```rust
    /// use anno_core::{Entity, EntityType};
    /// use chrono::{TimeZone, Utc};
    ///
    /// let mut entity = Entity::new("CEO of Microsoft", EntityType::Person, 0, 16, 0.9);
    /// entity.valid_from = Some(Utc.with_ymd_and_hms(2008, 10, 1, 0, 0, 0).unwrap());
    /// ```
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub valid_from: Option<chrono::DateTime<chrono::Utc>>,
    /// End of temporal validity interval for this entity assertion.
    ///
    /// When `None` and `valid_from` is set, the fact is currently valid.
    /// When both are `None`, the entity is atemporal.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub valid_until: Option<chrono::DateTime<chrono::Utc>>,
    // =========================================================================
    // Viewport / Context (Research: Entity Manifolds)
    // =========================================================================
    /// Viewport context for multi-faceted entity representation.
    ///
    /// The same real-world entity can have different "faces" in different contexts:
    /// - "Marie Curie" in an academic context: professor, researcher
    /// - "Marie Curie" in a scientific context: physicist, chemist
    /// - "Marie Curie" in a personal context: mother, educator
    ///
    /// This enables "holographic" entity projection at query time:
    /// given a query context, project the entity manifold to the relevant viewport.
    ///
    /// # Example
    /// ```rust
    /// use anno_core::{Entity, EntityType, EntityViewport};
    ///
    /// let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
    /// entity.viewport = Some(EntityViewport::Academic);
    /// ```
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub viewport: Option<EntityViewport>,
    /// Phi-features (person, number, gender) for morphological agreement.
    ///
    /// Used for coreference constraints and zero pronoun resolution.
    /// In pro-drop languages (Arabic, Spanish, Japanese), verb morphology
    /// encodes subject features even when the pronoun is dropped.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub phi_features: Option<PhiFeatures>,
    /// Mention type classification (Proper, Nominal, Pronominal, Zero).
    ///
    /// Classifies the referring expression type for coreference resolution.
    /// Follows the Accessibility Hierarchy (Ariel 1990):
    /// Proper > Nominal > Pronominal > Zero.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub mention_type: Option<MentionType>,
}
1947
1948impl Entity {
1949    /// Create a new entity.
1950    ///
1951    /// ```
1952    /// use anno_core::{Entity, EntityType};
1953    ///
1954    /// let e = Entity::new("Berlin", EntityType::Location, 10, 16, 0.95);
1955    /// assert_eq!(e.text, "Berlin");
1956    /// assert_eq!(e.entity_type, EntityType::Location);
1957    /// assert_eq!((e.start, e.end), (10, 16));
1958    /// ```
1959    #[must_use]
1960    pub fn new(
1961        text: impl Into<String>,
1962        entity_type: EntityType,
1963        start: usize,
1964        end: usize,
1965        confidence: impl Into<Confidence>,
1966    ) -> Self {
1967        Self {
1968            text: text.into(),
1969            entity_type,
1970            start,
1971            end,
1972            confidence: confidence.into(),
1973            normalized: None,
1974            provenance: None,
1975            kb_id: None,
1976            canonical_id: None,
1977            hierarchical_confidence: None,
1978            visual_span: None,
1979            discontinuous_span: None,
1980            valid_from: None,
1981            valid_until: None,
1982            viewport: None,
1983            phi_features: None,
1984            mention_type: None,
1985        }
1986    }
1987
1988    /// Create a new entity with provenance information.
1989    #[must_use]
1990    pub fn with_provenance(
1991        text: impl Into<String>,
1992        entity_type: EntityType,
1993        start: usize,
1994        end: usize,
1995        confidence: impl Into<Confidence>,
1996        provenance: Provenance,
1997    ) -> Self {
1998        Self {
1999            text: text.into(),
2000            entity_type,
2001            start,
2002            end,
2003            confidence: confidence.into(),
2004            normalized: None,
2005            provenance: Some(provenance),
2006            kb_id: None,
2007            canonical_id: None,
2008            hierarchical_confidence: None,
2009            visual_span: None,
2010            discontinuous_span: None,
2011            valid_from: None,
2012            valid_until: None,
2013            viewport: None,
2014            phi_features: None,
2015            mention_type: None,
2016        }
2017    }
2018
2019    /// Create an entity with hierarchical confidence scores.
2020    #[must_use]
2021    pub fn with_hierarchical_confidence(
2022        text: impl Into<String>,
2023        entity_type: EntityType,
2024        start: usize,
2025        end: usize,
2026        confidence: HierarchicalConfidence,
2027    ) -> Self {
2028        Self {
2029            text: text.into(),
2030            entity_type,
2031            start,
2032            end,
2033            confidence: Confidence::new(confidence.as_f64()),
2034            normalized: None,
2035            provenance: None,
2036            kb_id: None,
2037            canonical_id: None,
2038            hierarchical_confidence: Some(confidence),
2039            visual_span: None,
2040            discontinuous_span: None,
2041            valid_from: None,
2042            valid_until: None,
2043            viewport: None,
2044            phi_features: None,
2045            mention_type: None,
2046        }
2047    }
2048
2049    /// Create an entity from a visual bounding box (ColPali multi-modal).
2050    #[must_use]
2051    pub fn from_visual(
2052        text: impl Into<String>,
2053        entity_type: EntityType,
2054        bbox: Span,
2055        confidence: impl Into<Confidence>,
2056    ) -> Self {
2057        Self {
2058            text: text.into(),
2059            entity_type,
2060            start: 0,
2061            end: 0,
2062            confidence: confidence.into(),
2063            normalized: None,
2064            provenance: None,
2065            kb_id: None,
2066            canonical_id: None,
2067            hierarchical_confidence: None,
2068            visual_span: Some(bbox),
2069            discontinuous_span: None,
2070            valid_from: None,
2071            valid_until: None,
2072            viewport: None,
2073            phi_features: None,
2074            mention_type: None,
2075        }
2076    }
2077
2078    /// Create an entity with default confidence (1.0).
2079    #[must_use]
2080    pub fn with_type(
2081        text: impl Into<String>,
2082        entity_type: EntityType,
2083        start: usize,
2084        end: usize,
2085    ) -> Self {
2086        Self::new(text, entity_type, start, end, 1.0)
2087    }
2088
2089    /// Link this entity to an external knowledge base.
2090    ///
2091    /// # Examples
2092    /// ```
2093    /// use anno_core::{Entity, EntityType};
2094    /// let mut e = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
2095    /// e.link_to_kb("Q7186");
2096    /// assert_eq!(e.kb_id.as_deref(), Some("Q7186"));
2097    /// ```
2098    pub fn link_to_kb(&mut self, kb_id: impl Into<String>) {
2099        self.kb_id = Some(kb_id.into());
2100    }
2101
2102    /// Assign this entity to a coreference cluster.
2103    ///
2104    /// Entities with the same `canonical_id` refer to the same real-world entity.
2105    pub fn set_canonical(&mut self, canonical_id: impl Into<super::types::CanonicalId>) {
2106        self.canonical_id = Some(canonical_id.into());
2107    }
2108
2109    /// Builder-style method to set canonical ID.
2110    ///
2111    /// # Example
2112    /// ```
2113    /// use anno_core::{CanonicalId, Entity, EntityType};
2114    /// let entity = Entity::new("John", EntityType::Person, 0, 4, 0.9)
2115    ///     .with_canonical_id(42);
2116    /// assert_eq!(entity.canonical_id, Some(CanonicalId::new(42)));
2117    /// ```
2118    #[must_use]
2119    pub fn with_canonical_id(mut self, canonical_id: impl Into<super::types::CanonicalId>) -> Self {
2120        self.canonical_id = Some(canonical_id.into());
2121        self
2122    }
2123
2124    /// Check if this entity is linked to a knowledge base.
2125    #[must_use]
2126    pub fn is_linked(&self) -> bool {
2127        self.kb_id.is_some()
2128    }
2129
2130    /// Check if this entity has coreference information.
2131    #[must_use]
2132    pub fn has_coreference(&self) -> bool {
2133        self.canonical_id.is_some()
2134    }
2135
2136    /// Check if this entity has a discontinuous span.
2137    ///
2138    /// Discontinuous entities span non-contiguous text regions.
2139    /// Example: "New York and LA airports" contains "New York airports"
2140    /// as a discontinuous entity.
2141    #[must_use]
2142    pub fn is_discontinuous(&self) -> bool {
2143        self.discontinuous_span
2144            .as_ref()
2145            .map(|s| s.is_discontinuous())
2146            .unwrap_or(false)
2147    }
2148
2149    /// Get the discontinuous segments if present.
2150    ///
2151    /// Returns `None` if this is a contiguous entity.
2152    #[must_use]
2153    pub fn discontinuous_segments(&self) -> Option<Vec<std::ops::Range<usize>>> {
2154        self.discontinuous_span
2155            .as_ref()
2156            .filter(|s| s.is_discontinuous())
2157            .map(|s| s.segments().to_vec())
2158    }
2159
2160    /// Set a discontinuous span for this entity.
2161    ///
2162    /// This is used by W2NER and similar models that detect non-contiguous mentions.
2163    pub fn set_discontinuous_span(&mut self, span: DiscontinuousSpan) {
2164        // Update start/end to match the bounding range
2165        if let Some(bounding) = span.bounding_range() {
2166            self.start = bounding.start;
2167            self.end = bounding.end;
2168        }
2169        self.discontinuous_span = Some(span);
2170    }
2171
2172    /// Get the total length covered by this entity, in **characters**.
2173    ///
2174    /// - **Contiguous**: `end - start`
2175    /// - **Discontinuous**: sum of segment lengths
2176    ///
2177    /// This is intentionally consistent: all offsets in `anno::core` entity spans
2178    /// are **character offsets** (Unicode scalar values), not byte offsets.
2179    #[must_use]
2180    pub fn total_len(&self) -> usize {
2181        if let Some(ref span) = self.discontinuous_span {
2182            span.segments().iter().map(|r| r.end - r.start).sum()
2183        } else {
2184            self.end.saturating_sub(self.start)
2185        }
2186    }
2187
2188    /// Set the normalized form for this entity.
2189    ///
2190    /// # Examples
2191    ///
2192    /// ```rust
2193    /// use anno_core::{Entity, EntityType};
2194    ///
2195    /// let mut entity = Entity::new("Jan 15", EntityType::Date, 0, 6, 0.95);
2196    /// entity.set_normalized("2024-01-15");
2197    /// assert_eq!(entity.normalized.as_deref(), Some("2024-01-15"));
2198    /// ```
2199    pub fn set_normalized(&mut self, normalized: impl Into<String>) {
2200        self.normalized = Some(normalized.into());
2201    }
2202
2203    /// Get the normalized form, or the original text if not normalized.
2204    #[must_use]
2205    pub fn normalized_or_text(&self) -> &str {
2206        self.normalized.as_deref().unwrap_or(&self.text)
2207    }
2208
2209    /// Get the extraction method, if known.
2210    #[must_use]
2211    pub fn method(&self) -> ExtractionMethod {
2212        self.provenance
2213            .as_ref()
2214            .map_or(ExtractionMethod::Unknown, |p| p.method)
2215    }
2216
2217    /// Get the source backend name, if known.
2218    #[must_use]
2219    pub fn source(&self) -> Option<&str> {
2220        self.provenance.as_ref().map(|p| p.source.as_ref())
2221    }
2222
2223    /// Get the entity category.
2224    #[must_use]
2225    pub fn category(&self) -> EntityCategory {
2226        self.entity_type.category()
2227    }
2228
2229    /// Returns true if this entity was detected via patterns (not ML).
2230    #[must_use]
2231    pub fn is_structured(&self) -> bool {
2232        self.entity_type.pattern_detectable()
2233    }
2234
2235    /// Returns true if this entity required ML for detection.
2236    #[must_use]
2237    pub fn is_named(&self) -> bool {
2238        self.entity_type.requires_ml()
2239    }
2240
2241    /// Check if this entity overlaps with another.
2242    #[must_use]
2243    pub fn overlaps(&self, other: &Entity) -> bool {
2244        !(self.end <= other.start || other.end <= self.start)
2245    }
2246
2247    /// Calculate overlap ratio (IoU) with another entity.
2248    #[must_use]
2249    pub fn overlap_ratio(&self, other: &Entity) -> f64 {
2250        let intersection_start = self.start.max(other.start);
2251        let intersection_end = self.end.min(other.end);
2252
2253        if intersection_start >= intersection_end {
2254            return 0.0;
2255        }
2256
2257        let intersection = (intersection_end - intersection_start) as f64;
2258        let union = ((self.end - self.start) + (other.end - other.start)
2259            - (intersection_end - intersection_start)) as f64;
2260
2261        if union == 0.0 {
2262            return 1.0;
2263        }
2264
2265        intersection / union
2266    }
2267
2268    /// Set hierarchical confidence scores.
2269    pub fn set_hierarchical_confidence(&mut self, confidence: HierarchicalConfidence) {
2270        self.confidence = Confidence::new(confidence.as_f64());
2271        self.hierarchical_confidence = Some(confidence);
2272    }
2273
2274    /// Get the linkage confidence (coarse filter score).
2275    #[must_use]
2276    pub fn linkage_confidence(&self) -> Confidence {
2277        self.hierarchical_confidence
2278            .map_or(self.confidence, |h| h.linkage)
2279    }
2280
2281    /// Get the type classification confidence.
2282    #[must_use]
2283    pub fn type_confidence(&self) -> Confidence {
2284        self.hierarchical_confidence
2285            .map_or(self.confidence, |h| h.type_score)
2286    }
2287
2288    /// Get the boundary confidence.
2289    #[must_use]
2290    pub fn boundary_confidence(&self) -> Confidence {
2291        self.hierarchical_confidence
2292            .map_or(self.confidence, |h| h.boundary)
2293    }
2294
2295    /// Check if this entity has visual location (multi-modal).
2296    #[must_use]
2297    pub fn is_visual(&self) -> bool {
2298        self.visual_span.is_some()
2299    }
2300
2301    /// Get the text span (start, end).
2302    #[must_use]
2303    pub const fn text_span(&self) -> (usize, usize) {
2304        (self.start, self.end)
2305    }
2306
2307    /// Get the span length.
2308    #[must_use]
2309    pub const fn span_len(&self) -> usize {
2310        self.end.saturating_sub(self.start)
2311    }
2312
2313    /// Create a unified TextSpan with both byte and char offsets.
2314    ///
2315    /// This is useful when you need to work with both offset systems.
2316    /// The `text` parameter must be the original source text from which
2317    /// this entity was extracted.
2318    ///
2319    /// # Arguments
2320    /// * `source_text` - The original text (needed to compute byte offsets)
2321    ///
2322    /// # Returns
2323    /// A TextSpan with both byte and char offsets.
2324    ///
2325    /// # Note
2326    ///
2327    /// This method requires the offset conversion utilities from the `anno` crate.
2328    /// Use `anno::offset::char_to_byte_offsets()` directly for now.
2329    ///
2330    /// # Example
2331    /// ```rust,ignore
2332    /// use anno_core::{Entity, EntityType};
2333    ///
2334    /// let (byte_start, byte_end) = char_to_byte_offsets(text, entity.start, entity.end);
2335    /// ```
2336    /// Set visual span for multi-modal extraction.
2337    pub fn set_visual_span(&mut self, span: Span) {
2338        self.visual_span = Some(span);
2339    }
2340
2341    /// Safely extract text from source using character offsets.
2342    ///
2343    /// Entity stores character offsets, not byte offsets. This method
2344    /// correctly extracts text by iterating over characters.
2345    ///
2346    /// # Arguments
2347    /// * `source_text` - The original text from which this entity was extracted
2348    ///
2349    /// # Returns
2350    /// The extracted text, or empty string if offsets are invalid
2351    ///
2352    /// # Example
2353    /// ```rust
2354    /// use anno_core::{Entity, EntityType};
2355    ///
2356    /// let text = "Hello, 日本!";
2357    /// let entity = Entity::new("日本", EntityType::Location, 7, 9, 0.95);
2358    /// assert_eq!(entity.extract_text(text), "日本");
2359    /// ```
2360    #[must_use]
2361    pub fn extract_text(&self, source_text: &str) -> String {
2362        // Performance: Use cached length if available, but fallback to counting
2363        // For single entity extraction, this is fine. For batch operations,
2364        // use extract_text_with_len with pre-computed length.
2365        let char_count = source_text.chars().count();
2366        self.extract_text_with_len(source_text, char_count)
2367    }
2368
2369    /// Extract text with pre-computed text length (performance optimization).
2370    ///
2371    /// Use this when validating/clamping multiple entities from the same text
2372    /// to avoid recalculating `text.chars().count()` for each entity.
2373    ///
2374    /// # Arguments
2375    /// * `source_text` - The original text
2376    /// * `text_char_count` - Pre-computed character count (from `text.chars().count()`)
2377    ///
2378    /// # Returns
2379    /// The extracted text, or empty string if offsets are invalid
2380    #[must_use]
2381    pub fn extract_text_with_len(&self, source_text: &str, text_char_count: usize) -> String {
2382        if self.start >= text_char_count || self.end > text_char_count || self.start >= self.end {
2383            return String::new();
2384        }
2385        source_text
2386            .chars()
2387            .skip(self.start)
2388            .take(self.end - self.start)
2389            .collect()
2390    }
2391
2392    // =========================================================================
2393    // Temporal Validity Methods
2394    // =========================================================================
2395
2396    /// Set the temporal validity start for this entity assertion.
2397    ///
2398    /// # Example
2399    /// ```rust
2400    /// use anno_core::{Entity, EntityType};
2401    /// use chrono::{TimeZone, Utc};
2402    ///
2403    /// let mut entity = Entity::new("CEO", EntityType::Person, 0, 3, 0.9);
2404    /// entity.set_valid_from(Utc.with_ymd_and_hms(2008, 10, 1, 0, 0, 0).unwrap());
2405    /// assert!(entity.is_temporal());
2406    /// ```
2407    pub fn set_valid_from(&mut self, dt: chrono::DateTime<chrono::Utc>) {
2408        self.valid_from = Some(dt);
2409    }
2410
2411    /// Set the temporal validity end for this entity assertion.
2412    pub fn set_valid_until(&mut self, dt: chrono::DateTime<chrono::Utc>) {
2413        self.valid_until = Some(dt);
2414    }
2415
2416    /// Set both temporal bounds at once.
2417    pub fn set_temporal_range(
2418        &mut self,
2419        from: chrono::DateTime<chrono::Utc>,
2420        until: chrono::DateTime<chrono::Utc>,
2421    ) {
2422        self.valid_from = Some(from);
2423        self.valid_until = Some(until);
2424    }
2425
2426    /// Check if this entity has temporal validity information.
2427    #[must_use]
2428    pub fn is_temporal(&self) -> bool {
2429        self.valid_from.is_some() || self.valid_until.is_some()
2430    }
2431
2432    /// Check if this entity was valid at a specific point in time.
2433    ///
2434    /// Returns `true` if:
2435    /// - No temporal bounds are set (atemporal entity)
2436    /// - The timestamp falls within [valid_from, valid_until]
2437    ///
2438    /// # Example
2439    /// ```rust
2440    /// use anno_core::{Entity, EntityType};
2441    /// use chrono::{TimeZone, Utc};
2442    ///
2443    /// let mut entity = Entity::new("CEO of Microsoft", EntityType::Person, 0, 16, 0.9);
2444    /// entity.set_valid_from(Utc.with_ymd_and_hms(2008, 1, 1, 0, 0, 0).unwrap());
2445    /// entity.set_valid_until(Utc.with_ymd_and_hms(2023, 12, 31, 0, 0, 0).unwrap());
2446    ///
2447    /// let query_2015 = Utc.with_ymd_and_hms(2015, 6, 1, 0, 0, 0).unwrap();
2448    /// let query_2005 = Utc.with_ymd_and_hms(2005, 6, 1, 0, 0, 0).unwrap();
2449    ///
2450    /// assert!(entity.valid_at(&query_2015));
2451    /// assert!(!entity.valid_at(&query_2005));
2452    /// ```
2453    #[must_use]
2454    pub fn valid_at(&self, timestamp: &chrono::DateTime<chrono::Utc>) -> bool {
2455        match (&self.valid_from, &self.valid_until) {
2456            (None, None) => true,                      // Atemporal - always valid
2457            (Some(from), None) => timestamp >= from,   // Started, still valid
2458            (None, Some(until)) => timestamp <= until, // Unknown start, ended
2459            (Some(from), Some(until)) => timestamp >= from && timestamp <= until,
2460        }
2461    }
2462
2463    /// Check if this entity is currently valid (at the current time).
2464    #[must_use]
2465    pub fn is_currently_valid(&self) -> bool {
2466        self.valid_at(&chrono::Utc::now())
2467    }
2468
2469    // =========================================================================
2470    // Viewport/Context Methods
2471    // =========================================================================
2472
2473    /// Set the viewport context for this entity.
2474    ///
2475    /// # Example
2476    /// ```rust
2477    /// use anno_core::{Entity, EntityType, EntityViewport};
2478    ///
2479    /// let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
2480    /// entity.set_viewport(EntityViewport::Academic);
2481    /// assert!(entity.has_viewport());
2482    /// ```
2483    pub fn set_viewport(&mut self, viewport: EntityViewport) {
2484        self.viewport = Some(viewport);
2485    }
2486
2487    /// Check if this entity has a viewport context.
2488    #[must_use]
2489    pub fn has_viewport(&self) -> bool {
2490        self.viewport.is_some()
2491    }
2492
2493    /// Get the viewport, defaulting to General if not set.
2494    #[must_use]
2495    pub fn viewport_or_default(&self) -> EntityViewport {
2496        self.viewport.clone().unwrap_or_default()
2497    }
2498
2499    /// Check if this entity matches a viewport context.
2500    ///
2501    /// Returns true if:
2502    /// - The entity has no viewport (matches any)
2503    /// - The entity's viewport matches the query
2504    #[must_use]
2505    pub fn matches_viewport(&self, query_viewport: &EntityViewport) -> bool {
2506        match &self.viewport {
2507            None => true, // No viewport = matches any
2508            Some(v) => v == query_viewport,
2509        }
2510    }
2511
2512    /// Create a builder for fluent entity construction.
2513    #[must_use]
2514    pub fn builder(text: impl Into<String>, entity_type: EntityType) -> EntityBuilder {
2515        EntityBuilder::new(text, entity_type)
2516    }
2517
2518    // =========================================================================
2519    // Validation Methods (Production Quality)
2520    // =========================================================================
2521
2522    /// Validate this entity against the source text.
2523    ///
2524    /// Returns a list of validation issues. Empty list means the entity is valid.
2525    ///
2526    /// # Checks Performed
2527    ///
2528    /// 1. **Span bounds**: `start < end`, both within text length
2529    /// 2. **Text match**: `text` matches the span in source
2530    /// 3. **Confidence range**: `confidence` in [0.0, 1.0]
2531    /// 4. **Type consistency**: Custom types have non-empty names
2532    /// 5. **Discontinuous consistency**: If present, segments are valid
2533    ///
2534    /// # Example
2535    ///
2536    /// ```rust
2537    /// use anno_core::{Entity, EntityType};
2538    ///
2539    /// let text = "John works at Apple";
2540    /// let entity = Entity::new("John", EntityType::Person, 0, 4, 0.95);
2541    ///
2542    /// let issues = entity.validate(text);
2543    /// assert!(issues.is_empty(), "Entity should be valid");
2544    ///
2545    /// // Invalid entity: span doesn't match text
2546    /// let bad = Entity::new("Jane", EntityType::Person, 0, 4, 0.95);
2547    /// let issues = bad.validate(text);
2548    /// assert!(!issues.is_empty(), "Entity text doesn't match span");
2549    /// ```
2550    #[must_use]
2551    pub fn validate(&self, source_text: &str) -> Vec<ValidationIssue> {
2552        // Performance: Calculate length once, delegate to optimized version
2553        let char_count = source_text.chars().count();
2554        self.validate_with_len(source_text, char_count)
2555    }
2556
2557    /// Validate entity with pre-computed text length (performance optimization).
2558    ///
2559    /// Use this when validating multiple entities from the same text to avoid
2560    /// recalculating `text.chars().count()` for each entity.
2561    ///
2562    /// # Arguments
2563    /// * `source_text` - The original text
2564    /// * `text_char_count` - Pre-computed character count (from `text.chars().count()`)
2565    ///
2566    /// # Returns
2567    /// Vector of validation issues (empty if valid)
2568    #[must_use]
2569    pub fn validate_with_len(
2570        &self,
2571        source_text: &str,
2572        text_char_count: usize,
2573    ) -> Vec<ValidationIssue> {
2574        let mut issues = Vec::new();
2575
2576        // 1. Span bounds
2577        if self.start >= self.end {
2578            issues.push(ValidationIssue::InvalidSpan {
2579                start: self.start,
2580                end: self.end,
2581                reason: "start must be less than end".to_string(),
2582            });
2583        }
2584
2585        if self.end > text_char_count {
2586            issues.push(ValidationIssue::SpanOutOfBounds {
2587                end: self.end,
2588                text_len: text_char_count,
2589            });
2590        }
2591
2592        // 2. Text match (only if span is valid)
2593        if self.start < self.end && self.end <= text_char_count {
2594            let actual = self.extract_text_with_len(source_text, text_char_count);
2595            if actual != self.text {
2596                issues.push(ValidationIssue::TextMismatch {
2597                    expected: self.text.clone(),
2598                    actual,
2599                    start: self.start,
2600                    end: self.end,
2601                });
2602            }
2603        }
2604
2605        // 3. Confidence range (now enforced by the Confidence type, so this is a no-op)
2606
2607        // 4. Type consistency
2608        if let EntityType::Custom { ref name, .. } = self.entity_type {
2609            if name.is_empty() {
2610                issues.push(ValidationIssue::InvalidType {
2611                    reason: "Custom entity type has empty name".to_string(),
2612                });
2613            }
2614        }
2615
2616        // 5. Discontinuous span consistency
2617        if let Some(ref disc_span) = self.discontinuous_span {
2618            for (i, seg) in disc_span.segments().iter().enumerate() {
2619                if seg.start >= seg.end {
2620                    issues.push(ValidationIssue::InvalidSpan {
2621                        start: seg.start,
2622                        end: seg.end,
2623                        reason: format!("discontinuous segment {} is invalid", i),
2624                    });
2625                }
2626                if seg.end > text_char_count {
2627                    issues.push(ValidationIssue::SpanOutOfBounds {
2628                        end: seg.end,
2629                        text_len: text_char_count,
2630                    });
2631                }
2632            }
2633        }
2634
2635        issues
2636    }
2637
2638    /// Check if this entity is valid against the source text.
2639    ///
2640    /// Convenience method that returns `true` if `validate()` returns empty.
2641    #[must_use]
2642    pub fn is_valid(&self, source_text: &str) -> bool {
2643        self.validate(source_text).is_empty()
2644    }
2645
2646    /// Validate a batch of entities efficiently.
2647    ///
2648    /// Returns a map of entity index -> validation issues.
2649    /// Only entities with issues are included.
2650    ///
2651    /// # Example
2652    ///
2653    /// ```rust
2654    /// use anno_core::{Entity, EntityType};
2655    ///
2656    /// let text = "John and Jane work at Apple";
2657    /// let entities = vec![
2658    ///     Entity::new("John", EntityType::Person, 0, 4, 0.95),
2659    ///     Entity::new("Wrong", EntityType::Person, 9, 13, 0.8),
2660    /// ];
2661    ///
2662    /// let issues = Entity::validate_batch(&entities, text);
2663    /// assert!(issues.is_empty() || issues.contains_key(&1)); // Second entity might fail
2664    /// ```
2665    #[must_use]
2666    pub fn validate_batch(
2667        entities: &[Entity],
2668        source_text: &str,
2669    ) -> std::collections::HashMap<usize, Vec<ValidationIssue>> {
2670        entities
2671            .iter()
2672            .enumerate()
2673            .filter_map(|(idx, entity)| {
2674                let issues = entity.validate(source_text);
2675                if issues.is_empty() {
2676                    None
2677                } else {
2678                    Some((idx, issues))
2679                }
2680            })
2681            .collect()
2682    }
2683}
2684
/// Validation issue found during entity validation.
///
/// Values are produced by `Entity::validate` / `Entity::validate_with_len`;
/// each variant carries the data needed to render a diagnostic message.
#[derive(Debug, Clone, PartialEq)]
pub enum ValidationIssue {
    /// Span bounds are invalid (start >= end).
    ///
    /// Reported both for the entity's main span and for individual
    /// segments of a discontinuous span.
    InvalidSpan {
        /// Start position of the invalid span.
        start: usize,
        /// End position of the invalid span.
        end: usize,
        /// Description of why the span is invalid.
        reason: String,
    },
    /// Span extends beyond text length.
    SpanOutOfBounds {
        /// End position that exceeds the text.
        end: usize,
        /// Actual length of the text, as a character count.
        text_len: usize,
    },
    /// Entity text doesn't match the span in source.
    TextMismatch {
        /// Text stored in the entity.
        expected: String,
        /// Text found at the span in source.
        actual: String,
        /// Start position of the span.
        start: usize,
        /// End position of the span.
        end: usize,
    },
    /// Confidence is outside [0.0, 1.0].
    ///
    /// NOTE(review): `Entity::validate_with_len` no longer emits this
    /// variant; its comment says the range is enforced by the `Confidence`
    /// type. Presumably kept for API compatibility; confirm before removing.
    InvalidConfidence {
        /// The invalid confidence value.
        value: f64,
    },
    /// Entity type is invalid.
    InvalidType {
        /// Description of why the type is invalid.
        reason: String,
    },
}
2726
2727impl std::fmt::Display for ValidationIssue {
2728    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
2729        match self {
2730            ValidationIssue::InvalidSpan { start, end, reason } => {
2731                write!(f, "Invalid span [{}, {}): {}", start, end, reason)
2732            }
2733            ValidationIssue::SpanOutOfBounds { end, text_len } => {
2734                write!(f, "Span end {} exceeds text length {}", end, text_len)
2735            }
2736            ValidationIssue::TextMismatch {
2737                expected,
2738                actual,
2739                start,
2740                end,
2741            } => {
2742                write!(
2743                    f,
2744                    "Text mismatch at [{}, {}): expected '{}', got '{}'",
2745                    start, end, expected, actual
2746                )
2747            }
2748            ValidationIssue::InvalidConfidence { value } => {
2749                write!(f, "Confidence {} outside [0.0, 1.0]", value)
2750            }
2751            ValidationIssue::InvalidType { reason } => {
2752                write!(f, "Invalid entity type: {}", reason)
2753            }
2754        }
2755    }
2756}
2757
/// Fluent builder for constructing entities with optional fields.
///
/// Create one with `EntityBuilder::new` (or `Entity::builder`), chain the
/// setters for the fields you need, then call `build()` to obtain the
/// [`Entity`]. Fields that are never set stay `None` in the built entity.
///
/// # Example
///
/// ```rust
/// use anno_core::{Entity, EntityType, Provenance};
///
/// let entity = Entity::builder("Marie Curie", EntityType::Person)
///     .span(0, 11)
///     .confidence(0.95)
///     .kb_id("Q7186")
///     .provenance(Provenance::ml("bert", 0.95))
///     .build();
/// ```
#[derive(Debug, Clone)]
pub struct EntityBuilder {
    // Surface text of the entity.
    text: String,
    // Type label assigned to the entity.
    entity_type: EntityType,
    // Span start offset; `new` initialises it to 0.
    start: usize,
    // Span end offset; `new` initialises it to the text's character count.
    end: usize,
    // Scalar confidence; `new` initialises it to `Confidence::ONE`.
    confidence: Confidence,
    // Optional normalized form of the text.
    normalized: Option<String>,
    // Optional provenance record.
    provenance: Option<Provenance>,
    // Optional external knowledge-base identifier.
    kb_id: Option<String>,
    // Optional canonical (coreference) identifier.
    canonical_id: Option<super::types::CanonicalId>,
    // Optional hierarchical confidence scores.
    hierarchical_confidence: Option<HierarchicalConfidence>,
    // Optional visual span.
    visual_span: Option<Span>,
    // Optional discontinuous span for non-contiguous entities.
    discontinuous_span: Option<DiscontinuousSpan>,
    // Optional start of the temporal validity window.
    valid_from: Option<chrono::DateTime<chrono::Utc>>,
    // Optional end of the temporal validity window.
    valid_until: Option<chrono::DateTime<chrono::Utc>>,
    // Optional viewport context.
    viewport: Option<EntityViewport>,
    // Optional phi-features (person, number, gender).
    phi_features: Option<PhiFeatures>,
    // Optional mention type classification.
    mention_type: Option<MentionType>,
}
2792
2793impl EntityBuilder {
2794    /// Create a new builder.
2795    #[must_use]
2796    pub fn new(text: impl Into<String>, entity_type: EntityType) -> Self {
2797        let text = text.into();
2798        let end = text.chars().count();
2799        Self {
2800            text,
2801            entity_type,
2802            start: 0,
2803            end,
2804            confidence: Confidence::ONE,
2805            normalized: None,
2806            provenance: None,
2807            kb_id: None,
2808            canonical_id: None,
2809            hierarchical_confidence: None,
2810            visual_span: None,
2811            discontinuous_span: None,
2812            valid_from: None,
2813            valid_until: None,
2814            viewport: None,
2815            phi_features: None,
2816            mention_type: None,
2817        }
2818    }
2819
2820    /// Set span offsets.
2821    #[must_use]
2822    pub const fn span(mut self, start: usize, end: usize) -> Self {
2823        self.start = start;
2824        self.end = end;
2825        self
2826    }
2827
2828    /// Set confidence score.
2829    #[must_use]
2830    pub fn confidence(mut self, confidence: impl Into<Confidence>) -> Self {
2831        self.confidence = confidence.into();
2832        self
2833    }
2834
2835    /// Set hierarchical confidence.
2836    #[must_use]
2837    pub fn hierarchical_confidence(mut self, confidence: HierarchicalConfidence) -> Self {
2838        self.confidence = Confidence::new(confidence.as_f64());
2839        self.hierarchical_confidence = Some(confidence);
2840        self
2841    }
2842
2843    /// Set normalized form.
2844    #[must_use]
2845    pub fn normalized(mut self, normalized: impl Into<String>) -> Self {
2846        self.normalized = Some(normalized.into());
2847        self
2848    }
2849
2850    /// Set provenance.
2851    #[must_use]
2852    pub fn provenance(mut self, provenance: Provenance) -> Self {
2853        self.provenance = Some(provenance);
2854        self
2855    }
2856
2857    /// Set knowledge base ID.
2858    #[must_use]
2859    pub fn kb_id(mut self, kb_id: impl Into<String>) -> Self {
2860        self.kb_id = Some(kb_id.into());
2861        self
2862    }
2863
2864    /// Set canonical (coreference) ID.
2865    #[must_use]
2866    pub const fn canonical_id(mut self, canonical_id: u64) -> Self {
2867        self.canonical_id = Some(super::types::CanonicalId::new(canonical_id));
2868        self
2869    }
2870
2871    /// Set visual span.
2872    #[must_use]
2873    pub fn visual_span(mut self, span: Span) -> Self {
2874        self.visual_span = Some(span);
2875        self
2876    }
2877
2878    /// Set discontinuous span for non-contiguous entities.
2879    ///
2880    /// This automatically updates `start` and `end` to the bounding range.
2881    #[must_use]
2882    pub fn discontinuous_span(mut self, span: DiscontinuousSpan) -> Self {
2883        // Update start/end to bounding range
2884        if let Some(bounding) = span.bounding_range() {
2885            self.start = bounding.start;
2886            self.end = bounding.end;
2887        }
2888        self.discontinuous_span = Some(span);
2889        self
2890    }
2891
2892    /// Set temporal validity start (when this entity assertion became true).
2893    ///
2894    /// # Example
2895    /// ```rust
2896    /// use anno_core::{EntityBuilder, EntityType};
2897    /// use chrono::{TimeZone, Utc};
2898    ///
2899    /// let entity = EntityBuilder::new("CEO of Microsoft", EntityType::Person)
2900    ///     .span(0, 12)
2901    ///     .valid_from(Utc.with_ymd_and_hms(2008, 10, 1, 0, 0, 0).unwrap())
2902    ///     .build();
2903    /// assert!(entity.valid_from.is_some());
2904    /// ```
2905    #[must_use]
2906    pub fn valid_from(mut self, dt: chrono::DateTime<chrono::Utc>) -> Self {
2907        self.valid_from = Some(dt);
2908        self
2909    }
2910
2911    /// Set temporal validity end (when this entity assertion stopped being true).
2912    #[must_use]
2913    pub fn valid_until(mut self, dt: chrono::DateTime<chrono::Utc>) -> Self {
2914        self.valid_until = Some(dt);
2915        self
2916    }
2917
2918    /// Set temporal validity range (convenience method).
2919    #[must_use]
2920    pub fn temporal_range(
2921        mut self,
2922        from: chrono::DateTime<chrono::Utc>,
2923        until: chrono::DateTime<chrono::Utc>,
2924    ) -> Self {
2925        self.valid_from = Some(from);
2926        self.valid_until = Some(until);
2927        self
2928    }
2929
2930    /// Set the viewport context for multi-faceted entity representation.
2931    ///
2932    /// # Example
2933    /// ```rust
2934    /// use anno_core::{EntityBuilder, EntityType, EntityViewport};
2935    ///
2936    /// let entity = EntityBuilder::new("Marie Curie", EntityType::Person)
2937    ///     .span(0, 11)
2938    ///     .viewport(EntityViewport::Academic)
2939    ///     .build();
2940    /// assert_eq!(entity.viewport, Some(EntityViewport::Academic));
2941    /// ```
2942    #[must_use]
2943    pub fn viewport(mut self, viewport: EntityViewport) -> Self {
2944        self.viewport = Some(viewport);
2945        self
2946    }
2947
2948    /// Set phi-features (person, number, gender) for morphological agreement.
2949    #[must_use]
2950    pub fn phi_features(mut self, phi_features: PhiFeatures) -> Self {
2951        self.phi_features = Some(phi_features);
2952        self
2953    }
2954
2955    /// Set mention type classification.
2956    #[must_use]
2957    pub fn mention_type(mut self, mention_type: MentionType) -> Self {
2958        self.mention_type = Some(mention_type);
2959        self
2960    }
2961
2962    /// Build the entity.
2963    #[must_use]
2964    pub fn build(self) -> Entity {
2965        Entity {
2966            text: self.text,
2967            entity_type: self.entity_type,
2968            start: self.start,
2969            end: self.end,
2970            confidence: self.confidence,
2971            normalized: self.normalized,
2972            provenance: self.provenance,
2973            kb_id: self.kb_id,
2974            canonical_id: self.canonical_id,
2975            hierarchical_confidence: self.hierarchical_confidence,
2976            visual_span: self.visual_span,
2977            discontinuous_span: self.discontinuous_span,
2978            valid_from: self.valid_from,
2979            valid_until: self.valid_until,
2980            viewport: self.viewport,
2981            phi_features: self.phi_features,
2982            mention_type: self.mention_type,
2983        }
2984    }
2985}
2986
2987// ============================================================================
2988// Relation (for Knowledge Graph Construction)
2989// ============================================================================
2990
/// A relation between two entities, forming a knowledge graph triple.
///
/// In the GLiNER bi-encoder paradigm, relations are detected just like entities:
/// the relation trigger text ("CEO of", "located in") is matched against
/// relation type labels in the same latent space.
///
/// # Structure
///
/// ```text
/// Triple: (Head, Relation, Tail)
///
/// "Marie Curie worked at the Sorbonne"
///  ^^^^^^^^^^^ ~~~~~~~~~ ^^^^^^^^
///  Head        Rel       Tail
///  (Person)  (Employment)  (Organization)
/// ```
///
/// # TPLinker/Joint Extraction
///
/// For joint extraction, relations are extracted in a single pass with entities.
/// The `trigger_span` captures the text that indicates the relation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Relation {
    /// The source entity (head of the triple)
    pub head: Entity,
    /// The target entity (tail of the triple)
    pub tail: Entity,
    /// Relation type label (e.g., "EMPLOYMENT", "LOCATED_IN", "FOUNDED_BY")
    pub relation_type: String,
    /// Optional trigger span: the `(start, end)` offsets of the text that
    /// indicates this relation. For "CEO of", this would be the span
    /// covering "CEO of". `None` when no trigger was recorded.
    // NOTE(review): presumably the same offset unit as `Entity::start`/`end` - confirm.
    pub trigger_span: Option<(usize, usize)>,
    /// Confidence score for this relation (0.0-1.0).
    pub confidence: Confidence,
}
3026
3027impl Relation {
3028    /// Create a new relation between two entities.
3029    #[must_use]
3030    pub fn new(
3031        head: Entity,
3032        tail: Entity,
3033        relation_type: impl Into<String>,
3034        confidence: impl Into<Confidence>,
3035    ) -> Self {
3036        Self {
3037            head,
3038            tail,
3039            relation_type: relation_type.into(),
3040            trigger_span: None,
3041            confidence: confidence.into(),
3042        }
3043    }
3044
3045    /// Create a relation with an explicit trigger span.
3046    #[must_use]
3047    pub fn with_trigger(
3048        head: Entity,
3049        tail: Entity,
3050        relation_type: impl Into<String>,
3051        trigger_start: usize,
3052        trigger_end: usize,
3053        confidence: impl Into<Confidence>,
3054    ) -> Self {
3055        Self {
3056            head,
3057            tail,
3058            relation_type: relation_type.into(),
3059            trigger_span: Some((trigger_start, trigger_end)),
3060            confidence: confidence.into(),
3061        }
3062    }
3063
3064    /// Convert to a triple string representation (for debugging/display).
3065    #[must_use]
3066    pub fn as_triple(&self) -> String {
3067        format!(
3068            "({}, {}, {})",
3069            self.head.text, self.relation_type, self.tail.text
3070        )
3071    }
3072
3073    /// Check if the head and tail entities are adjacent (within n tokens).
3074    /// Useful for filtering spurious long-distance relations.
3075    #[must_use]
3076    pub fn span_distance(&self) -> usize {
3077        if self.head.end <= self.tail.start {
3078            self.tail.start.saturating_sub(self.head.end)
3079        } else if self.tail.end <= self.head.start {
3080            self.head.start.saturating_sub(self.tail.end)
3081        } else {
3082            0 // Overlapping spans
3083        }
3084    }
3085}
3086
3087#[cfg(test)]
3088mod tests {
3089    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in test code
3090    use super::*;
3091
3092    #[test]
3093    fn test_entity_type_roundtrip() {
3094        let types = [
3095            EntityType::Person,
3096            EntityType::Organization,
3097            EntityType::Location,
3098            EntityType::Date,
3099            EntityType::Money,
3100            EntityType::Percent,
3101        ];
3102
3103        for t in types {
3104            let label = t.as_label();
3105            let parsed = EntityType::from_label(label);
3106            assert_eq!(t, parsed);
3107        }
3108    }
3109
3110    #[test]
3111    fn test_entity_overlap() {
3112        let e1 = Entity::new("John", EntityType::Person, 0, 4, 0.9);
3113        let e2 = Entity::new("Smith", EntityType::Person, 5, 10, 0.9);
3114        let e3 = Entity::new("John Smith", EntityType::Person, 0, 10, 0.9);
3115
3116        assert!(!e1.overlaps(&e2)); // No overlap
3117        assert!(e1.overlaps(&e3)); // e1 is contained in e3
3118        assert!(e3.overlaps(&e2)); // e3 contains e2
3119    }
3120
3121    #[test]
3122    fn test_confidence_clamping() {
3123        let e1 = Entity::new("test", EntityType::Person, 0, 4, 1.5);
3124        assert!((e1.confidence - 1.0).abs() < f64::EPSILON);
3125
3126        let e2 = Entity::new("test", EntityType::Person, 0, 4, -0.5);
3127        assert!(e2.confidence.abs() < f64::EPSILON);
3128    }
3129
3130    #[test]
3131    fn test_entity_categories() {
3132        // Agent/Org/Place entities require ML
3133        assert_eq!(EntityType::Person.category(), EntityCategory::Agent);
3134        assert_eq!(
3135            EntityType::Organization.category(),
3136            EntityCategory::Organization
3137        );
3138        assert_eq!(EntityType::Location.category(), EntityCategory::Place);
3139        assert!(EntityType::Person.requires_ml());
3140        assert!(!EntityType::Person.pattern_detectable());
3141
3142        // Temporal entities are pattern-detectable
3143        assert_eq!(EntityType::Date.category(), EntityCategory::Temporal);
3144        assert_eq!(EntityType::Time.category(), EntityCategory::Temporal);
3145        assert!(EntityType::Date.pattern_detectable());
3146        assert!(!EntityType::Date.requires_ml());
3147
3148        // Numeric entities are pattern-detectable
3149        assert_eq!(EntityType::Money.category(), EntityCategory::Numeric);
3150        assert_eq!(EntityType::Percent.category(), EntityCategory::Numeric);
3151        assert!(EntityType::Money.pattern_detectable());
3152
3153        // Contact entities are pattern-detectable
3154        assert_eq!(EntityType::Email.category(), EntityCategory::Contact);
3155        assert_eq!(EntityType::Url.category(), EntityCategory::Contact);
3156        assert_eq!(EntityType::Phone.category(), EntityCategory::Contact);
3157        assert!(EntityType::Email.pattern_detectable());
3158    }
3159
3160    #[test]
3161    fn test_new_types_roundtrip() {
3162        let types = [
3163            EntityType::Time,
3164            EntityType::Email,
3165            EntityType::Url,
3166            EntityType::Phone,
3167            EntityType::Quantity,
3168            EntityType::Cardinal,
3169            EntityType::Ordinal,
3170        ];
3171
3172        for t in types {
3173            let label = t.as_label();
3174            let parsed = EntityType::from_label(label);
3175            assert_eq!(t, parsed, "Roundtrip failed for {}", label);
3176        }
3177    }
3178
3179    #[test]
3180    fn test_custom_entity_type() {
3181        let disease = EntityType::custom("DISEASE", EntityCategory::Agent);
3182        assert_eq!(disease.as_label(), "DISEASE");
3183        assert!(disease.requires_ml());
3184
3185        let product_id = EntityType::custom("PRODUCT_ID", EntityCategory::Misc);
3186        assert_eq!(product_id.as_label(), "PRODUCT_ID");
3187        assert!(!product_id.requires_ml());
3188        assert!(!product_id.pattern_detectable());
3189    }
3190
3191    #[test]
3192    fn test_entity_normalization() {
3193        let mut e = Entity::new("Jan 15", EntityType::Date, 0, 6, 0.95);
3194        assert!(e.normalized.is_none());
3195        assert_eq!(e.normalized_or_text(), "Jan 15");
3196
3197        e.set_normalized("2024-01-15");
3198        assert_eq!(e.normalized.as_deref(), Some("2024-01-15"));
3199        assert_eq!(e.normalized_or_text(), "2024-01-15");
3200    }
3201
3202    #[test]
3203    fn test_entity_helpers() {
3204        let named = Entity::new("John", EntityType::Person, 0, 4, 0.9);
3205        assert!(named.is_named());
3206        assert!(!named.is_structured());
3207        assert_eq!(named.category(), EntityCategory::Agent);
3208
3209        let structured = Entity::new("$100", EntityType::Money, 0, 4, 0.95);
3210        assert!(!structured.is_named());
3211        assert!(structured.is_structured());
3212        assert_eq!(structured.category(), EntityCategory::Numeric);
3213    }
3214
3215    #[test]
3216    fn test_knowledge_linking() {
3217        let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
3218        assert!(!entity.is_linked());
3219        assert!(!entity.has_coreference());
3220
3221        entity.link_to_kb("Q7186"); // Wikidata ID
3222        assert!(entity.is_linked());
3223        assert_eq!(entity.kb_id.as_deref(), Some("Q7186"));
3224
3225        entity.set_canonical(42);
3226        assert!(entity.has_coreference());
3227        assert_eq!(
3228            entity.canonical_id,
3229            Some(crate::core::types::CanonicalId::new(42))
3230        );
3231    }
3232
3233    #[test]
3234    fn test_relation_creation() {
3235        let head = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
3236        let tail = Entity::new("Sorbonne", EntityType::Organization, 24, 32, 0.90);
3237
3238        let relation = Relation::new(head.clone(), tail.clone(), "WORKED_AT", 0.85);
3239        assert_eq!(relation.relation_type, "WORKED_AT");
3240        assert_eq!(relation.as_triple(), "(Marie Curie, WORKED_AT, Sorbonne)");
3241        assert!(relation.trigger_span.is_none());
3242
3243        // With trigger span
3244        let relation2 = Relation::with_trigger(head, tail, "EMPLOYMENT", 13, 19, 0.85);
3245        assert_eq!(relation2.trigger_span, Some((13, 19)));
3246    }
3247
3248    #[test]
3249    fn test_relation_span_distance() {
3250        // Head at 0-11, tail at 24-32 -> distance is 24-11 = 13
3251        let head = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
3252        let tail = Entity::new("Sorbonne", EntityType::Organization, 24, 32, 0.90);
3253        let relation = Relation::new(head, tail, "WORKED_AT", 0.85);
3254        assert_eq!(relation.span_distance(), 13);
3255    }
3256
3257    #[test]
3258    fn test_relation_category() {
3259        // Relation types should be categorized as Relation
3260        let rel_type = EntityType::custom("CEO_OF", EntityCategory::Relation);
3261        assert_eq!(rel_type.category(), EntityCategory::Relation);
3262        assert!(rel_type.category().is_relation());
3263        assert!(rel_type.requires_ml()); // Relations require ML
3264    }
3265
3266    // ========================================================================
3267    // Span Tests
3268    // ========================================================================
3269
3270    #[test]
3271    fn test_span_text() {
3272        let span = Span::text(10, 20);
3273        assert!(span.is_text());
3274        assert!(!span.is_visual());
3275        assert_eq!(span.text_offsets(), Some((10, 20)));
3276        assert_eq!(span.len(), 10);
3277        assert!(!span.is_empty());
3278    }
3279
3280    #[test]
3281    fn test_span_bbox() {
3282        let span = Span::bbox(0.1, 0.2, 0.3, 0.4);
3283        assert!(!span.is_text());
3284        assert!(span.is_visual());
3285        assert_eq!(span.text_offsets(), None);
3286        assert_eq!(span.len(), 0); // No text length
3287    }
3288
3289    #[test]
3290    fn test_span_bbox_with_page() {
3291        let span = Span::bbox_on_page(0.1, 0.2, 0.3, 0.4, 5);
3292        if let Span::BoundingBox { page, .. } = span {
3293            assert_eq!(page, Some(5));
3294        } else {
3295            panic!("Expected BoundingBox");
3296        }
3297    }
3298
3299    #[test]
3300    fn test_span_hybrid() {
3301        let bbox = Span::bbox(0.1, 0.2, 0.3, 0.4);
3302        let hybrid = Span::Hybrid {
3303            start: 10,
3304            end: 20,
3305            bbox: Box::new(bbox),
3306        };
3307        assert!(hybrid.is_text());
3308        assert!(hybrid.is_visual());
3309        assert_eq!(hybrid.text_offsets(), Some((10, 20)));
3310        assert_eq!(hybrid.len(), 10);
3311    }
3312
3313    // ========================================================================
3314    // Hierarchical Confidence Tests
3315    // ========================================================================
3316
3317    #[test]
3318    fn test_hierarchical_confidence_new() {
3319        let hc = HierarchicalConfidence::new(0.9, 0.8, 0.7);
3320        assert!((hc.linkage - 0.9).abs() < f64::EPSILON);
3321        assert!((hc.type_score - 0.8).abs() < f64::EPSILON);
3322        assert!((hc.boundary - 0.7).abs() < f64::EPSILON);
3323    }
3324
3325    #[test]
3326    fn test_hierarchical_confidence_clamping() {
3327        let hc = HierarchicalConfidence::new(1.5, -0.5, 0.5);
3328        assert_eq!(hc.linkage, 1.0);
3329        assert_eq!(hc.type_score, 0.0);
3330        assert_eq!(hc.boundary, 0.5);
3331    }
3332
3333    #[test]
3334    fn test_hierarchical_confidence_from_single() {
3335        let hc = HierarchicalConfidence::from_single(0.8);
3336        assert!((hc.linkage - 0.8).abs() < f64::EPSILON);
3337        assert!((hc.type_score - 0.8).abs() < f64::EPSILON);
3338        assert!((hc.boundary - 0.8).abs() < f64::EPSILON);
3339    }
3340
3341    #[test]
3342    fn test_hierarchical_confidence_combined() {
3343        let hc = HierarchicalConfidence::new(1.0, 1.0, 1.0);
3344        assert!((hc.combined() - 1.0).abs() < f64::EPSILON);
3345
3346        let hc2 = HierarchicalConfidence::new(0.8, 0.8, 0.8);
3347        assert!((hc2.combined() - 0.8).abs() < 0.001);
3348
3349        // Geometric mean: (0.5 * 0.5 * 0.5)^(1/3) = 0.5
3350        let hc3 = HierarchicalConfidence::new(0.5, 0.5, 0.5);
3351        assert!((hc3.combined() - 0.5).abs() < 0.001);
3352    }
3353
3354    #[test]
3355    fn test_hierarchical_confidence_threshold() {
3356        let hc = HierarchicalConfidence::new(0.9, 0.8, 0.7);
3357        assert!(hc.passes_threshold(0.5, 0.5, 0.5));
3358        assert!(hc.passes_threshold(0.9, 0.8, 0.7));
3359        assert!(!hc.passes_threshold(0.95, 0.8, 0.7)); // linkage too high
3360        assert!(!hc.passes_threshold(0.9, 0.85, 0.7)); // type too high
3361    }
3362
3363    #[test]
3364    fn test_hierarchical_confidence_from_f64() {
3365        let hc: HierarchicalConfidence = 0.85_f64.into();
3366        assert!((hc.linkage - 0.85).abs() < 0.001);
3367    }
3368
3369    // ========================================================================
3370    // RaggedBatch Tests
3371    // ========================================================================
3372
3373    #[test]
3374    fn test_ragged_batch_from_sequences() {
3375        let seqs = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
3376        let batch = RaggedBatch::from_sequences(&seqs);
3377
3378        assert_eq!(batch.batch_size(), 3);
3379        assert_eq!(batch.total_tokens(), 9);
3380        assert_eq!(batch.max_seq_len, 4);
3381        assert_eq!(batch.cumulative_offsets, vec![0, 3, 5, 9]);
3382    }
3383
3384    #[test]
3385    fn test_ragged_batch_doc_range() {
3386        let seqs = vec![vec![1, 2, 3], vec![4, 5]];
3387        let batch = RaggedBatch::from_sequences(&seqs);
3388
3389        assert_eq!(batch.doc_range(0), Some(0..3));
3390        assert_eq!(batch.doc_range(1), Some(3..5));
3391        assert_eq!(batch.doc_range(2), None);
3392    }
3393
3394    #[test]
3395    fn test_ragged_batch_doc_tokens() {
3396        let seqs = vec![vec![1, 2, 3], vec![4, 5]];
3397        let batch = RaggedBatch::from_sequences(&seqs);
3398
3399        assert_eq!(batch.doc_tokens(0), Some(&[1, 2, 3][..]));
3400        assert_eq!(batch.doc_tokens(1), Some(&[4, 5][..]));
3401    }
3402
3403    #[test]
3404    fn test_ragged_batch_padding_savings() {
3405        // 3 docs: [3, 2, 4] tokens, max = 4
3406        // Padded: 3 * 4 = 12, actual: 9
3407        // Savings: 1 - 9/12 = 0.25
3408        let seqs = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
3409        let batch = RaggedBatch::from_sequences(&seqs);
3410        let savings = batch.padding_savings();
3411        assert!((savings - 0.25).abs() < 0.001);
3412    }
3413
3414    // ========================================================================
3415    // SpanCandidate Tests
3416    // ========================================================================
3417
3418    #[test]
3419    fn test_span_candidate() {
3420        let sc = SpanCandidate::new(0, 5, 10);
3421        assert_eq!(sc.doc_idx, 0);
3422        assert_eq!(sc.start, 5);
3423        assert_eq!(sc.end, 10);
3424        assert_eq!(sc.width(), 5);
3425    }
3426
3427    #[test]
3428    fn test_generate_span_candidates() {
3429        let seqs = vec![vec![1, 2, 3]]; // doc with 3 tokens
3430        let batch = RaggedBatch::from_sequences(&seqs);
3431        let candidates = generate_span_candidates(&batch, 2);
3432
3433        // With max_width=2: [0,1], [1,2], [2,3], [0,2], [1,3]
3434        // = spans: (0,1), (0,2), (1,2), (1,3), (2,3)
3435        assert_eq!(candidates.len(), 5);
3436
3437        // Verify all candidates are valid
3438        for c in &candidates {
3439            assert_eq!(c.doc_idx, 0);
3440            assert!(c.end as usize <= 3);
3441            assert!(c.width() as usize <= 2);
3442        }
3443    }
3444
3445    #[test]
3446    fn test_generate_filtered_candidates() {
3447        let seqs = vec![vec![1, 2, 3]];
3448        let batch = RaggedBatch::from_sequences(&seqs);
3449
3450        // With max_width=2, we have 5 candidates
3451        // Set mask: only first 2 pass threshold
3452        let mask = vec![0.9, 0.9, 0.1, 0.1, 0.1];
3453        let candidates = generate_filtered_candidates(&batch, 2, &mask, 0.5);
3454
3455        assert_eq!(candidates.len(), 2);
3456    }
3457
3458    // ========================================================================
3459    // EntityBuilder Tests
3460    // ========================================================================
3461
3462    #[test]
3463    fn test_entity_builder_basic() {
3464        let entity = Entity::builder("John", EntityType::Person)
3465            .span(0, 4)
3466            .confidence(0.95)
3467            .build();
3468
3469        assert_eq!(entity.text, "John");
3470        assert_eq!(entity.entity_type, EntityType::Person);
3471        assert_eq!(entity.start, 0);
3472        assert_eq!(entity.end, 4);
3473        assert!((entity.confidence - 0.95).abs() < f64::EPSILON);
3474    }
3475
3476    #[test]
3477    fn test_entity_builder_full() {
3478        let entity = Entity::builder("Marie Curie", EntityType::Person)
3479            .span(0, 11)
3480            .confidence(0.95)
3481            .kb_id("Q7186")
3482            .canonical_id(42)
3483            .normalized("Marie Salomea Skłodowska Curie")
3484            .provenance(Provenance::ml("bert", 0.95))
3485            .build();
3486
3487        assert_eq!(entity.text, "Marie Curie");
3488        assert_eq!(entity.kb_id.as_deref(), Some("Q7186"));
3489        assert_eq!(
3490            entity.canonical_id,
3491            Some(crate::core::types::CanonicalId::new(42))
3492        );
3493        assert_eq!(
3494            entity.normalized.as_deref(),
3495            Some("Marie Salomea Skłodowska Curie")
3496        );
3497        assert!(entity.provenance.is_some());
3498    }
3499
3500    #[test]
3501    fn test_entity_builder_hierarchical() {
3502        let hc = HierarchicalConfidence::new(0.9, 0.8, 0.7);
3503        let entity = Entity::builder("test", EntityType::Person)
3504            .span(0, 4)
3505            .hierarchical_confidence(hc)
3506            .build();
3507
3508        assert!(entity.hierarchical_confidence.is_some());
3509        assert!((entity.linkage_confidence() - 0.9).abs() < 0.001);
3510        assert!((entity.type_confidence() - 0.8).abs() < 0.001);
3511        assert!((entity.boundary_confidence() - 0.7).abs() < 0.001);
3512    }
3513
3514    #[test]
3515    fn test_entity_builder_visual() {
3516        let bbox = Span::bbox(0.1, 0.2, 0.3, 0.4);
3517        let entity = Entity::builder("receipt item", EntityType::Money)
3518            .visual_span(bbox)
3519            .confidence(0.9)
3520            .build();
3521
3522        assert!(entity.is_visual());
3523        assert!(entity.visual_span.is_some());
3524    }
3525
3526    // ========================================================================
3527    // Entity Helper Method Tests
3528    // ========================================================================
3529
3530    #[test]
3531    fn test_entity_hierarchical_confidence_helpers() {
3532        let mut entity = Entity::new("test", EntityType::Person, 0, 4, 0.8);
3533
3534        // Without hierarchical confidence, falls back to main confidence
3535        assert!((entity.linkage_confidence() - 0.8).abs() < 0.001);
3536        assert!((entity.type_confidence() - 0.8).abs() < 0.001);
3537        assert!((entity.boundary_confidence() - 0.8).abs() < 0.001);
3538
3539        // Set hierarchical confidence
3540        entity.set_hierarchical_confidence(HierarchicalConfidence::new(0.95, 0.85, 0.75));
3541        assert!((entity.linkage_confidence() - 0.95).abs() < 0.001);
3542        assert!((entity.type_confidence() - 0.85).abs() < 0.001);
3543        assert!((entity.boundary_confidence() - 0.75).abs() < 0.001);
3544    }
3545
3546    #[test]
3547    fn test_entity_from_visual() {
3548        let entity = Entity::from_visual(
3549            "receipt total",
3550            EntityType::Money,
3551            Span::bbox(0.5, 0.8, 0.2, 0.05),
3552            0.92,
3553        );
3554
3555        assert!(entity.is_visual());
3556        assert_eq!(entity.start, 0);
3557        assert_eq!(entity.end, 0);
3558        assert!((entity.confidence - 0.92).abs() < f64::EPSILON);
3559    }
3560
3561    #[test]
3562    fn test_entity_span_helpers() {
3563        let entity = Entity::new("test", EntityType::Person, 10, 20, 0.9);
3564        assert_eq!(entity.text_span(), (10, 20));
3565        assert_eq!(entity.span_len(), 10);
3566    }
3567
3568    // ========================================================================
3569    // Provenance Tests
3570    // ========================================================================
3571
3572    #[test]
3573    fn test_provenance_pattern() {
3574        let prov = Provenance::pattern("EMAIL");
3575        assert_eq!(prov.method, ExtractionMethod::Pattern);
3576        assert_eq!(prov.pattern.as_deref(), Some("EMAIL"));
3577        assert_eq!(prov.raw_confidence, Some(Confidence::new(1.0))); // Patterns are deterministic
3578    }
3579
3580    #[test]
3581    fn test_provenance_ml() {
3582        let prov = Provenance::ml("bert-ner", 0.87);
3583        assert_eq!(prov.method, ExtractionMethod::Neural);
3584        assert_eq!(prov.source.as_ref(), "bert-ner");
3585        assert_eq!(prov.raw_confidence, Some(Confidence::new(0.87)));
3586    }
3587
3588    #[test]
3589    fn test_provenance_with_version() {
3590        let prov = Provenance::ml("gliner", 0.92).with_version("v2.1.0");
3591
3592        assert_eq!(prov.model_version.as_deref(), Some("v2.1.0"));
3593        assert_eq!(prov.source.as_ref(), "gliner");
3594    }
3595
3596    #[test]
3597    fn test_provenance_with_timestamp() {
3598        let prov = Provenance::pattern("DATE").with_timestamp("2024-01-15T10:30:00Z");
3599
3600        assert_eq!(prov.timestamp.as_deref(), Some("2024-01-15T10:30:00Z"));
3601    }
3602
3603    #[test]
3604    fn test_provenance_builder_chain() {
3605        let prov = Provenance::ml("modernbert-ner", 0.95)
3606            .with_version("v1.0.0")
3607            .with_timestamp("2024-11-27T12:00:00Z");
3608
3609        assert_eq!(prov.method, ExtractionMethod::Neural);
3610        assert_eq!(prov.source.as_ref(), "modernbert-ner");
3611        assert_eq!(prov.raw_confidence, Some(Confidence::new(0.95)));
3612        assert_eq!(prov.model_version.as_deref(), Some("v1.0.0"));
3613        assert_eq!(prov.timestamp.as_deref(), Some("2024-11-27T12:00:00Z"));
3614    }
3615
3616    #[test]
3617    fn test_provenance_serialization() {
3618        let prov = Provenance::ml("test", 0.9)
3619            .with_version("v1.0")
3620            .with_timestamp("2024-01-01");
3621
3622        let json = serde_json::to_string(&prov).unwrap();
3623        assert!(json.contains("model_version"));
3624        assert!(json.contains("v1.0"));
3625
3626        let restored: Provenance = serde_json::from_str(&json).unwrap();
3627        assert_eq!(restored.model_version.as_deref(), Some("v1.0"));
3628        assert_eq!(restored.timestamp.as_deref(), Some("2024-01-01"));
3629    }
3630}
3631
3632#[cfg(test)]
3633mod proptests {
3634    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in property tests
3635    use super::*;
3636    use proptest::prelude::*;
3637
3638    proptest! {
3639        #[test]
3640        fn confidence_always_clamped(conf in -10.0f64..10.0) {
3641            let e = Entity::new("test", EntityType::Person, 0, 4, conf);
3642            prop_assert!(e.confidence >= 0.0);
3643            prop_assert!(e.confidence <= 1.0);
3644        }
3645
3646        #[test]
3647        fn entity_type_roundtrip(label in "[A-Z]{3,10}") {
3648            let et = EntityType::from_label(&label);
3649            let back = EntityType::from_label(et.as_label());
3650            // Custom types may round-trip to themselves or normalize
3651            let is_custom = matches!(back, EntityType::Custom { .. });
3652            prop_assert!(is_custom || back == et);
3653        }
3654
3655        #[test]
3656        fn overlap_is_symmetric(
3657            s1 in 0usize..100,
3658            len1 in 1usize..50,
3659            s2 in 0usize..100,
3660            len2 in 1usize..50,
3661        ) {
3662            let e1 = Entity::new("a", EntityType::Person, s1, s1 + len1, 1.0);
3663            let e2 = Entity::new("b", EntityType::Person, s2, s2 + len2, 1.0);
3664            prop_assert_eq!(e1.overlaps(&e2), e2.overlaps(&e1));
3665        }
3666
3667        #[test]
3668        fn overlap_ratio_bounded(
3669            s1 in 0usize..100,
3670            len1 in 1usize..50,
3671            s2 in 0usize..100,
3672            len2 in 1usize..50,
3673        ) {
3674            let e1 = Entity::new("a", EntityType::Person, s1, s1 + len1, 1.0);
3675            let e2 = Entity::new("b", EntityType::Person, s2, s2 + len2, 1.0);
3676            let ratio = e1.overlap_ratio(&e2);
3677            prop_assert!(ratio >= 0.0);
3678            prop_assert!(ratio <= 1.0);
3679        }
3680
3681        #[test]
3682        fn self_overlap_ratio_is_one(s in 0usize..100, len in 1usize..50) {
3683            let e = Entity::new("test", EntityType::Person, s, s + len, 1.0);
3684            let ratio = e.overlap_ratio(&e);
3685            prop_assert!((ratio - 1.0).abs() < 1e-10);
3686        }
3687
3688        #[test]
3689        fn hierarchical_confidence_always_clamped(
3690            linkage in -2.0f32..2.0,
3691            type_score in -2.0f32..2.0,
3692            boundary in -2.0f32..2.0,
3693        ) {
3694            let hc = HierarchicalConfidence::new(linkage, type_score, boundary);
3695            prop_assert!(hc.linkage >= 0.0 && hc.linkage <= 1.0);
3696            prop_assert!(hc.type_score >= 0.0 && hc.type_score <= 1.0);
3697            prop_assert!(hc.boundary >= 0.0 && hc.boundary <= 1.0);
3698            prop_assert!(hc.combined() >= 0.0 && hc.combined() <= 1.0);
3699        }
3700
3701        #[test]
3702        fn span_candidate_width_consistent(
3703            doc in 0u32..10,
3704            start in 0u32..100,
3705            end in 1u32..100,
3706        ) {
3707            let actual_end = start.max(end);
3708            let sc = SpanCandidate::new(doc, start, actual_end);
3709            prop_assert_eq!(sc.width(), actual_end.saturating_sub(start));
3710        }
3711
3712        #[test]
3713        fn ragged_batch_preserves_tokens(
3714            seq_lens in proptest::collection::vec(1usize..10, 1..5),
3715        ) {
3716            // Create sequences with sequential token IDs
3717            let mut counter = 0u32;
3718            let seqs: Vec<Vec<u32>> = seq_lens.iter().map(|&len| {
3719                let seq: Vec<u32> = (counter..counter + len as u32).collect();
3720                counter += len as u32;
3721                seq
3722            }).collect();
3723
3724            let batch = RaggedBatch::from_sequences(&seqs);
3725
3726            // Verify batch properties
3727            prop_assert_eq!(batch.batch_size(), seqs.len());
3728            prop_assert_eq!(batch.total_tokens(), seq_lens.iter().sum::<usize>());
3729
3730            // Verify each doc can be retrieved correctly
3731            for (i, seq) in seqs.iter().enumerate() {
3732                let doc_tokens = batch.doc_tokens(i).unwrap();
3733                prop_assert_eq!(doc_tokens, seq.as_slice());
3734            }
3735        }
3736
3737        #[test]
3738        fn span_text_offsets_consistent(start in 0usize..100, len in 0usize..50) {
3739            let end = start + len;
3740            let span = Span::text(start, end);
3741            let (s, e) = span.text_offsets().unwrap();
3742            prop_assert_eq!(s, start);
3743            prop_assert_eq!(e, end);
3744            prop_assert_eq!(span.len(), len);
3745        }
3746
3747        // =================================================================
3748        // Property tests for core type invariants
3749        // =================================================================
3750
3751        /// Entity with start < end always passes the span validity check in validate().
3752        #[test]
3753        fn entity_span_validity(
3754            start in 0usize..10000,
3755            len in 1usize..500,
3756            conf in 0.0f64..=1.0,
3757        ) {
3758            let end = start + len;
3759            // Build a source text long enough to cover the span
3760            let text_content: String = "x".repeat(end);
3761            let entity_text: String = text_content.chars().skip(start).take(len).collect();
3762            let e = Entity::new(&entity_text, EntityType::Person, start, end, conf);
3763            let issues = e.validate(&text_content);
3764            // No InvalidSpan or SpanOutOfBounds issues
3765            for issue in &issues {
3766                match issue {
3767                    ValidationIssue::InvalidSpan { .. } => {
3768                        prop_assert!(false, "start < end should never produce InvalidSpan");
3769                    }
3770                    ValidationIssue::SpanOutOfBounds { .. } => {
3771                        prop_assert!(false, "span within text should never produce SpanOutOfBounds");
3772                    }
3773                    _ => {} // TextMismatch or others are fine to check separately
3774                }
3775            }
3776        }
3777
3778        /// EntityType::from_label(et.as_label()) == et for all standard (non-Custom, non-Other) types.
3779        #[test]
3780        fn entity_type_label_roundtrip_standard(
3781            idx in 0usize..13,
3782        ) {
3783            let standard_types = [
3784                EntityType::Person,
3785                EntityType::Organization,
3786                EntityType::Location,
3787                EntityType::Date,
3788                EntityType::Time,
3789                EntityType::Money,
3790                EntityType::Percent,
3791                EntityType::Quantity,
3792                EntityType::Cardinal,
3793                EntityType::Ordinal,
3794                EntityType::Email,
3795                EntityType::Url,
3796                EntityType::Phone,
3797            ];
3798            let et = &standard_types[idx];
3799            let label = et.as_label();
3800            let roundtripped = EntityType::from_label(label);
3801            prop_assert_eq!(&roundtripped, et,
3802                "from_label(as_label()) must roundtrip for {:?} (label={:?})", et, label);
3803        }
3804
3805        /// Span containment: if span A contains span B, then A.start <= B.start && A.end >= B.end.
3806        #[test]
3807        fn span_containment_property(
3808            a_start in 0usize..5000,
3809            a_len in 1usize..5000,
3810            b_offset in 0usize..5000,
3811            b_len in 1usize..5000,
3812        ) {
3813            let a_end = a_start + a_len;
3814            let b_start = a_start + (b_offset % a_len); // B starts within A
3815            let b_end_candidate = b_start + b_len;
3816
3817            // Only test the containment invariant when B is actually inside A
3818            if b_start >= a_start && b_end_candidate <= a_end {
3819                // B is contained in A
3820                prop_assert!(a_start <= b_start);
3821                prop_assert!(a_end >= b_end_candidate);
3822
3823                // Also verify via Entity overlap: A must overlap B if A contains B
3824                let ea = Entity::new("a", EntityType::Person, a_start, a_end, 1.0);
3825                let eb = Entity::new("b", EntityType::Person, b_start, b_end_candidate, 1.0);
3826                prop_assert!(ea.overlaps(&eb),
3827                    "containing span must overlap contained span");
3828            }
3829        }
3830
3831        /// Serde roundtrip preserves all fields of Entity.
3832        #[test]
3833        fn entity_serde_roundtrip(
3834            start in 0usize..10000,
3835            len in 1usize..500,
3836            conf in 0.0f64..=1.0,
3837            type_idx in 0usize..5,
3838        ) {
3839            let end = start + len;
3840            let types = [
3841                EntityType::Person,
3842                EntityType::Organization,
3843                EntityType::Location,
3844                EntityType::Date,
3845                EntityType::Email,
3846            ];
3847            let et = types[type_idx].clone();
3848            let text = format!("entity_{}", start);
3849            let e = Entity::new(&text, et, start, end, conf);
3850
3851            let json = serde_json::to_string(&e).unwrap();
3852            let e2: Entity = serde_json::from_str(&json).unwrap();
3853
3854            prop_assert_eq!(&e.text, &e2.text);
3855            prop_assert_eq!(&e.entity_type, &e2.entity_type);
3856            prop_assert_eq!(e.start, e2.start);
3857            prop_assert_eq!(e.end, e2.end);
3858            // f64 roundtrip through JSON: compare with tolerance
3859            prop_assert!((e.confidence - e2.confidence).abs() < 1e-10,
3860                "confidence roundtrip: {} vs {}", e.confidence, e2.confidence);
3861            prop_assert_eq!(&e.normalized, &e2.normalized);
3862            prop_assert_eq!(&e.kb_id, &e2.kb_id);
3863        }
3864
3865        /// DiscontinuousSpan: total_len() == sum of individual segment lengths.
3866        #[test]
3867        fn discontinuous_span_total_length(
3868            segments in proptest::collection::vec(
3869                (0usize..5000, 1usize..500),
3870                1..6
3871            ),
3872        ) {
3873            let ranges: Vec<std::ops::Range<usize>> = segments.iter()
3874                .map(|&(start, len)| start..start + len)
3875                .collect();
3876            let expected_sum: usize = ranges.iter().map(|r| r.end - r.start).sum();
3877            let span = DiscontinuousSpan::new(ranges);
3878            prop_assert_eq!(span.total_len(), expected_sum,
3879                "total_len must equal sum of segment lengths");
3880        }
3881    }
3882
3883    // ========================================================================
3884    // EntityViewport Tests
3885    // ========================================================================
3886
3887    #[test]
3888    fn test_entity_viewport_as_str() {
3889        assert_eq!(EntityViewport::Business.as_str(), "business");
3890        assert_eq!(EntityViewport::Legal.as_str(), "legal");
3891        assert_eq!(EntityViewport::Technical.as_str(), "technical");
3892        assert_eq!(EntityViewport::Academic.as_str(), "academic");
3893        assert_eq!(EntityViewport::Personal.as_str(), "personal");
3894        assert_eq!(EntityViewport::Political.as_str(), "political");
3895        assert_eq!(EntityViewport::Media.as_str(), "media");
3896        assert_eq!(EntityViewport::Historical.as_str(), "historical");
3897        assert_eq!(EntityViewport::General.as_str(), "general");
3898        assert_eq!(
3899            EntityViewport::Custom("custom".to_string()).as_str(),
3900            "custom"
3901        );
3902    }
3903
3904    #[test]
3905    fn test_entity_viewport_is_professional() {
3906        assert!(EntityViewport::Business.is_professional());
3907        assert!(EntityViewport::Legal.is_professional());
3908        assert!(EntityViewport::Technical.is_professional());
3909        assert!(EntityViewport::Academic.is_professional());
3910        assert!(EntityViewport::Political.is_professional());
3911
3912        assert!(!EntityViewport::Personal.is_professional());
3913        assert!(!EntityViewport::Media.is_professional());
3914        assert!(!EntityViewport::Historical.is_professional());
3915        assert!(!EntityViewport::General.is_professional());
3916        assert!(!EntityViewport::Custom("test".to_string()).is_professional());
3917    }
3918
3919    #[test]
3920    fn test_entity_viewport_from_str() {
3921        assert_eq!(
3922            "business".parse::<EntityViewport>().unwrap(),
3923            EntityViewport::Business
3924        );
3925        assert_eq!(
3926            "financial".parse::<EntityViewport>().unwrap(),
3927            EntityViewport::Business
3928        );
3929        assert_eq!(
3930            "corporate".parse::<EntityViewport>().unwrap(),
3931            EntityViewport::Business
3932        );
3933
3934        assert_eq!(
3935            "legal".parse::<EntityViewport>().unwrap(),
3936            EntityViewport::Legal
3937        );
3938        assert_eq!(
3939            "law".parse::<EntityViewport>().unwrap(),
3940            EntityViewport::Legal
3941        );
3942
3943        assert_eq!(
3944            "technical".parse::<EntityViewport>().unwrap(),
3945            EntityViewport::Technical
3946        );
3947        assert_eq!(
3948            "engineering".parse::<EntityViewport>().unwrap(),
3949            EntityViewport::Technical
3950        );
3951
3952        assert_eq!(
3953            "academic".parse::<EntityViewport>().unwrap(),
3954            EntityViewport::Academic
3955        );
3956        assert_eq!(
3957            "research".parse::<EntityViewport>().unwrap(),
3958            EntityViewport::Academic
3959        );
3960
3961        assert_eq!(
3962            "personal".parse::<EntityViewport>().unwrap(),
3963            EntityViewport::Personal
3964        );
3965        assert_eq!(
3966            "biographical".parse::<EntityViewport>().unwrap(),
3967            EntityViewport::Personal
3968        );
3969
3970        assert_eq!(
3971            "political".parse::<EntityViewport>().unwrap(),
3972            EntityViewport::Political
3973        );
3974        assert_eq!(
3975            "policy".parse::<EntityViewport>().unwrap(),
3976            EntityViewport::Political
3977        );
3978
3979        assert_eq!(
3980            "media".parse::<EntityViewport>().unwrap(),
3981            EntityViewport::Media
3982        );
3983        assert_eq!(
3984            "press".parse::<EntityViewport>().unwrap(),
3985            EntityViewport::Media
3986        );
3987
3988        assert_eq!(
3989            "historical".parse::<EntityViewport>().unwrap(),
3990            EntityViewport::Historical
3991        );
3992        assert_eq!(
3993            "history".parse::<EntityViewport>().unwrap(),
3994            EntityViewport::Historical
3995        );
3996
3997        assert_eq!(
3998            "general".parse::<EntityViewport>().unwrap(),
3999            EntityViewport::General
4000        );
4001        assert_eq!(
4002            "generic".parse::<EntityViewport>().unwrap(),
4003            EntityViewport::General
4004        );
4005        assert_eq!(
4006            "".parse::<EntityViewport>().unwrap(),
4007            EntityViewport::General
4008        );
4009
4010        // Custom viewport
4011        assert_eq!(
4012            "custom_viewport".parse::<EntityViewport>().unwrap(),
4013            EntityViewport::Custom("custom_viewport".to_string())
4014        );
4015    }
4016
4017    #[test]
4018    fn test_entity_viewport_from_str_case_insensitive() {
4019        assert_eq!(
4020            "BUSINESS".parse::<EntityViewport>().unwrap(),
4021            EntityViewport::Business
4022        );
4023        assert_eq!(
4024            "Business".parse::<EntityViewport>().unwrap(),
4025            EntityViewport::Business
4026        );
4027        assert_eq!(
4028            "BuSiNeSs".parse::<EntityViewport>().unwrap(),
4029            EntityViewport::Business
4030        );
4031    }
4032
4033    #[test]
4034    fn test_entity_viewport_display() {
4035        assert_eq!(format!("{}", EntityViewport::Business), "business");
4036        assert_eq!(format!("{}", EntityViewport::Academic), "academic");
4037        assert_eq!(
4038            format!("{}", EntityViewport::Custom("test".to_string())),
4039            "test"
4040        );
4041    }
4042
4043    #[test]
4044    fn test_entity_viewport_methods() {
4045        let mut entity = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.9);
4046
4047        // Initially no viewport
4048        assert!(!entity.has_viewport());
4049        assert_eq!(entity.viewport_or_default(), EntityViewport::General);
4050        assert!(entity.matches_viewport(&EntityViewport::Academic)); // No viewport matches any
4051
4052        // Set viewport
4053        entity.set_viewport(EntityViewport::Academic);
4054        assert!(entity.has_viewport());
4055        assert_eq!(entity.viewport_or_default(), EntityViewport::Academic);
4056        assert!(entity.matches_viewport(&EntityViewport::Academic));
4057        assert!(!entity.matches_viewport(&EntityViewport::Business));
4058    }
4059
4060    #[test]
4061    fn test_entity_builder_with_viewport() {
4062        let entity = Entity::builder("Marie Curie", EntityType::Person)
4063            .span(0, 11)
4064            .viewport(EntityViewport::Academic)
4065            .build();
4066
4067        assert_eq!(entity.viewport, Some(EntityViewport::Academic));
4068        assert!(entity.has_viewport());
4069    }
4070
4071    // ========================================================================
4072    // EntityCategory Tests
4073    // ========================================================================
4074
4075    #[test]
4076    fn test_entity_category_requires_ml() {
4077        assert!(EntityCategory::Agent.requires_ml());
4078        assert!(EntityCategory::Organization.requires_ml());
4079        assert!(EntityCategory::Place.requires_ml());
4080        assert!(EntityCategory::Creative.requires_ml());
4081        assert!(EntityCategory::Relation.requires_ml());
4082
4083        assert!(!EntityCategory::Temporal.requires_ml());
4084        assert!(!EntityCategory::Numeric.requires_ml());
4085        assert!(!EntityCategory::Contact.requires_ml());
4086        assert!(!EntityCategory::Misc.requires_ml());
4087    }
4088
4089    #[test]
4090    fn test_entity_category_pattern_detectable() {
4091        assert!(EntityCategory::Temporal.pattern_detectable());
4092        assert!(EntityCategory::Numeric.pattern_detectable());
4093        assert!(EntityCategory::Contact.pattern_detectable());
4094
4095        assert!(!EntityCategory::Agent.pattern_detectable());
4096        assert!(!EntityCategory::Organization.pattern_detectable());
4097        assert!(!EntityCategory::Place.pattern_detectable());
4098        assert!(!EntityCategory::Creative.pattern_detectable());
4099        assert!(!EntityCategory::Relation.pattern_detectable());
4100        assert!(!EntityCategory::Misc.pattern_detectable());
4101    }
4102
4103    #[test]
4104    fn test_entity_category_is_relation() {
4105        assert!(EntityCategory::Relation.is_relation());
4106
4107        assert!(!EntityCategory::Agent.is_relation());
4108        assert!(!EntityCategory::Organization.is_relation());
4109        assert!(!EntityCategory::Place.is_relation());
4110        assert!(!EntityCategory::Temporal.is_relation());
4111        assert!(!EntityCategory::Numeric.is_relation());
4112        assert!(!EntityCategory::Contact.is_relation());
4113        assert!(!EntityCategory::Creative.is_relation());
4114        assert!(!EntityCategory::Misc.is_relation());
4115    }
4116
4117    #[test]
4118    fn test_entity_category_as_str() {
4119        assert_eq!(EntityCategory::Agent.as_str(), "agent");
4120        assert_eq!(EntityCategory::Organization.as_str(), "organization");
4121        assert_eq!(EntityCategory::Place.as_str(), "place");
4122        assert_eq!(EntityCategory::Creative.as_str(), "creative");
4123        assert_eq!(EntityCategory::Temporal.as_str(), "temporal");
4124        assert_eq!(EntityCategory::Numeric.as_str(), "numeric");
4125        assert_eq!(EntityCategory::Contact.as_str(), "contact");
4126        assert_eq!(EntityCategory::Relation.as_str(), "relation");
4127        assert_eq!(EntityCategory::Misc.as_str(), "misc");
4128    }
4129
4130    #[test]
4131    fn test_entity_category_display() {
4132        assert_eq!(format!("{}", EntityCategory::Agent), "agent");
4133        assert_eq!(format!("{}", EntityCategory::Temporal), "temporal");
4134        assert_eq!(format!("{}", EntityCategory::Relation), "relation");
4135    }
4136
4137    // ========================================================================
4138    // EntityType serde tests (N20: flat string serialization)
4139    // ========================================================================
4140
4141    #[test]
4142    fn test_entity_type_serializes_to_flat_string() {
4143        assert_eq!(
4144            serde_json::to_string(&EntityType::Person).unwrap(),
4145            r#""PER""#
4146        );
4147        assert_eq!(
4148            serde_json::to_string(&EntityType::Organization).unwrap(),
4149            r#""ORG""#
4150        );
4151        assert_eq!(
4152            serde_json::to_string(&EntityType::Location).unwrap(),
4153            r#""LOC""#
4154        );
4155        assert_eq!(
4156            serde_json::to_string(&EntityType::Date).unwrap(),
4157            r#""DATE""#
4158        );
4159        assert_eq!(
4160            serde_json::to_string(&EntityType::Money).unwrap(),
4161            r#""MONEY""#
4162        );
4163    }
4164
4165    #[test]
4166    fn test_custom_entity_type_serializes_flat() {
4167        let misc = EntityType::custom("MISC", EntityCategory::Misc);
4168        assert_eq!(serde_json::to_string(&misc).unwrap(), r#""MISC""#);
4169
4170        let disease = EntityType::custom("DISEASE", EntityCategory::Agent);
4171        assert_eq!(serde_json::to_string(&disease).unwrap(), r#""DISEASE""#);
4172    }
4173
4174    #[test]
4175    fn test_entity_type_deserializes_from_flat_string() {
4176        let per: EntityType = serde_json::from_str(r#""PER""#).unwrap();
4177        assert_eq!(per, EntityType::Person);
4178
4179        let org: EntityType = serde_json::from_str(r#""ORG""#).unwrap();
4180        assert_eq!(org, EntityType::Organization);
4181
4182        let misc: EntityType = serde_json::from_str(r#""MISC""#).unwrap();
4183        assert_eq!(misc, EntityType::custom("MISC", EntityCategory::Misc));
4184    }
4185
4186    #[test]
4187    fn test_entity_type_deserializes_backward_compat_custom() {
4188        // Old format: {"Custom":{"name":"MISC","category":"Misc"}}
4189        let json = r#"{"Custom":{"name":"MISC","category":"Misc"}}"#;
4190        let et: EntityType = serde_json::from_str(json).unwrap();
4191        assert_eq!(et, EntityType::custom("MISC", EntityCategory::Misc));
4192    }
4193
4194    #[test]
4195    fn test_entity_type_deserializes_backward_compat_other() {
4196        // Old format: {"Other":"foo"} -- now routes to Custom with Misc category
4197        let json = r#"{"Other":"foo"}"#;
4198        let et: EntityType = serde_json::from_str(json).unwrap();
4199        assert_eq!(et, EntityType::custom("foo", EntityCategory::Misc));
4200    }
4201
4202    #[test]
4203    fn test_entity_type_serde_roundtrip() {
4204        let types = vec![
4205            EntityType::Person,
4206            EntityType::Organization,
4207            EntityType::Location,
4208            EntityType::Date,
4209            EntityType::Time,
4210            EntityType::Money,
4211            EntityType::Percent,
4212            EntityType::Quantity,
4213            EntityType::Cardinal,
4214            EntityType::Ordinal,
4215            EntityType::Email,
4216            EntityType::Url,
4217            EntityType::Phone,
4218            EntityType::custom("MISC", EntityCategory::Misc),
4219            EntityType::custom("DISEASE", EntityCategory::Agent),
4220        ];
4221
4222        for t in &types {
4223            let json = serde_json::to_string(t).unwrap();
4224            let back: EntityType = serde_json::from_str(&json).unwrap();
4225            // All variants roundtrip through from_label, so Custom types
4226            // survive as Custom (not as a built-in variant).
4227            assert_eq!(
4228                t.as_label(),
4229                back.as_label(),
4230                "roundtrip failed for {:?}",
4231                t
4232            );
4233        }
4234    }
4235}