anno_core/core/
entity.rs

1//! Entity types and structures for NER.
2//!
3//! # Design Philosophy (Research-Aligned)
4//!
5//! This module implements entity types informed by modern NER research:
6//!
7//! - **GLiNER/Bi-Encoder**: Entity types are *labels to match against*, not fixed classes.
8//!   Relations ("CEO of") are entities too - they're just labels in the same latent space.
9//!
10//! - **TPLinker/Joint Extraction**: Entities and relations can be extracted in a single pass.
11//!   The type system supports relation triggers as first-class mentions.
12//!
13//! - **Knowledge Graphs**: Entities can link to external knowledge bases (`kb_id`) for
14//!   coreference resolution and GraphRAG applications.
15//!
16//! # Type Hierarchy
17//!
18//! ```text
19//! Mention
20//! ├── Entity (single span)
21//! │   ├── Named (ML): Person, Organization, Location
22//! │   ├── Temporal (Pattern): Date, Time
23//! │   ├── Numeric (Pattern): Money, Percent, Quantity, Cardinal, Ordinal
24//! │   └── Contact (Pattern): Email, Url, Phone
25//! │
26//! └── Relation (connects entities)
27//!     └── Trigger text: "CEO of", "located in", "born on"
28//! ```
29//!
30//! # Design Principles
31//!
32//! 1. **Bi-encoder compatible**: Types are semantic labels, not fixed enums
33//! 2. **Joint extraction**: Relations are mentions with trigger spans
34//! 3. **Knowledge linking**: `kb_id` for connecting to external KBs
35//! 4. **Hierarchical confidence**: Coarse (linkage) + fine (type) scores
36//! 5. **Multi-modal ready**: Spans can be text offsets or visual bboxes
37
38use super::confidence::Confidence;
39use super::types::MentionType;
40use serde::{Deserialize, Serialize};
41use std::borrow::Cow;
42
43// ============================================================================
44// Entity Category (OntoNotes-inspired)
45// ============================================================================
46
47/// Category of entity based on detection characteristics and semantics.
48///
49/// Based on OntoNotes 5.0 categories with extensions for:
50/// - Structured data (Contact, patterns)
51/// - Knowledge graphs (Relation, for TPLinker/GLiNER joint extraction)
52#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
53#[non_exhaustive]
54pub enum EntityCategory {
55    /// Named entities for people/groups (ML-required).
56    /// Types: Person, NORP (nationalities/religious/political groups)
57    Agent,
58    /// Named entities for organizations/facilities (ML-required).
59    /// Types: Organization, Facility
60    Organization,
61    /// Named entities for places (ML-required).
62    /// Types: GPE (geo-political), Location (geographic)
63    Place,
64    /// Named entities for creative/conceptual (ML-required).
65    /// Types: Event, Product, WorkOfArt, Law, Language
66    Creative,
67    /// Temporal entities (pattern-detectable).
68    /// Types: Date, Time
69    Temporal,
70    /// Numeric entities (pattern-detectable).
71    /// Types: Money, Percent, Quantity, Cardinal, Ordinal
72    Numeric,
73    /// Contact/identifier entities (pattern-detectable).
74    /// Types: Email, Url, Phone
75    Contact,
76    /// Relation triggers for knowledge graph construction (ML-required).
77    /// Examples: "CEO of", "located in", "founded by"
78    /// In GLiNER bi-encoder, relations are just another label to match.
79    Relation,
80    /// Miscellaneous/unknown category
81    Misc,
82}
83
84impl EntityCategory {
85    /// Returns true if this category requires ML for detection.
86    #[must_use]
87    pub const fn requires_ml(&self) -> bool {
88        matches!(
89            self,
90            EntityCategory::Agent
91                | EntityCategory::Organization
92                | EntityCategory::Place
93                | EntityCategory::Creative
94                | EntityCategory::Relation
95        )
96    }
97
98    /// Returns true if this category can be detected via patterns.
99    #[must_use]
100    pub const fn pattern_detectable(&self) -> bool {
101        matches!(
102            self,
103            EntityCategory::Temporal | EntityCategory::Numeric | EntityCategory::Contact
104        )
105    }
106
107    /// Returns true if this is a relation (for knowledge graph construction).
108    #[must_use]
109    pub const fn is_relation(&self) -> bool {
110        matches!(self, EntityCategory::Relation)
111    }
112
113    /// Returns OntoNotes-compatible category name.
114    #[must_use]
115    pub const fn as_str(&self) -> &'static str {
116        match self {
117            EntityCategory::Agent => "agent",
118            EntityCategory::Organization => "organization",
119            EntityCategory::Place => "place",
120            EntityCategory::Creative => "creative",
121            EntityCategory::Temporal => "temporal",
122            EntityCategory::Numeric => "numeric",
123            EntityCategory::Contact => "contact",
124            EntityCategory::Relation => "relation",
125            EntityCategory::Misc => "misc",
126        }
127    }
128}
129
130impl std::fmt::Display for EntityCategory {
131    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
132        write!(f, "{}", self.as_str())
133    }
134}
135
136// ============================================================================
137// Entity Type
138// ============================================================================
139
140/// Entity type classification.
141///
142/// Organized into categories:
143/// - **Named** (ML-required): Person, Organization, Location
144/// - **Temporal** (pattern): Date, Time
145/// - **Numeric** (pattern): Money, Percent, Quantity, Cardinal, Ordinal
146/// - **Contact** (pattern): Email, Url, Phone
147///
148/// # Examples
149///
150/// ```
151/// use anno_core::EntityType;
152///
153/// let ty = EntityType::Email;
154/// assert!(ty.category().pattern_detectable());
155/// assert!(!ty.category().requires_ml());
156///
157/// let ty = EntityType::Person;
158/// assert!(ty.category().requires_ml());
159/// ```
160#[derive(Debug, Clone, PartialEq, Eq, Hash)]
161#[non_exhaustive]
162pub enum EntityType {
163    // === Named Entities (ML-required) ===
164    /// Person name (PER) - requires ML/context
165    Person,
166    /// Organization name (ORG) - requires ML/context
167    Organization,
168    /// Location/Place (LOC/GPE) - requires ML/context
169    Location,
170
171    // === Temporal Entities (Pattern-detectable) ===
172    /// Date expression (DATE) - pattern-detectable
173    Date,
174    /// Time expression (TIME) - pattern-detectable
175    Time,
176
177    // === Numeric Entities (Pattern-detectable) ===
178    /// Monetary value (MONEY) - pattern-detectable
179    Money,
180    /// Percentage (PERCENT) - pattern-detectable
181    Percent,
182    /// Quantity with unit (QUANTITY) - pattern-detectable
183    Quantity,
184    /// Cardinal number (CARDINAL) - pattern-detectable
185    Cardinal,
186    /// Ordinal number (ORDINAL) - pattern-detectable
187    Ordinal,
188
189    // === Contact Entities (Pattern-detectable) ===
190    /// Email address - pattern-detectable
191    Email,
192    /// URL/URI - pattern-detectable
193    Url,
194    /// Phone number - pattern-detectable
195    Phone,
196
197    // === Extensibility ===
198    /// Domain-specific custom type with explicit category
199    Custom {
200        /// Type name (e.g., "DISEASE", "PRODUCT", "EVENT")
201        name: String,
202        /// Category for this custom type
203        category: EntityCategory,
204    },
205}
206
207impl EntityType {
208    /// Get the category of this entity type.
209    #[must_use]
210    pub fn category(&self) -> EntityCategory {
211        match self {
212            // Agent entities (people/groups)
213            EntityType::Person => EntityCategory::Agent,
214            // Organization entities
215            EntityType::Organization => EntityCategory::Organization,
216            // Place entities (locations)
217            EntityType::Location => EntityCategory::Place,
218            // Temporal entities
219            EntityType::Date | EntityType::Time => EntityCategory::Temporal,
220            // Numeric entities
221            EntityType::Money
222            | EntityType::Percent
223            | EntityType::Quantity
224            | EntityType::Cardinal
225            | EntityType::Ordinal => EntityCategory::Numeric,
226            // Contact entities
227            EntityType::Email | EntityType::Url | EntityType::Phone => EntityCategory::Contact,
228            // Custom with explicit category
229            EntityType::Custom { category, .. } => *category,
230        }
231    }
232
233    /// Returns true if this entity type requires ML for detection.
234    #[must_use]
235    pub fn requires_ml(&self) -> bool {
236        self.category().requires_ml()
237    }
238
239    /// Returns true if this entity type can be detected via patterns.
240    #[must_use]
241    pub fn pattern_detectable(&self) -> bool {
242        self.category().pattern_detectable()
243    }
244
245    /// Convert to standard label string (CoNLL/OntoNotes format).
246    ///
247    /// ```
248    /// use anno_core::EntityType;
249    ///
250    /// assert_eq!(EntityType::Person.as_label(), "PER");
251    /// assert_eq!(EntityType::Location.as_label(), "LOC");
252    /// ```
253    #[must_use]
254    pub fn as_label(&self) -> &str {
255        match self {
256            EntityType::Person => "PER",
257            EntityType::Organization => "ORG",
258            EntityType::Location => "LOC",
259            EntityType::Date => "DATE",
260            EntityType::Time => "TIME",
261            EntityType::Money => "MONEY",
262            EntityType::Percent => "PERCENT",
263            EntityType::Quantity => "QUANTITY",
264            EntityType::Cardinal => "CARDINAL",
265            EntityType::Ordinal => "ORDINAL",
266            EntityType::Email => "EMAIL",
267            EntityType::Url => "URL",
268            EntityType::Phone => "PHONE",
269            EntityType::Custom { name, .. } => name.as_str(),
270        }
271    }
272
273    /// Parse from standard label string.
274    ///
275    /// Handles various formats: CoNLL (PER), OntoNotes (PERSON), BIO (B-PER).
276    ///
277    /// ```
278    /// use anno_core::EntityType;
279    ///
280    /// assert_eq!(EntityType::from_label("PER"), EntityType::Person);
281    /// assert_eq!(EntityType::from_label("B-ORG"), EntityType::Organization);
282    /// assert_eq!(EntityType::from_label("PERSON"), EntityType::Person);
283    /// ```
284    #[must_use]
285    pub fn from_label(label: &str) -> Self {
286        // Strip BIO prefix if present
287        let label = label
288            .strip_prefix("B-")
289            .or_else(|| label.strip_prefix("I-"))
290            .or_else(|| label.strip_prefix("E-"))
291            .or_else(|| label.strip_prefix("S-"))
292            .unwrap_or(label);
293
294        match label.to_uppercase().as_str() {
295            // Named entities (multiple variations)
296            "PER" | "PERSON" => EntityType::Person,
297            "ORG" | "ORGANIZATION" | "COMPANY" | "CORPORATION" => EntityType::Organization,
298            "LOC" | "LOCATION" | "GPE" | "GEO-LOC" => EntityType::Location,
299            // WNUT / FewNERD specific types (common in social media / Wikipedia)
300            "FACILITY" | "FAC" | "BUILDING" => {
301                EntityType::custom("BUILDING", EntityCategory::Place)
302            }
303            "PRODUCT" | "PROD" => EntityType::custom("PRODUCT", EntityCategory::Misc),
304            "EVENT" => EntityType::custom("EVENT", EntityCategory::Creative),
305            "CREATIVE-WORK" | "WORK_OF_ART" | "ART" => {
306                EntityType::custom("CREATIVE_WORK", EntityCategory::Creative)
307            }
308            "GROUP" | "NORP" => EntityType::custom("GROUP", EntityCategory::Agent),
309            // Temporal
310            "DATE" => EntityType::Date,
311            "TIME" => EntityType::Time,
312            // Numeric
313            "MONEY" | "CURRENCY" => EntityType::Money,
314            "PERCENT" | "PERCENTAGE" => EntityType::Percent,
315            "QUANTITY" => EntityType::Quantity,
316            "CARDINAL" => EntityType::Cardinal,
317            "ORDINAL" => EntityType::Ordinal,
318            // Contact
319            "EMAIL" => EntityType::Email,
320            "URL" | "URI" => EntityType::Url,
321            "PHONE" | "TELEPHONE" => EntityType::Phone,
322            // MISC variations
323            "MISC" | "MISCELLANEOUS" | "OTHER" => EntityType::custom("MISC", EntityCategory::Misc),
324            // Biomedical types
325            "DISEASE" | "DISORDER" => EntityType::custom("DISEASE", EntityCategory::Misc),
326            "CHEMICAL" | "DRUG" => EntityType::custom("CHEMICAL", EntityCategory::Misc),
327            "GENE" => EntityType::custom("GENE", EntityCategory::Misc),
328            "PROTEIN" => EntityType::custom("PROTEIN", EntityCategory::Misc),
329            // Unknown -> Custom with Misc category
330            other => EntityType::custom(other, EntityCategory::Misc),
331        }
332    }
333
334    /// Create a custom domain-specific entity type.
335    ///
336    /// # Examples
337    ///
338    /// ```
339    /// use anno_core::{EntityType, EntityCategory};
340    ///
341    /// let disease = EntityType::custom("DISEASE", EntityCategory::Agent);
342    /// assert!(disease.requires_ml());
343    ///
344    /// let product_id = EntityType::custom("PRODUCT_ID", EntityCategory::Misc);
345    /// assert!(!product_id.requires_ml());
346    /// ```
347    #[must_use]
348    pub fn custom(name: impl Into<String>, category: EntityCategory) -> Self {
349        EntityType::Custom {
350            name: name.into(),
351            category,
352        }
353    }
354}
355
356impl std::fmt::Display for EntityType {
357    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
358        write!(f, "{}", self.as_label())
359    }
360}
361
362impl std::str::FromStr for EntityType {
363    type Err = std::convert::Infallible;
364
365    /// Parse from standard label string. Never fails -- unknown labels become `Custom`.
366    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
367        Ok(Self::from_label(s))
368    }
369}
370
371// Flatten EntityType to its label string for JSON serialization.
372// `Custom { name: "MISC", .. }` -> `"MISC"`, `Person` -> `"PER"`, etc.
373// Deserialization accepts both the flat string (new format) and the
374// tagged-enum object (backward compat with existing serialized data).
375impl Serialize for EntityType {
376    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
377        serializer.serialize_str(self.as_label())
378    }
379}
380
381impl<'de> Deserialize<'de> for EntityType {
382    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
383        struct EntityTypeVisitor;
384
385        impl<'de> serde::de::Visitor<'de> for EntityTypeVisitor {
386            type Value = EntityType;
387
388            fn expecting(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
389                f.write_str("a string label or a tagged enum object")
390            }
391
392            // New flat format: "PER", "ORG", "MISC", etc.
393            fn visit_str<E: serde::de::Error>(self, v: &str) -> Result<EntityType, E> {
394                Ok(EntityType::from_label(v))
395            }
396
397            // Backward-compat: {"Custom":{"name":"MISC","category":"Misc"}}
398            // or {"Other":"foo"} or "Person" (unit variant as map key)
399            fn visit_map<A: serde::de::MapAccess<'de>>(
400                self,
401                mut map: A,
402            ) -> Result<EntityType, A::Error> {
403                let key: String = map
404                    .next_key()?
405                    .ok_or_else(|| serde::de::Error::custom("empty object"))?;
406                match key.as_str() {
407                    "Custom" => {
408                        #[derive(Deserialize)]
409                        struct CustomFields {
410                            name: String,
411                            category: EntityCategory,
412                        }
413                        let fields: CustomFields = map.next_value()?;
414                        Ok(EntityType::Custom {
415                            name: fields.name,
416                            category: fields.category,
417                        })
418                    }
419                    "Other" => {
420                        // Route legacy Other to Custom with Misc category
421                        let val: String = map.next_value()?;
422                        Ok(EntityType::custom(val, EntityCategory::Misc))
423                    }
424                    // Unit variants serialized as {"Person":null} etc.
425                    variant => {
426                        // Consume the value (null or unit)
427                        let _: serde::de::IgnoredAny = map.next_value()?;
428                        Ok(EntityType::from_label(variant))
429                    }
430                }
431            }
432        }
433
434        deserializer.deserialize_any(EntityTypeVisitor)
435    }
436}
437
438// =============================================================================
439// Type Mapping for Domain-Specific Datasets
440// =============================================================================
441
442/// Maps domain-specific entity types to standard NER types.
443///
444/// # Research Context (Familiarity paper, arXiv:2412.10121)
445///
446/// Type mapping creates "label overlap" between training and evaluation:
447/// - Mapping ACTOR → Person increases overlap
448/// - This can inflate zero-shot F1 scores
449///
450/// Use `LabelShift::from_type_sets()` to quantify how much overlap exists.
451/// High overlap (>80%) means the evaluation is NOT truly zero-shot.
452///
453/// # When to Use TypeMapper
454///
455/// - Cross-dataset comparison (normalize schemas for fair eval)
456/// - Domain adaptation (map new labels to known types)
457///
458/// # When NOT to Use TypeMapper
459///
460/// - True zero-shot evaluation (keep labels distinct)
461/// - Measuring generalization (overlap hides generalization failures)
462///
463/// # Example
464///
465/// ```rust
466/// use anno_core::{TypeMapper, EntityType, EntityCategory};
467///
468/// // MIT Movie dataset mapping
469/// let mut mapper = TypeMapper::new();
470/// mapper.add("ACTOR", EntityType::Person);
471/// mapper.add("DIRECTOR", EntityType::Person);
472/// mapper.add("TITLE", EntityType::custom("WORK_OF_ART", EntityCategory::Creative));
473///
474/// assert_eq!(mapper.map("ACTOR"), Some(&EntityType::Person));
475/// assert_eq!(mapper.normalize("DIRECTOR"), EntityType::Person);
476/// ```
477#[derive(Debug, Clone, Default)]
478pub struct TypeMapper {
479    mappings: std::collections::HashMap<String, EntityType>,
480}
481
482impl TypeMapper {
483    /// Create empty mapper.
484    #[must_use]
485    pub fn new() -> Self {
486        Self::default()
487    }
488
489    /// Create mapper for MIT Movie dataset.
490    #[must_use]
491    pub fn mit_movie() -> Self {
492        let mut mapper = Self::new();
493        // Map to standard types where possible
494        mapper.add("ACTOR", EntityType::Person);
495        mapper.add("DIRECTOR", EntityType::Person);
496        mapper.add("CHARACTER", EntityType::Person);
497        mapper.add(
498            "TITLE",
499            EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
500        );
501        mapper.add("GENRE", EntityType::custom("GENRE", EntityCategory::Misc));
502        mapper.add("YEAR", EntityType::Date);
503        mapper.add("RATING", EntityType::custom("RATING", EntityCategory::Misc));
504        mapper.add("PLOT", EntityType::custom("PLOT", EntityCategory::Misc));
505        mapper
506    }
507
508    /// Create mapper for MIT Restaurant dataset.
509    #[must_use]
510    pub fn mit_restaurant() -> Self {
511        let mut mapper = Self::new();
512        mapper.add("RESTAURANT_NAME", EntityType::Organization);
513        mapper.add("LOCATION", EntityType::Location);
514        mapper.add(
515            "CUISINE",
516            EntityType::custom("CUISINE", EntityCategory::Misc),
517        );
518        mapper.add("DISH", EntityType::custom("DISH", EntityCategory::Misc));
519        mapper.add("PRICE", EntityType::Money);
520        mapper.add(
521            "AMENITY",
522            EntityType::custom("AMENITY", EntityCategory::Misc),
523        );
524        mapper.add("HOURS", EntityType::Time);
525        mapper
526    }
527
528    /// Create mapper for biomedical datasets (BC5CDR, NCBI).
529    #[must_use]
530    pub fn biomedical() -> Self {
531        let mut mapper = Self::new();
532        mapper.add(
533            "DISEASE",
534            EntityType::custom("DISEASE", EntityCategory::Agent),
535        );
536        mapper.add(
537            "CHEMICAL",
538            EntityType::custom("CHEMICAL", EntityCategory::Misc),
539        );
540        mapper.add("DRUG", EntityType::custom("DRUG", EntityCategory::Misc));
541        mapper.add("GENE", EntityType::custom("GENE", EntityCategory::Misc));
542        mapper.add(
543            "PROTEIN",
544            EntityType::custom("PROTEIN", EntityCategory::Misc),
545        );
546        // GENIA types
547        mapper.add("DNA", EntityType::custom("DNA", EntityCategory::Misc));
548        mapper.add("RNA", EntityType::custom("RNA", EntityCategory::Misc));
549        mapper.add(
550            "cell_line",
551            EntityType::custom("CELL_LINE", EntityCategory::Misc),
552        );
553        mapper.add(
554            "cell_type",
555            EntityType::custom("CELL_TYPE", EntityCategory::Misc),
556        );
557        mapper
558    }
559
560    /// Create mapper for social media NER datasets (TweetNER7, etc.).
561    #[must_use]
562    pub fn social_media() -> Self {
563        let mut mapper = Self::new();
564        // TweetNER7 types
565        mapper.add("person", EntityType::Person);
566        mapper.add("corporation", EntityType::Organization);
567        mapper.add("location", EntityType::Location);
568        mapper.add("group", EntityType::Organization);
569        mapper.add(
570            "product",
571            EntityType::custom("PRODUCT", EntityCategory::Misc),
572        );
573        mapper.add(
574            "creative_work",
575            EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
576        );
577        mapper.add("event", EntityType::custom("EVENT", EntityCategory::Misc));
578        mapper
579    }
580
581    /// Create mapper for manufacturing domain datasets (FabNER, etc.).
582    #[must_use]
583    pub fn manufacturing() -> Self {
584        let mut mapper = Self::new();
585        // FabNER entity types
586        mapper.add("MATE", EntityType::custom("MATERIAL", EntityCategory::Misc));
587        mapper.add("MANP", EntityType::custom("PROCESS", EntityCategory::Misc));
588        mapper.add("MACEQ", EntityType::custom("MACHINE", EntityCategory::Misc));
589        mapper.add(
590            "APPL",
591            EntityType::custom("APPLICATION", EntityCategory::Misc),
592        );
593        mapper.add("FEAT", EntityType::custom("FEATURE", EntityCategory::Misc));
594        mapper.add(
595            "PARA",
596            EntityType::custom("PARAMETER", EntityCategory::Misc),
597        );
598        mapper.add("PRO", EntityType::custom("PROPERTY", EntityCategory::Misc));
599        mapper.add(
600            "CHAR",
601            EntityType::custom("CHARACTERISTIC", EntityCategory::Misc),
602        );
603        mapper.add(
604            "ENAT",
605            EntityType::custom("ENABLING_TECHNOLOGY", EntityCategory::Misc),
606        );
607        mapper.add(
608            "CONPRI",
609            EntityType::custom("CONCEPT_PRINCIPLE", EntityCategory::Misc),
610        );
611        mapper.add(
612            "BIOP",
613            EntityType::custom("BIO_PROCESS", EntityCategory::Misc),
614        );
615        mapper.add(
616            "MANS",
617            EntityType::custom("MAN_STANDARD", EntityCategory::Misc),
618        );
619        mapper
620    }
621
622    /// Add a mapping from source label to target type.
623    pub fn add(&mut self, source: impl Into<String>, target: EntityType) {
624        self.mappings.insert(source.into().to_uppercase(), target);
625    }
626
627    /// Get mapped type for a label (returns None if not mapped).
628    #[must_use]
629    pub fn map(&self, label: &str) -> Option<&EntityType> {
630        self.mappings.get(&label.to_uppercase())
631    }
632
633    /// Normalize a label to EntityType, using mapping if available.
634    ///
635    /// Falls back to `EntityType::from_label()` if no mapping exists.
636    #[must_use]
637    pub fn normalize(&self, label: &str) -> EntityType {
638        self.map(label)
639            .cloned()
640            .unwrap_or_else(|| EntityType::from_label(label))
641    }
642
643    /// Check if a label is mapped.
644    #[must_use]
645    pub fn contains(&self, label: &str) -> bool {
646        self.mappings.contains_key(&label.to_uppercase())
647    }
648
649    /// Get all source labels.
650    pub fn labels(&self) -> impl Iterator<Item = &String> {
651        self.mappings.keys()
652    }
653}
654
655/// Extraction method used to identify an entity.
656///
657/// # Research Context
658///
659/// Different extraction methods have different strengths:
660///
661/// | Method | Precision | Recall | Generalization | Use Case |
662/// |--------|-----------|--------|----------------|----------|
663/// | Pattern | Very High | Low | N/A (format-based) | Dates, emails, money |
664/// | Neural | High | High | Good | General NER |
665///
666/// See `docs/` for repo-local notes and entry points.
667#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
668#[non_exhaustive]
669pub enum ExtractionMethod {
670    /// Regex pattern matching (high precision for structured data like dates, money).
671    /// Does not generalize - only detects format-based entities.
672    Pattern,
673
674    /// Neural model inference (BERT, GLiNER, etc.).
675    /// The recommended default for general NER. Generalizes to unseen entities.
676    #[default]
677    Neural,
678
679    /// Multiple methods agreed on this entity (high confidence).
680    Consensus,
681
682    /// Heuristic-based extraction (capitalization, word shape, context).
683    /// Used by heuristic backends that don't use neural models.
684    Heuristic,
685
686    /// Unknown or unspecified extraction method.
687    Unknown,
688}
689
690impl ExtractionMethod {
691    /// Returns true if this extraction method produces probabilistically calibrated
692    /// confidence scores suitable for calibration analysis (ECE, Brier score, etc.).
693    ///
694    /// # Calibrated Methods
695    ///
696    /// - **Neural**: Softmax outputs are intended to be probabilistic (though may need
697    ///   temperature scaling for true calibration)
698    ///
699    /// # Uncalibrated Methods
700    ///
701    /// - **Pattern**: Binary (match/no-match); confidence is typically hardcoded
702    /// - **Heuristic**: Arbitrary scores from hand-crafted rules
703    /// - **Consensus**: Agreement count, not a probability
704    ///
705    /// # Example
706    ///
707    /// ```rust
708    /// use anno_core::ExtractionMethod;
709    ///
710    /// assert!(ExtractionMethod::Neural.is_calibrated());
711    /// assert!(!ExtractionMethod::Pattern.is_calibrated());
712    /// assert!(!ExtractionMethod::Heuristic.is_calibrated());
713    /// ```
714    #[must_use]
715    pub const fn is_calibrated(&self) -> bool {
716        match self {
717            ExtractionMethod::Neural => true,
718            // Everything else is not calibrated
719            ExtractionMethod::Pattern => false,
720            ExtractionMethod::Consensus => false,
721            ExtractionMethod::Heuristic => false,
722            ExtractionMethod::Unknown => false,
723        }
724    }
725
726    /// Returns the confidence interpretation for this extraction method.
727    ///
728    /// This helps users understand what the confidence score means:
729    /// - `"probability"`: Score approximates P(correct)
730    /// - `"heuristic_score"`: Score is a non-probabilistic quality measure
731    /// - `"binary"`: Score is 0 or 1 (or a fixed value for matches)
732    #[must_use]
733    pub const fn confidence_interpretation(&self) -> &'static str {
734        match self {
735            ExtractionMethod::Neural => "probability",
736            ExtractionMethod::Pattern => "binary",
737            ExtractionMethod::Heuristic => "heuristic_score",
738            ExtractionMethod::Consensus => "agreement_ratio",
739            ExtractionMethod::Unknown => "unknown",
740        }
741    }
742}
743
744impl std::fmt::Display for ExtractionMethod {
745    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
746        match self {
747            ExtractionMethod::Pattern => write!(f, "pattern"),
748            ExtractionMethod::Neural => write!(f, "neural"),
749            ExtractionMethod::Consensus => write!(f, "consensus"),
750            ExtractionMethod::Heuristic => write!(f, "heuristic"),
751            ExtractionMethod::Unknown => write!(f, "unknown"),
752        }
753    }
754}
755
756// =============================================================================
757// Lexicon Traits
758// =============================================================================
759
760/// Exact-match lexicon/gazetteer for entity lookup.
761///
762/// # Research Context
763///
764/// Gazetteers (lists of known entities) are a classic NER technique. Modern research
765/// suggests they are most valuable when:
766///
767/// 1. **Domain is closed**: Stock tickers, medical codes, known product catalogs
768/// 2. **Text is short**: where context is insufficient
769/// 3. **Used as features**: Input to neural model, not final output (Song et al. 2020)
770///
771/// They're harmful when:
772/// 1. **Domain is open**: Novel entities not in the list get missed
773/// 2. **Used as authority**: Hardcoded lookups inflate test scores but fail in production
774///
775/// # When to Use
776///
777/// ```text
778/// Decision: Should I use a Lexicon?
779///
780/// Is entity type CLOSED (fixed, known list)?
781/// ├─ Yes: Lexicon is appropriate
782/// │       Examples: stock tickers, ICD-10 codes, country names
783/// └─ No:  Use Neural extraction instead
784///         Examples: person names, organization names, products
785/// ```
786///
787/// # Example
788///
789/// ```rust
790/// use anno_core::{Lexicon, EntityType, HashMapLexicon};
791///
792/// // Create a domain-specific lexicon
793/// let mut lexicon = HashMapLexicon::new("stock_tickers");
794/// lexicon.insert("AAPL", EntityType::Organization, 0.99);
795/// lexicon.insert("GOOGL", EntityType::Organization, 0.99);
796///
797/// // Lookup
798/// if let Some((entity_type, confidence)) = lexicon.lookup("AAPL") {
799///     assert_eq!(entity_type, EntityType::Organization);
800///     assert!(confidence > 0.9);
801/// }
802/// ```
803///
804/// # References
805///
806/// - Song et al. (2020). "Improving Neural NER with Gazetteers"
807/// - Nie et al. (2021). "GEMNET: Effective Gated Gazetteer Representations"
808/// - Rijhwani et al. (2020). "Soft Gazetteers for Low-Resource NER"
809pub trait Lexicon: Send + Sync {
810    /// Lookup an exact string, returning entity type and confidence if found.
811    ///
812    /// Returns `None` if the text is not in the lexicon.
813    fn lookup(&self, text: &str) -> Option<(EntityType, Confidence)>;
814
815    /// Check if the lexicon contains this exact string.
816    fn contains(&self, text: &str) -> bool {
817        self.lookup(text).is_some()
818    }
819
820    /// Get the lexicon source identifier (for provenance tracking).
821    fn source(&self) -> &str;
822
823    /// Get approximate number of entries (for debugging/metrics).
824    fn len(&self) -> usize;
825
826    /// Check if lexicon is empty.
827    fn is_empty(&self) -> bool {
828        self.len() == 0
829    }
830}
831
832/// Simple HashMap-based lexicon implementation.
833///
834/// Suitable for small to medium lexicons (<100k entries).
835/// For larger lexicons, consider a trie-based or FST implementation.
836#[derive(Debug, Clone)]
837pub struct HashMapLexicon {
838    entries: std::collections::HashMap<String, (EntityType, Confidence)>,
839    source: String,
840}
841
842impl HashMapLexicon {
843    /// Create a new empty lexicon with the given source identifier.
844    #[must_use]
845    pub fn new(source: impl Into<String>) -> Self {
846        Self {
847            entries: std::collections::HashMap::new(),
848            source: source.into(),
849        }
850    }
851
852    /// Insert an entry into the lexicon.
853    pub fn insert(
854        &mut self,
855        text: impl Into<String>,
856        entity_type: EntityType,
857        confidence: impl Into<Confidence>,
858    ) {
859        self.entries
860            .insert(text.into(), (entity_type, confidence.into()));
861    }
862
863    /// Create from an iterator of (text, type, confidence) tuples.
864    pub fn from_iter<I, S, C>(source: impl Into<String>, entries: I) -> Self
865    where
866        I: IntoIterator<Item = (S, EntityType, C)>,
867        S: Into<String>,
868        C: Into<Confidence>,
869    {
870        let mut lexicon = Self::new(source);
871        for (text, entity_type, conf) in entries {
872            lexicon.insert(text, entity_type, conf);
873        }
874        lexicon
875    }
876
877    /// Get all entries as an iterator (for debugging).
878    pub fn entries(&self) -> impl Iterator<Item = (&str, &EntityType, Confidence)> {
879        self.entries.iter().map(|(k, (t, c))| (k.as_str(), t, *c))
880    }
881}
882
883impl Lexicon for HashMapLexicon {
884    fn lookup(&self, text: &str) -> Option<(EntityType, Confidence)> {
885        self.entries.get(text).cloned()
886    }
887
888    fn source(&self) -> &str {
889        &self.source
890    }
891
892    fn len(&self) -> usize {
893        self.entries.len()
894    }
895}
896
897/// Provenance information for an extracted entity.
898///
899/// Tracks where an entity came from for debugging, explainability,
900/// and confidence calibration in hybrid/ensemble systems.
901#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
902pub struct Provenance {
903    /// Name of the backend that produced this entity (e.g., "pattern", "bert-onnx")
904    pub source: Cow<'static, str>,
905    /// Extraction method used
906    pub method: ExtractionMethod,
907    /// Specific pattern/rule name (for pattern/rule-based extraction)
908    pub pattern: Option<Cow<'static, str>>,
909    /// Raw confidence from the source model (before any calibration)
910    pub raw_confidence: Option<Confidence>,
911    /// Model version for reproducibility (e.g., "gliner-v2.1", "bert-base-uncased-2024-01")
912    #[serde(default, skip_serializing_if = "Option::is_none")]
913    pub model_version: Option<Cow<'static, str>>,
914    /// Timestamp when extraction occurred (ISO 8601)
915    #[serde(default, skip_serializing_if = "Option::is_none")]
916    pub timestamp: Option<String>,
917}
918
919impl Provenance {
920    /// Create provenance for regex-based extraction.
921    #[must_use]
922    pub fn pattern(pattern_name: &'static str) -> Self {
923        Self {
924            source: Cow::Borrowed("pattern"),
925            method: ExtractionMethod::Pattern,
926            pattern: Some(Cow::Borrowed(pattern_name)),
927            raw_confidence: Some(Confidence::ONE), // Patterns are deterministic
928            model_version: None,
929            timestamp: None,
930        }
931    }
932
933    /// Create provenance for ML-based extraction.
934    ///
935    /// Accepts both static strings and owned strings:
936    /// ```rust
937    /// use anno_core::Provenance;
938    ///
939    /// // Static string (zero allocation)
940    /// let p1 = Provenance::ml("gliner", 0.95);
941    ///
942    /// // Owned string (dynamic model name)
943    /// let model_name = "bert-base";
944    /// let p2 = Provenance::ml(model_name.to_string(), 0.95);
945    /// ```
946    #[must_use]
947    pub fn ml(model_name: impl Into<Cow<'static, str>>, confidence: impl Into<Confidence>) -> Self {
948        Self {
949            source: model_name.into(),
950            method: ExtractionMethod::Neural,
951            pattern: None,
952            raw_confidence: Some(confidence.into()),
953            model_version: None,
954            timestamp: None,
955        }
956    }
957
958    /// Create provenance for ensemble/hybrid extraction.
959    #[must_use]
960    pub fn ensemble(sources: &'static str) -> Self {
961        Self {
962            source: Cow::Borrowed(sources),
963            method: ExtractionMethod::Consensus,
964            pattern: None,
965            raw_confidence: None,
966            model_version: None,
967            timestamp: None,
968        }
969    }
970
971    /// Create provenance with model version for reproducibility.
972    #[must_use]
973    pub fn with_version(mut self, version: &'static str) -> Self {
974        self.model_version = Some(Cow::Borrowed(version));
975        self
976    }
977
978    /// Create provenance with timestamp.
979    #[must_use]
980    pub fn with_timestamp(mut self, timestamp: impl Into<String>) -> Self {
981        self.timestamp = Some(timestamp.into());
982        self
983    }
984}
985
986// ============================================================================
987// Span Types (Multi-Modal Support)
988// ============================================================================
989
990/// A span locator for text and visual modalities.
991///
992/// `Span` is a **simplified subset** of [`grounded::Location`] designed for
993/// the detection layer (`Entity`). It covers the most common cases:
994///
995/// - Text offsets (traditional NER)
996/// - Bounding boxes (visual document understanding)
997/// - Hybrid (OCR with both text and visual location)
998///
999/// # Relationship to `Location`
1000///
1001/// | `Span` variant | `Location` equivalent |
1002/// |----------------|-----------------------|
1003/// | `Text` | `Location::Text` |
1004/// | `BoundingBox` | `Location::BoundingBox` |
1005/// | `Hybrid` | `Location::TextWithBbox` |
1006///
1007/// For modalities not covered by `Span` (temporal, cuboid, genomic, discontinuous),
1008/// use `Location` directly via the canonical `Signal` → `Track` → `Identity` pipeline.
1009///
1010/// # Conversion
1011///
1012/// - `Span → Location`: Always succeeds via `Location::from(&span)`
1013/// - `Location → Span`: Use `location.to_span()`, returns `None` for unsupported variants
1014///
1015/// [`grounded::Location`]: super::grounded::Location
1016#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
1017pub enum Span {
1018    /// Text span with **character offsets** (start, end).
1019    ///
1020    /// Offsets are Unicode scalar value indices (what `text.chars()` counts),
1021    /// consistent with `Entity.start/end` and `grounded::Location::Text`.
1022    Text {
1023        /// Start character offset (inclusive)
1024        start: usize,
1025        /// End character offset (exclusive)
1026        end: usize,
1027    },
1028    /// Visual bounding box (normalized 0.0-1.0 coordinates)
1029    /// For ColPali: image patch locations
1030    BoundingBox {
1031        /// X coordinate (normalized 0.0-1.0)
1032        x: f32,
1033        /// Y coordinate (normalized 0.0-1.0)
1034        y: f32,
1035        /// Width (normalized 0.0-1.0)
1036        width: f32,
1037        /// Height (normalized 0.0-1.0)
1038        height: f32,
1039        /// Optional page number (for multi-page documents)
1040        page: Option<u32>,
1041    },
1042    /// Hybrid: both text and visual location (for OCR-verified extraction)
1043    Hybrid {
1044        /// Start character offset (inclusive)
1045        start: usize,
1046        /// End character offset (exclusive)
1047        end: usize,
1048        /// Bounding box for visual location
1049        bbox: Box<Span>,
1050    },
1051}
1052
1053impl Span {
1054    /// Create a text span.
1055    #[must_use]
1056    pub const fn text(start: usize, end: usize) -> Self {
1057        Self::Text { start, end }
1058    }
1059
1060    /// Create a bounding box span with normalized coordinates.
1061    #[must_use]
1062    pub fn bbox(x: f32, y: f32, width: f32, height: f32) -> Self {
1063        Self::BoundingBox {
1064            x,
1065            y,
1066            width,
1067            height,
1068            page: None,
1069        }
1070    }
1071
1072    /// Create a bounding box with page number.
1073    #[must_use]
1074    pub fn bbox_on_page(x: f32, y: f32, width: f32, height: f32, page: u32) -> Self {
1075        Self::BoundingBox {
1076            x,
1077            y,
1078            width,
1079            height,
1080            page: Some(page),
1081        }
1082    }
1083
1084    /// Check if this is a text span.
1085    #[must_use]
1086    pub const fn is_text(&self) -> bool {
1087        matches!(self, Self::Text { .. } | Self::Hybrid { .. })
1088    }
1089
1090    /// Check if this has visual location.
1091    #[must_use]
1092    pub const fn is_visual(&self) -> bool {
1093        matches!(self, Self::BoundingBox { .. } | Self::Hybrid { .. })
1094    }
1095
1096    /// Get text offsets if available.
1097    #[must_use]
1098    pub const fn text_offsets(&self) -> Option<(usize, usize)> {
1099        match self {
1100            Self::Text { start, end } => Some((*start, *end)),
1101            Self::Hybrid { start, end, .. } => Some((*start, *end)),
1102            Self::BoundingBox { .. } => None,
1103        }
1104    }
1105
1106    /// Calculate span length for text spans.
1107    #[must_use]
1108    pub fn len(&self) -> usize {
1109        match self {
1110            Self::Text { start, end } => end.saturating_sub(*start),
1111            Self::Hybrid { start, end, .. } => end.saturating_sub(*start),
1112            Self::BoundingBox { .. } => 0,
1113        }
1114    }
1115
1116    /// Check if span is empty.
1117    #[must_use]
1118    pub fn is_empty(&self) -> bool {
1119        self.len() == 0
1120    }
1121}
1122
1123// ============================================================================
1124// Discontinuous Spans (W2NER/ACE-style)
1125// ============================================================================
1126
1127/// A discontinuous span representing non-contiguous entity mentions.
1128///
1129/// Some entities span multiple non-adjacent text regions:
1130/// - "severe \[pain\] in the \[abdomen\]" → "severe abdominal pain"
1131/// - "the \[president\] ... \[Obama\]" → coreference
1132///
1133/// This is required for:
1134/// - **Medical NER**: Anatomical modifiers separated from findings
1135/// - **Legal NER**: Parties referenced across clauses
1136/// - **W2NER**: Word-word relation grids that detect discontinuous entities
1137///
1138/// # Offset Unit (CRITICAL)
1139///
1140/// `DiscontinuousSpan` uses **character offsets** (Unicode scalar value indices),
1141/// consistent with [`Entity::start`](super::entity::Entity::start) /
1142/// [`Entity::end`](super::entity::Entity::end) and `anno::core::grounded::Location`.
1143///
1144/// This is intentionally *not* byte offsets. If you have byte offsets (from regex,
1145/// `str::find`, tokenizers, etc.), convert them to character offsets first (see
1146/// `anno::offset::SpanConverter` in the `anno` crate).
1147///
1148/// # Example
1149///
1150/// ```rust
1151/// use anno_core::DiscontinuousSpan;
1152///
1153/// // "severe pain in the abdomen" where "severe" modifies "pain"
1154/// // but they're separated by other words
1155/// let span = DiscontinuousSpan::new(vec![
1156///     0..6,   // "severe"
1157///     12..16, // "pain"
1158/// ]);
1159///
1160/// assert_eq!(span.num_segments(), 2);
1161/// assert!(span.is_discontinuous());
1162/// ```
1163#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
1164pub struct DiscontinuousSpan {
1165    /// Non-overlapping segments, sorted by start position.
1166    /// Each `Range<usize>` represents (start_char, end_char).
1167    segments: Vec<std::ops::Range<usize>>,
1168}
1169
1170impl DiscontinuousSpan {
1171    /// Create a new discontinuous span from segments.
1172    ///
1173    /// Segments are sorted by start position and overlapping segments are
1174    /// merged. Empty segments (where `start >= end`) are discarded.
1175    #[must_use]
1176    pub fn new(mut segments: Vec<std::ops::Range<usize>>) -> Self {
1177        // Discard empty segments
1178        segments.retain(|r| r.start < r.end);
1179        // Sort by start position
1180        segments.sort_by_key(|r| r.start);
1181        // Merge overlapping or adjacent segments
1182        let mut merged: Vec<std::ops::Range<usize>> = Vec::with_capacity(segments.len());
1183        for seg in segments {
1184            if let Some(last) = merged.last_mut() {
1185                if seg.start <= last.end {
1186                    // Overlapping or adjacent -- extend
1187                    last.end = last.end.max(seg.end);
1188                    continue;
1189                }
1190            }
1191            merged.push(seg);
1192        }
1193        Self { segments: merged }
1194    }
1195
1196    /// Create from a single contiguous span.
1197    #[must_use]
1198    #[allow(clippy::single_range_in_vec_init)] // Intentional: contiguous is special case of discontinuous
1199    pub fn contiguous(start: usize, end: usize) -> Self {
1200        Self {
1201            segments: vec![start..end],
1202        }
1203    }
1204
1205    /// Number of segments.
1206    #[must_use]
1207    pub fn num_segments(&self) -> usize {
1208        self.segments.len()
1209    }
1210
1211    /// True if this spans multiple non-adjacent regions.
1212    #[must_use]
1213    pub fn is_discontinuous(&self) -> bool {
1214        self.segments.len() > 1
1215    }
1216
1217    /// True if this is a single contiguous span.
1218    #[must_use]
1219    pub fn is_contiguous(&self) -> bool {
1220        self.segments.len() <= 1
1221    }
1222
1223    /// Get the segments.
1224    #[must_use]
1225    pub fn segments(&self) -> &[std::ops::Range<usize>] {
1226        &self.segments
1227    }
1228
1229    /// Get the overall bounding range (start of first to end of last).
1230    #[must_use]
1231    pub fn bounding_range(&self) -> Option<std::ops::Range<usize>> {
1232        if self.segments.is_empty() {
1233            return None;
1234        }
1235        let start = self.segments.first()?.start;
1236        let end = self.segments.last()?.end;
1237        Some(start..end)
1238    }
1239
1240    /// Total character length (sum of all segments).
1241    ///
1242    #[must_use]
1243    pub fn total_len(&self) -> usize {
1244        self.segments.iter().map(|r| r.end - r.start).sum()
1245    }
1246
1247    /// Extract text from each segment and join with separator.
1248    #[must_use]
1249    pub fn extract_text(&self, text: &str, separator: &str) -> String {
1250        self.segments
1251            .iter()
1252            .map(|r| {
1253                let start = r.start;
1254                let len = r.end.saturating_sub(r.start);
1255                text.chars().skip(start).take(len).collect::<String>()
1256            })
1257            .collect::<Vec<_>>()
1258            .join(separator)
1259    }
1260
1261    /// Check if a character position falls within any segment.
1262    ///
1263    /// # Arguments
1264    ///
1265    /// * `pos` - Character offset to check (Unicode scalar value index)
1266    ///
1267    /// # Returns
1268    ///
1269    /// `true` if the character position falls within any segment of this span.
1270    #[must_use]
1271    pub fn contains(&self, pos: usize) -> bool {
1272        self.segments.iter().any(|r| r.contains(&pos))
1273    }
1274
1275    /// Convert to a regular Span (uses bounding range, loses discontinuity info).
1276    #[must_use]
1277    pub fn to_span(&self) -> Option<Span> {
1278        self.bounding_range().map(|r| Span::Text {
1279            start: r.start,
1280            end: r.end,
1281        })
1282    }
1283}
1284
1285impl From<std::ops::Range<usize>> for DiscontinuousSpan {
1286    fn from(range: std::ops::Range<usize>) -> Self {
1287        Self::contiguous(range.start, range.end)
1288    }
1289}
1290
1291impl Default for Span {
1292    fn default() -> Self {
1293        Self::Text { start: 0, end: 0 }
1294    }
1295}
1296
1297// ============================================================================
1298// Hierarchical Confidence (Coarse-to-Fine)
1299// ============================================================================
1300
1301/// Hierarchical confidence scores for coarse-to-fine extraction.
1302///
1303/// Research (HiNet, InfoHier) shows that extraction benefits from
1304/// decomposed confidence:
1305/// - **Linkage**: "Is there ANY entity here?" (binary, fast filter)
1306/// - **Type**: "What type is it?" (fine-grained classification)
1307/// - **Boundary**: "Where exactly does it start/end?" (span refinement)
1308#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
1309pub struct HierarchicalConfidence {
1310    /// Coarse: probability that this span contains ANY entity (0.0-1.0)
1311    /// Used for early filtering in the TPLinker "handshaking" matrix.
1312    pub linkage: Confidence,
1313    /// Fine: probability that the type classification is correct (0.0-1.0)
1314    pub type_score: Confidence,
1315    /// Boundary: confidence in the exact span boundaries (0.0-1.0)
1316    /// Low for entities with fuzzy boundaries (e.g., "the CEO" vs "CEO")
1317    pub boundary: Confidence,
1318}
1319
1320impl HierarchicalConfidence {
1321    /// Create hierarchical confidence with all scores.
1322    ///
1323    /// Accepts any type convertible to `Confidence` (f32, f64, Confidence).
1324    /// Out-of-range values are clamped to [0.0, 1.0].
1325    #[must_use]
1326    pub fn new(
1327        linkage: impl Into<Confidence>,
1328        type_score: impl Into<Confidence>,
1329        boundary: impl Into<Confidence>,
1330    ) -> Self {
1331        Self {
1332            linkage: linkage.into(),
1333            type_score: type_score.into(),
1334            boundary: boundary.into(),
1335        }
1336    }
1337
1338    /// Create from a single confidence score (legacy compatibility).
1339    /// Assigns same score to all levels.
1340    #[must_use]
1341    pub fn from_single(confidence: impl Into<Confidence>) -> Self {
1342        let c = confidence.into();
1343        Self {
1344            linkage: c,
1345            type_score: c,
1346            boundary: c,
1347        }
1348    }
1349
1350    /// Calculate combined confidence (geometric mean).
1351    /// Geometric mean penalizes low scores more than arithmetic mean.
1352    #[must_use]
1353    pub fn combined(&self) -> Confidence {
1354        let product = self.linkage.value() * self.type_score.value() * self.boundary.value();
1355        Confidence::new(product.powf(1.0 / 3.0))
1356    }
1357
1358    /// Calculate combined confidence as f64 for legacy compatibility.
1359    #[must_use]
1360    pub fn as_f64(&self) -> f64 {
1361        self.combined().value()
1362    }
1363
1364    /// Check if passes minimum threshold at all levels.
1365    #[must_use]
1366    pub fn passes_threshold(&self, linkage_min: f64, type_min: f64, boundary_min: f64) -> bool {
1367        self.linkage >= linkage_min && self.type_score >= type_min && self.boundary >= boundary_min
1368    }
1369}
1370
1371impl Default for HierarchicalConfidence {
1372    fn default() -> Self {
1373        Self {
1374            linkage: Confidence::ONE,
1375            type_score: Confidence::ONE,
1376            boundary: Confidence::ONE,
1377        }
1378    }
1379}
1380
1381impl From<f64> for HierarchicalConfidence {
1382    fn from(confidence: f64) -> Self {
1383        Self::from_single(confidence)
1384    }
1385}
1386
1387impl From<f32> for HierarchicalConfidence {
1388    fn from(confidence: f32) -> Self {
1389        Self::from_single(confidence)
1390    }
1391}
1392
1393impl From<Confidence> for HierarchicalConfidence {
1394    fn from(confidence: Confidence) -> Self {
1395        Self::from_single(confidence)
1396    }
1397}
1398
1399// ============================================================================
1400// Ragged Batch (ModernBERT Unpadding)
1401// ============================================================================
1402
1403/// A ragged (unpadded) batch for efficient ModernBERT inference.
1404///
1405/// ModernBERT achieves its speed advantage by avoiding padding tokens entirely.
1406/// Instead of `[batch, max_seq_len]`, it uses a single contiguous 1D sequence
1407/// with offset indices to track document boundaries.
1408///
1409/// # Memory Layout
1410///
1411/// ```text
1412/// Traditional (padded):
1413/// [doc1_tok1, doc1_tok2, PAD, PAD, PAD]  <- wasted compute
1414/// [doc2_tok1, doc2_tok2, doc2_tok3, PAD, PAD]
1415///
1416/// Ragged (unpadded):
1417/// [doc1_tok1, doc1_tok2, doc2_tok1, doc2_tok2, doc2_tok3]
1418/// cumulative_offsets: [0, 2, 5]  <- doc1 is [0..2], doc2 is [2..5]
1419/// ```
1420#[derive(Debug, Clone)]
1421pub struct RaggedBatch {
1422    /// Token IDs flattened into a single contiguous array.
1423    /// Shape: `[total_tokens]` (1D, no padding)
1424    pub token_ids: Vec<u32>,
1425    /// Cumulative sequence lengths.
1426    /// Length: batch_size + 1
1427    /// Document i spans tokens \[offsets\[i\]..offsets\[i+1\])
1428    pub cumulative_offsets: Vec<u32>,
1429    /// Maximum sequence length in this batch (for kernel bounds).
1430    pub max_seq_len: usize,
1431}
1432
1433impl RaggedBatch {
1434    /// Create a new ragged batch from sequences.
1435    pub fn from_sequences(sequences: &[Vec<u32>]) -> Self {
1436        let total_tokens: usize = sequences.iter().map(|s| s.len()).sum();
1437        let mut token_ids = Vec::with_capacity(total_tokens);
1438        let mut cumulative_offsets = Vec::with_capacity(sequences.len() + 1);
1439        let mut max_seq_len = 0;
1440
1441        cumulative_offsets.push(0);
1442        for seq in sequences {
1443            token_ids.extend_from_slice(seq);
1444            // Check for overflow: u32::MAX is 4,294,967,295
1445            // If token_ids.len() exceeds this, we'll truncate (which is a bug)
1446            // but in practice, this is unlikely for reasonable batch sizes
1447            let len = token_ids.len();
1448            if len > u32::MAX as usize {
1449                // This would overflow - use saturating cast to prevent panic
1450                // but log a warning as this indicates a problem
1451                log::warn!(
1452                    "Token count {} exceeds u32::MAX, truncating to {}",
1453                    len,
1454                    u32::MAX
1455                );
1456                cumulative_offsets.push(u32::MAX);
1457            } else {
1458                cumulative_offsets.push(len as u32);
1459            }
1460            max_seq_len = max_seq_len.max(seq.len());
1461        }
1462
1463        Self {
1464            token_ids,
1465            cumulative_offsets,
1466            max_seq_len,
1467        }
1468    }
1469
1470    /// Get the number of documents in this batch.
1471    #[must_use]
1472    pub fn batch_size(&self) -> usize {
1473        self.cumulative_offsets.len().saturating_sub(1)
1474    }
1475
1476    /// Get the total number of tokens (no padding).
1477    #[must_use]
1478    pub fn total_tokens(&self) -> usize {
1479        self.token_ids.len()
1480    }
1481
1482    /// Get token range for a specific document.
1483    #[must_use]
1484    pub fn doc_range(&self, doc_idx: usize) -> Option<std::ops::Range<usize>> {
1485        if doc_idx + 1 < self.cumulative_offsets.len() {
1486            let start = self.cumulative_offsets[doc_idx] as usize;
1487            let end = self.cumulative_offsets[doc_idx + 1] as usize;
1488            Some(start..end)
1489        } else {
1490            None
1491        }
1492    }
1493
1494    /// Get tokens for a specific document.
1495    #[must_use]
1496    pub fn doc_tokens(&self, doc_idx: usize) -> Option<&[u32]> {
1497        self.doc_range(doc_idx).map(|r| &self.token_ids[r])
1498    }
1499
1500    /// Calculate memory saved vs padded batch.
1501    #[must_use]
1502    pub fn padding_savings(&self) -> f64 {
1503        let padded_size = self.batch_size() * self.max_seq_len;
1504        if padded_size == 0 {
1505            return 0.0;
1506        }
1507        1.0 - (self.total_tokens() as f64 / padded_size as f64)
1508    }
1509}
1510
1511// ============================================================================
1512// Span Candidate Generation
1513// ============================================================================
1514
/// One candidate span considered during entity extraction.
///
/// GLiNER/bi-encoder systems enumerate every possible span up to a maximum
/// width and score each against entity type embeddings.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct SpanCandidate {
    /// Document index in the batch
    pub doc_idx: u32,
    /// Start token index (within the document)
    pub start: u32,
    /// End token index (exclusive)
    pub end: u32,
}

impl SpanCandidate {
    /// Build a candidate covering tokens `[start, end)` of document `doc_idx`.
    #[must_use]
    pub const fn new(doc_idx: u32, start: u32, end: u32) -> Self {
        Self { doc_idx, start, end }
    }

    /// Span width in tokens; 0 when `end` does not exceed `start`.
    #[must_use]
    pub const fn width(&self) -> u32 {
        match self.end.checked_sub(self.start) {
            Some(token_count) => token_count,
            None => 0,
        }
    }
}
1546
1547/// Generate all valid span candidates for a ragged batch.
1548///
1549/// This is the "gnarly" operation in GLiNER - efficiently enumerating
1550/// all valid spans without O(N^2) memory allocation.
1551pub fn generate_span_candidates(batch: &RaggedBatch, max_width: usize) -> Vec<SpanCandidate> {
1552    let mut candidates = Vec::new();
1553
1554    for doc_idx in 0..batch.batch_size() {
1555        if let Some(range) = batch.doc_range(doc_idx) {
1556            let doc_len = range.len();
1557            // Generate all spans [i, j) where j - i <= max_width
1558            for start in 0..doc_len {
1559                let max_end = (start + max_width).min(doc_len);
1560                for end in (start + 1)..=max_end {
1561                    candidates.push(SpanCandidate::new(doc_idx as u32, start as u32, end as u32));
1562                }
1563            }
1564        }
1565    }
1566
1567    candidates
1568}
1569
1570/// Generate span candidates with early filtering.
1571///
1572/// Uses a linkage mask to skip low-probability spans (TPLinker optimization).
1573pub fn generate_filtered_candidates(
1574    batch: &RaggedBatch,
1575    max_width: usize,
1576    linkage_mask: &[f32],
1577    threshold: f32,
1578) -> Vec<SpanCandidate> {
1579    let mut candidates = Vec::new();
1580    let mut mask_idx = 0;
1581
1582    for doc_idx in 0..batch.batch_size() {
1583        if let Some(range) = batch.doc_range(doc_idx) {
1584            let doc_len = range.len();
1585            for start in 0..doc_len {
1586                let max_end = (start + max_width).min(doc_len);
1587                for end in (start + 1)..=max_end {
1588                    // Only include if linkage probability exceeds threshold
1589                    if mask_idx < linkage_mask.len() && linkage_mask[mask_idx] >= threshold {
1590                        candidates.push(SpanCandidate::new(
1591                            doc_idx as u32,
1592                            start as u32,
1593                            end as u32,
1594                        ));
1595                    }
1596                    mask_idx += 1;
1597                }
1598            }
1599        }
1600    }
1601
1602    candidates
1603}
1604
1605// ============================================================================
1606// Entity (Extended)
1607// ============================================================================
1608
1609/// A recognized named entity or relation trigger.
1610///
1611/// # Entity Structure
1612///
1613/// ```text
1614/// "Contact John at john@example.com on Jan 15"
1615///          ^^^^    ^^^^^^^^^^^^^^^^    ^^^^^^
1616///          PER     EMAIL               DATE
1617///          |       |                   |
1618///          Named   Contact             Temporal
1619///          (ML)    (Pattern)           (Pattern)
1620/// ```
1621///
1622/// # Core Fields (Stable API)
1623///
1624/// - `text`, `entity_type`, `start`, `end`, `confidence` — always present
1625/// - `normalized`, `provenance` — commonly used optional fields
1626/// - `kb_id`, `canonical_id` — knowledge graph and coreference support
1627///
1628/// # Extended Fields (Research/Experimental)
1629///
1630/// The following fields support advanced research applications but may evolve:
1631///
1632/// | Field | Purpose | Status |
1633/// |-------|---------|--------|
1634/// | `visual_span` | Multi-modal (ColPali) extraction | Experimental |
1635/// | `discontinuous_span` | W2NER non-contiguous entities | Experimental |
1636/// | `hierarchical_confidence` | Coarse-to-fine NER | Experimental |
1637///
1638/// These fields are `#[serde(skip_serializing_if = "Option::is_none")]` so they
1639/// have no overhead when unused.
1640///
1641/// # Knowledge Graph Support
1642///
1643/// For GraphRAG and coreference resolution, entities support:
1644/// - `kb_id`: External knowledge base identifier (e.g., Wikidata Q-ID)
1645/// - `canonical_id`: Local coreference cluster ID (links "John" and "he")
1646///
1647/// # Normalization
1648///
1649/// Entities can have a normalized form for downstream processing:
1650/// - Dates: "Jan 15" → "2024-01-15" (ISO 8601)
1651/// - Money: "$1.5M" → "1500000 USD"
1652/// - Locations: "NYC" → "New York City"
// NOTE: `Deserialize` is intentionally absent from this derive list. The manual
// `impl Deserialize for Entity` in this file rebuilds values through
// `Entity::new`, so inverted spans are normalized when loading.
#[derive(Debug, Clone, Serialize)]
pub struct Entity {
    /// Entity text (surface form as it appears in source)
    pub text: String,
    /// Entity type classification
    pub entity_type: EntityType,
    /// Start position (character offset, NOT byte offset).
    ///
    /// For Unicode text, character offsets differ from byte offsets.
    /// Use `anno::offset::bytes_to_chars` to convert if needed.
    ///
    /// Private: read via [`Entity::start()`], write via [`Entity::set_start()`].
    /// [`Entity::new`] swaps inverted bounds; the setter stores the value as
    /// given without re-checking its ordering against `end`.
    start: usize,
    /// End position (character offset, exclusive).
    ///
    /// For Unicode text, character offsets differ from byte offsets.
    /// Use `anno::offset::bytes_to_chars` to convert if needed.
    ///
    /// Private: read via [`Entity::end()`], write via [`Entity::set_end()`].
    /// The setter stores the value as given without re-checking its ordering
    /// against `start`.
    end: usize,
    /// Confidence score (0.0-1.0, calibrated).
    ///
    /// Construction via [`Confidence::new`] clamps to `[0.0, 1.0]`.
    /// Use `.value()` or `Into<f64>` to extract the raw score.
    pub confidence: Confidence,
    /// Normalized/canonical form (e.g., "Jan 15" → "2024-01-15").
    ///
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub normalized: Option<String>,
    /// Provenance: which backend/method produced this entity.
    ///
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub provenance: Option<Provenance>,
    /// External knowledge base ID (e.g., "Q7186" for Marie Curie in Wikidata).
    /// Used for entity linking and GraphRAG applications.
    ///
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub kb_id: Option<String>,
    /// Local coreference cluster ID.
    /// Multiple mentions with the same `canonical_id` refer to the same entity.
    /// Example: "Marie Curie" and "she" might share `canonical_id = CanonicalId(42)`.
    ///
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub canonical_id: Option<super::types::CanonicalId>,
    /// Hierarchical confidence (coarse-to-fine).
    /// Provides linkage, type, and boundary scores separately.
    ///
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub hierarchical_confidence: Option<HierarchicalConfidence>,
    /// Visual span for multi-modal (ColPali) extraction.
    /// When set, provides bounding box location in addition to text offsets.
    ///
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub visual_span: Option<Span>,
    /// Discontinuous span for non-contiguous entity mentions (W2NER support).
    /// When set, overrides `start`/`end` for length calculations
    /// (see [`Entity::total_len`]).
    /// Example: "New York and LA \[airports\]" where "airports" modifies both.
    ///
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub discontinuous_span: Option<DiscontinuousSpan>,
    /// Mention type classification (Proper, Nominal, Pronominal, Zero).
    ///
    /// Classifies the referring expression type for coreference resolution.
    /// Follows the Accessibility Hierarchy (Ariel 1990):
    /// Proper > Nominal > Pronominal > Zero.
    ///
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub mention_type: Option<MentionType>,
}
1714
1715impl<'de> Deserialize<'de> for Entity {
1716    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
1717        /// Helper that mirrors Entity's fields so serde can derive the parsing,
1718        /// then we route through `Entity::new()` to enforce invariants (e.g.
1719        /// inverted span normalization).
1720        #[derive(Deserialize)]
1721        struct EntityHelper {
1722            text: String,
1723            entity_type: EntityType,
1724            start: usize,
1725            end: usize,
1726            confidence: Confidence,
1727            #[serde(default)]
1728            normalized: Option<String>,
1729            #[serde(default)]
1730            provenance: Option<Provenance>,
1731            #[serde(default)]
1732            kb_id: Option<String>,
1733            #[serde(default)]
1734            canonical_id: Option<super::types::CanonicalId>,
1735            #[serde(default)]
1736            hierarchical_confidence: Option<HierarchicalConfidence>,
1737            #[serde(default)]
1738            visual_span: Option<Span>,
1739            #[serde(default)]
1740            discontinuous_span: Option<DiscontinuousSpan>,
1741            #[serde(default)]
1742            mention_type: Option<MentionType>,
1743        }
1744
1745        let h = EntityHelper::deserialize(deserializer)?;
1746        let mut entity = Entity::new(h.text, h.entity_type, h.start, h.end, h.confidence);
1747        entity.normalized = h.normalized;
1748        entity.provenance = h.provenance;
1749        entity.kb_id = h.kb_id;
1750        entity.canonical_id = h.canonical_id;
1751        entity.hierarchical_confidence = h.hierarchical_confidence;
1752        entity.visual_span = h.visual_span;
1753        entity.discontinuous_span = h.discontinuous_span;
1754        entity.mention_type = h.mention_type;
1755        Ok(entity)
1756    }
1757}
1758
1759impl Entity {
1760    /// Create a new entity.
1761    ///
1762    /// ```
1763    /// use anno_core::{Entity, EntityType};
1764    ///
1765    /// let e = Entity::new("Berlin", EntityType::Location, 10, 16, 0.95);
1766    /// assert_eq!(e.text, "Berlin");
1767    /// assert_eq!(e.entity_type, EntityType::Location);
1768    /// assert_eq!((e.start(), e.end()), (10, 16));
1769    /// ```
1770    #[must_use]
1771    pub fn new(
1772        text: impl Into<String>,
1773        entity_type: EntityType,
1774        start: usize,
1775        end: usize,
1776        confidence: impl Into<Confidence>,
1777    ) -> Self {
1778        // Normalize inverted spans (same as CharSpan::new)
1779        let (start, end) = if start > end {
1780            (end, start)
1781        } else {
1782            (start, end)
1783        };
1784        Self {
1785            text: text.into(),
1786            entity_type,
1787            start,
1788            end,
1789            confidence: confidence.into(),
1790            normalized: None,
1791            provenance: None,
1792            kb_id: None,
1793            canonical_id: None,
1794            hierarchical_confidence: None,
1795            visual_span: None,
1796            discontinuous_span: None,
1797            mention_type: None,
1798        }
1799    }
1800
1801    /// Start character offset (inclusive, 0-indexed).
1802    #[inline]
1803    #[must_use]
1804    pub fn start(&self) -> usize {
1805        self.start
1806    }
1807
1808    /// End character offset (exclusive).
1809    #[inline]
1810    #[must_use]
1811    pub fn end(&self) -> usize {
1812        self.end
1813    }
1814
1815    /// Set the start offset. For use in post-processing pipelines.
1816    #[inline]
1817    pub fn set_start(&mut self, start: usize) {
1818        self.start = start;
1819    }
1820
1821    /// Set the end offset. For use in post-processing pipelines.
1822    #[inline]
1823    pub fn set_end(&mut self, end: usize) {
1824        self.end = end;
1825    }
1826
1827    /// Create a new entity with provenance information.
1828    #[must_use]
1829    pub fn with_provenance(
1830        text: impl Into<String>,
1831        entity_type: EntityType,
1832        start: usize,
1833        end: usize,
1834        confidence: impl Into<Confidence>,
1835        provenance: Provenance,
1836    ) -> Self {
1837        let (start, end) = if start > end {
1838            (end, start)
1839        } else {
1840            (start, end)
1841        };
1842        Self {
1843            text: text.into(),
1844            entity_type,
1845            start,
1846            end,
1847            confidence: confidence.into(),
1848            normalized: None,
1849            provenance: Some(provenance),
1850            kb_id: None,
1851            canonical_id: None,
1852            hierarchical_confidence: None,
1853            visual_span: None,
1854            discontinuous_span: None,
1855            mention_type: None,
1856        }
1857    }
1858
1859    /// Create an entity with hierarchical confidence scores.
1860    #[must_use]
1861    pub fn with_hierarchical_confidence(
1862        text: impl Into<String>,
1863        entity_type: EntityType,
1864        start: usize,
1865        end: usize,
1866        confidence: HierarchicalConfidence,
1867    ) -> Self {
1868        let (start, end) = if start > end {
1869            (end, start)
1870        } else {
1871            (start, end)
1872        };
1873        Self {
1874            text: text.into(),
1875            entity_type,
1876            start,
1877            end,
1878            confidence: Confidence::new(confidence.as_f64()),
1879            normalized: None,
1880            provenance: None,
1881            kb_id: None,
1882            canonical_id: None,
1883            hierarchical_confidence: Some(confidence),
1884            visual_span: None,
1885            discontinuous_span: None,
1886            mention_type: None,
1887        }
1888    }
1889
1890    /// Create an entity from a visual bounding box (ColPali multi-modal).
1891    #[must_use]
1892    pub fn from_visual(
1893        text: impl Into<String>,
1894        entity_type: EntityType,
1895        bbox: Span,
1896        confidence: impl Into<Confidence>,
1897    ) -> Self {
1898        Self {
1899            text: text.into(),
1900            entity_type,
1901            start: 0,
1902            end: 0,
1903            confidence: confidence.into(),
1904            normalized: None,
1905            provenance: None,
1906            kb_id: None,
1907            canonical_id: None,
1908            hierarchical_confidence: None,
1909            visual_span: Some(bbox),
1910            discontinuous_span: None,
1911            mention_type: None,
1912        }
1913    }
1914
1915    /// Create an entity with default confidence (1.0).
1916    #[must_use]
1917    pub fn with_type(
1918        text: impl Into<String>,
1919        entity_type: EntityType,
1920        start: usize,
1921        end: usize,
1922    ) -> Self {
1923        Self::new(text, entity_type, start, end, 1.0)
1924    }
1925
1926    /// Link this entity to an external knowledge base.
1927    ///
1928    /// # Examples
1929    /// ```
1930    /// use anno_core::{Entity, EntityType};
1931    /// let mut e = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
1932    /// e.link_to_kb("Q7186");
1933    /// assert_eq!(e.kb_id.as_deref(), Some("Q7186"));
1934    /// ```
1935    pub fn link_to_kb(&mut self, kb_id: impl Into<String>) {
1936        self.kb_id = Some(kb_id.into());
1937    }
1938
1939    /// Assign this entity to a coreference cluster.
1940    ///
1941    /// Entities with the same `canonical_id` refer to the same real-world entity.
1942    pub fn set_canonical(&mut self, canonical_id: impl Into<super::types::CanonicalId>) {
1943        self.canonical_id = Some(canonical_id.into());
1944    }
1945
1946    /// Builder-style method to set canonical ID.
1947    ///
1948    /// # Example
1949    /// ```
1950    /// use anno_core::{CanonicalId, Entity, EntityType};
1951    /// let entity = Entity::new("John", EntityType::Person, 0, 4, 0.9)
1952    ///     .with_canonical_id(42);
1953    /// assert_eq!(entity.canonical_id, Some(CanonicalId::new(42)));
1954    /// ```
1955    #[must_use]
1956    pub fn with_canonical_id(mut self, canonical_id: impl Into<super::types::CanonicalId>) -> Self {
1957        self.canonical_id = Some(canonical_id.into());
1958        self
1959    }
1960
1961    /// Check if this entity is linked to a knowledge base.
1962    #[must_use]
1963    pub fn is_linked(&self) -> bool {
1964        self.kb_id.is_some()
1965    }
1966
1967    /// Check if this entity has coreference information.
1968    #[must_use]
1969    pub fn has_coreference(&self) -> bool {
1970        self.canonical_id.is_some()
1971    }
1972
1973    /// Check if this entity has a discontinuous span.
1974    ///
1975    /// Discontinuous entities span non-contiguous text regions.
1976    /// Example: "New York and LA airports" contains "New York airports"
1977    /// as a discontinuous entity.
1978    #[must_use]
1979    pub fn is_discontinuous(&self) -> bool {
1980        self.discontinuous_span
1981            .as_ref()
1982            .map(|s| s.is_discontinuous())
1983            .unwrap_or(false)
1984    }
1985
1986    /// Get the discontinuous segments if present.
1987    ///
1988    /// Returns `None` if this is a contiguous entity.
1989    #[must_use]
1990    pub fn discontinuous_segments(&self) -> Option<Vec<std::ops::Range<usize>>> {
1991        self.discontinuous_span
1992            .as_ref()
1993            .filter(|s| s.is_discontinuous())
1994            .map(|s| s.segments().to_vec())
1995    }
1996
1997    /// Set a discontinuous span for this entity.
1998    ///
1999    /// This is used by W2NER and similar models that detect non-contiguous mentions.
2000    pub fn set_discontinuous_span(&mut self, span: DiscontinuousSpan) {
2001        // Update start/end to match the bounding range
2002        if let Some(bounding) = span.bounding_range() {
2003            self.start = bounding.start;
2004            self.end = bounding.end;
2005        }
2006        self.discontinuous_span = Some(span);
2007    }
2008
2009    /// Get the total length covered by this entity, in **characters**.
2010    ///
2011    /// - **Contiguous**: `end - start`
2012    /// - **Discontinuous**: sum of segment lengths
2013    ///
2014    /// This is intentionally consistent: all offsets in `anno::core` entity spans
2015    /// are **character offsets** (Unicode scalar values), not byte offsets.
2016    #[must_use]
2017    pub fn total_len(&self) -> usize {
2018        if let Some(ref span) = self.discontinuous_span {
2019            span.segments().iter().map(|r| r.end - r.start).sum()
2020        } else {
2021            self.end.saturating_sub(self.start)
2022        }
2023    }
2024
2025    /// Set the normalized form for this entity.
2026    ///
2027    /// # Examples
2028    ///
2029    /// ```rust
2030    /// use anno_core::{Entity, EntityType};
2031    ///
2032    /// let mut entity = Entity::new("Jan 15", EntityType::Date, 0, 6, 0.95);
2033    /// entity.set_normalized("2024-01-15");
2034    /// assert_eq!(entity.normalized.as_deref(), Some("2024-01-15"));
2035    /// ```
2036    pub fn set_normalized(&mut self, normalized: impl Into<String>) {
2037        self.normalized = Some(normalized.into());
2038    }
2039
2040    /// Get the normalized form, or the original text if not normalized.
2041    #[must_use]
2042    pub fn normalized_or_text(&self) -> &str {
2043        self.normalized.as_deref().unwrap_or(&self.text)
2044    }
2045
2046    /// Get the extraction method, if known.
2047    #[must_use]
2048    pub fn method(&self) -> ExtractionMethod {
2049        self.provenance
2050            .as_ref()
2051            .map_or(ExtractionMethod::Unknown, |p| p.method)
2052    }
2053
2054    /// Get the source backend name, if known.
2055    #[must_use]
2056    pub fn source(&self) -> Option<&str> {
2057        self.provenance.as_ref().map(|p| p.source.as_ref())
2058    }
2059
2060    /// Get the entity category.
2061    #[must_use]
2062    pub fn category(&self) -> EntityCategory {
2063        self.entity_type.category()
2064    }
2065
2066    /// Returns true if this entity was detected via patterns (not ML).
2067    #[must_use]
2068    pub fn is_structured(&self) -> bool {
2069        self.entity_type.pattern_detectable()
2070    }
2071
2072    /// Returns true if this entity required ML for detection.
2073    #[must_use]
2074    pub fn is_named(&self) -> bool {
2075        self.entity_type.requires_ml()
2076    }
2077
2078    /// Check if this entity overlaps with another.
2079    #[must_use]
2080    pub fn overlaps(&self, other: &Entity) -> bool {
2081        !(self.end <= other.start || other.end <= self.start)
2082    }
2083
2084    /// Calculate overlap ratio (IoU) with another entity.
2085    #[must_use]
2086    pub fn overlap_ratio(&self, other: &Entity) -> f64 {
2087        let intersection_start = self.start.max(other.start);
2088        let intersection_end = self.end.min(other.end);
2089
2090        if intersection_start >= intersection_end {
2091            return 0.0;
2092        }
2093
2094        let intersection = (intersection_end - intersection_start) as f64;
2095        let union = ((self.end - self.start) + (other.end - other.start)
2096            - (intersection_end - intersection_start)) as f64;
2097
2098        if union == 0.0 {
2099            return 1.0;
2100        }
2101
2102        intersection / union
2103    }
2104
2105    /// Set hierarchical confidence scores.
2106    pub fn set_hierarchical_confidence(&mut self, confidence: HierarchicalConfidence) {
2107        self.confidence = Confidence::new(confidence.as_f64());
2108        self.hierarchical_confidence = Some(confidence);
2109    }
2110
2111    /// Get the linkage confidence (coarse filter score).
2112    #[must_use]
2113    pub fn linkage_confidence(&self) -> Confidence {
2114        self.hierarchical_confidence
2115            .map_or(self.confidence, |h| h.linkage)
2116    }
2117
2118    /// Get the type classification confidence.
2119    #[must_use]
2120    pub fn type_confidence(&self) -> Confidence {
2121        self.hierarchical_confidence
2122            .map_or(self.confidence, |h| h.type_score)
2123    }
2124
2125    /// Get the boundary confidence.
2126    #[must_use]
2127    pub fn boundary_confidence(&self) -> Confidence {
2128        self.hierarchical_confidence
2129            .map_or(self.confidence, |h| h.boundary)
2130    }
2131
2132    /// Check if this entity has visual location (multi-modal).
2133    #[must_use]
2134    pub fn is_visual(&self) -> bool {
2135        self.visual_span.is_some()
2136    }
2137
2138    /// Get the text span (start, end).
2139    #[must_use]
2140    pub const fn text_span(&self) -> (usize, usize) {
2141        (self.start, self.end)
2142    }
2143
2144    /// Get the span length.
2145    #[must_use]
2146    pub const fn span_len(&self) -> usize {
2147        self.end.saturating_sub(self.start)
2148    }
2149
2150    /// Create a unified TextSpan with both byte and char offsets.
2151    ///
2152    /// This is useful when you need to work with both offset systems.
2153    /// The `text` parameter must be the original source text from which
2154    /// this entity was extracted.
2155    ///
2156    /// # Arguments
2157    /// * `source_text` - The original text (needed to compute byte offsets)
2158    ///
2159    /// # Returns
2160    /// A TextSpan with both byte and char offsets.
2161    ///
2162    /// # Note
2163    ///
2164    /// This method requires the offset conversion utilities from the `anno` crate.
2165    /// Use `anno::offset::char_to_byte_offsets()` directly for now.
2166    ///
2167    /// # Example
2168    /// ```rust,ignore
2169    /// use anno_core::{Entity, EntityType};
2170    ///
2171    /// let (byte_start, byte_end) = char_to_byte_offsets(text, entity.start(), entity.end());
2172    /// ```
2173    /// Set visual span for multi-modal extraction.
2174    pub fn set_visual_span(&mut self, span: Span) {
2175        self.visual_span = Some(span);
2176    }
2177
2178    /// Safely extract text from source using character offsets.
2179    ///
2180    /// Entity stores character offsets, not byte offsets. This method
2181    /// correctly extracts text by iterating over characters.
2182    ///
2183    /// # Arguments
2184    /// * `source_text` - The original text from which this entity was extracted
2185    ///
2186    /// # Returns
2187    /// The extracted text, or empty string if offsets are invalid
2188    ///
2189    /// # Example
2190    /// ```rust
2191    /// use anno_core::{Entity, EntityType};
2192    ///
2193    /// let text = "Hello, 日本!";
2194    /// let entity = Entity::new("日本", EntityType::Location, 7, 9, 0.95);
2195    /// assert_eq!(entity.extract_text(text), "日本");
2196    /// ```
2197    #[must_use]
2198    pub fn extract_text(&self, source_text: &str) -> String {
2199        // Performance: Use cached length if available, but fallback to counting
2200        // For single entity extraction, this is fine. For batch operations,
2201        // use extract_text_with_len with pre-computed length.
2202        let char_count = source_text.chars().count();
2203        self.extract_text_with_len(source_text, char_count)
2204    }
2205
2206    /// Extract text with pre-computed text length (performance optimization).
2207    ///
2208    /// Use this when validating/clamping multiple entities from the same text
2209    /// to avoid recalculating `text.chars().count()` for each entity.
2210    ///
2211    /// # Arguments
2212    /// * `source_text` - The original text
2213    /// * `text_char_count` - Pre-computed character count (from `text.chars().count()`)
2214    ///
2215    /// # Returns
2216    /// The extracted text, or empty string if offsets are invalid
2217    #[must_use]
2218    pub fn extract_text_with_len(&self, source_text: &str, text_char_count: usize) -> String {
2219        if self.start >= text_char_count || self.end > text_char_count || self.start >= self.end {
2220            return String::new();
2221        }
2222        source_text
2223            .chars()
2224            .skip(self.start)
2225            .take(self.end - self.start)
2226            .collect()
2227    }
2228
2229    /// Create a builder for fluent entity construction.
2230    #[must_use]
2231    pub fn builder(text: impl Into<String>, entity_type: EntityType) -> EntityBuilder {
2232        EntityBuilder::new(text, entity_type)
2233    }
2234
2235    // =========================================================================
2236    // Validation Methods (Production Quality)
2237    // =========================================================================
2238
2239    /// Validate this entity against the source text.
2240    ///
2241    /// Returns a list of validation issues. Empty list means the entity is valid.
2242    ///
2243    /// # Checks Performed
2244    ///
2245    /// 1. **Span bounds**: `start < end`, both within text length
2246    /// 2. **Text match**: `text` matches the span in source
2247    /// 3. **Confidence range**: `confidence` in [0.0, 1.0]
2248    /// 4. **Type consistency**: Custom types have non-empty names
2249    /// 5. **Discontinuous consistency**: If present, segments are valid
2250    ///
2251    /// # Example
2252    ///
2253    /// ```rust
2254    /// use anno_core::{Entity, EntityType};
2255    ///
2256    /// let text = "John works at Apple";
2257    /// let entity = Entity::new("John", EntityType::Person, 0, 4, 0.95);
2258    ///
2259    /// let issues = entity.validate(text);
2260    /// assert!(issues.is_empty(), "Entity should be valid");
2261    ///
2262    /// // Invalid entity: span doesn't match text
2263    /// let bad = Entity::new("Jane", EntityType::Person, 0, 4, 0.95);
2264    /// let issues = bad.validate(text);
2265    /// assert!(!issues.is_empty(), "Entity text doesn't match span");
2266    /// ```
2267    #[must_use]
2268    pub fn validate(&self, source_text: &str) -> Vec<ValidationIssue> {
2269        // Performance: Calculate length once, delegate to optimized version
2270        let char_count = source_text.chars().count();
2271        self.validate_with_len(source_text, char_count)
2272    }
2273
2274    /// Validate entity with pre-computed text length (performance optimization).
2275    ///
2276    /// Use this when validating multiple entities from the same text to avoid
2277    /// recalculating `text.chars().count()` for each entity.
2278    ///
2279    /// # Arguments
2280    /// * `source_text` - The original text
2281    /// * `text_char_count` - Pre-computed character count (from `text.chars().count()`)
2282    ///
2283    /// # Returns
2284    /// Vector of validation issues (empty if valid)
2285    #[must_use]
2286    pub fn validate_with_len(
2287        &self,
2288        source_text: &str,
2289        text_char_count: usize,
2290    ) -> Vec<ValidationIssue> {
2291        let mut issues = Vec::new();
2292
2293        // 1. Span bounds
2294        if self.start >= self.end {
2295            issues.push(ValidationIssue::InvalidSpan {
2296                start: self.start,
2297                end: self.end,
2298                reason: "start must be less than end".to_string(),
2299            });
2300        }
2301
2302        if self.end > text_char_count {
2303            issues.push(ValidationIssue::SpanOutOfBounds {
2304                end: self.end,
2305                text_len: text_char_count,
2306            });
2307        }
2308
2309        // 2. Text match (only if span is valid)
2310        if self.start < self.end && self.end <= text_char_count {
2311            let actual = self.extract_text_with_len(source_text, text_char_count);
2312            if actual != self.text {
2313                issues.push(ValidationIssue::TextMismatch {
2314                    expected: self.text.clone(),
2315                    actual,
2316                    start: self.start,
2317                    end: self.end,
2318                });
2319            }
2320        }
2321
2322        // 3. Confidence range (now enforced by the Confidence type, so this is a no-op)
2323
2324        // 4. Type consistency
2325        if let EntityType::Custom { ref name, .. } = self.entity_type {
2326            if name.is_empty() {
2327                issues.push(ValidationIssue::InvalidType {
2328                    reason: "Custom entity type has empty name".to_string(),
2329                });
2330            }
2331        }
2332
2333        // 5. Discontinuous span consistency
2334        if let Some(ref disc_span) = self.discontinuous_span {
2335            for (i, seg) in disc_span.segments().iter().enumerate() {
2336                if seg.start >= seg.end {
2337                    issues.push(ValidationIssue::InvalidSpan {
2338                        start: seg.start,
2339                        end: seg.end,
2340                        reason: format!("discontinuous segment {} is invalid", i),
2341                    });
2342                }
2343                if seg.end > text_char_count {
2344                    issues.push(ValidationIssue::SpanOutOfBounds {
2345                        end: seg.end,
2346                        text_len: text_char_count,
2347                    });
2348                }
2349            }
2350        }
2351
2352        issues
2353    }
2354
2355    /// Check if this entity is valid against the source text.
2356    ///
2357    /// Convenience method that returns `true` if `validate()` returns empty.
2358    #[must_use]
2359    pub fn is_valid(&self, source_text: &str) -> bool {
2360        self.validate(source_text).is_empty()
2361    }
2362
2363    /// Validate a batch of entities efficiently.
2364    ///
2365    /// Returns a map of entity index -> validation issues.
2366    /// Only entities with issues are included.
2367    ///
2368    /// # Example
2369    ///
2370    /// ```rust
2371    /// use anno_core::{Entity, EntityType};
2372    ///
2373    /// let text = "John and Jane work at Apple";
2374    /// let entities = vec![
2375    ///     Entity::new("John", EntityType::Person, 0, 4, 0.95),
2376    ///     Entity::new("Wrong", EntityType::Person, 9, 13, 0.8),
2377    /// ];
2378    ///
2379    /// let issues = Entity::validate_batch(&entities, text);
2380    /// assert!(issues.is_empty() || issues.contains_key(&1)); // Second entity might fail
2381    /// ```
2382    #[must_use]
2383    pub fn validate_batch(
2384        entities: &[Entity],
2385        source_text: &str,
2386    ) -> std::collections::HashMap<usize, Vec<ValidationIssue>> {
2387        entities
2388            .iter()
2389            .enumerate()
2390            .filter_map(|(idx, entity)| {
2391                let issues = entity.validate(source_text);
2392                if issues.is_empty() {
2393                    None
2394                } else {
2395                    Some((idx, issues))
2396                }
2397            })
2398            .collect()
2399    }
2400}
2401
/// A single problem reported by entity validation.
#[derive(Debug, Clone, PartialEq)]
pub enum ValidationIssue {
    /// Span bounds are invalid (start >= end).
    InvalidSpan {
        /// Start position of the invalid span.
        start: usize,
        /// End position of the invalid span.
        end: usize,
        /// Description of why the span is invalid.
        reason: String,
    },
    /// Span extends beyond text length.
    SpanOutOfBounds {
        /// End position that exceeds the text.
        end: usize,
        /// Actual length of the text.
        text_len: usize,
    },
    /// Entity text doesn't match the span in source.
    TextMismatch {
        /// Text stored in the entity.
        expected: String,
        /// Text found at the span in source.
        actual: String,
        /// Start position of the span.
        start: usize,
        /// End position of the span.
        end: usize,
    },
    /// Confidence is outside [0.0, 1.0].
    InvalidConfidence {
        /// The invalid confidence value.
        value: f64,
    },
    /// Entity type is invalid.
    InvalidType {
        /// Description of why the type is invalid.
        reason: String,
    },
}

impl std::fmt::Display for ValidationIssue {
    /// Render the issue as a one-line, human-readable message.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidSpan { start, end, reason } => {
                write!(f, "Invalid span [{}, {}): {}", start, end, reason)
            }
            Self::SpanOutOfBounds { end, text_len } => {
                write!(f, "Span end {} exceeds text length {}", end, text_len)
            }
            Self::TextMismatch {
                expected: stored,
                actual: found,
                start,
                end,
            } => write!(
                f,
                "Text mismatch at [{}, {}): expected '{}', got '{}'",
                start, end, stored, found
            ),
            Self::InvalidConfidence { value } => {
                write!(f, "Confidence {} outside [0.0, 1.0]", value)
            }
            Self::InvalidType { reason } => write!(f, "Invalid entity type: {}", reason),
        }
    }
}
2474
/// Fluent builder for constructing entities with optional fields.
///
/// Obtained from [`Entity::builder`] or [`EntityBuilder::new`]. The fields
/// mirror those of [`Entity`] one-to-one.
///
/// # Example
///
/// ```rust
/// use anno_core::{Entity, EntityType, Provenance};
///
/// let entity = Entity::builder("Marie Curie", EntityType::Person)
///     .span(0, 11)
///     .confidence(0.95)
///     .kb_id("Q7186")
///     .provenance(Provenance::ml("bert", 0.95))
///     .build();
/// ```
#[derive(Debug, Clone)]
pub struct EntityBuilder {
    // Surface text and type, supplied to `EntityBuilder::new`.
    text: String,
    entity_type: EntityType,
    // Character offsets; `new` initializes them to `0` and the character
    // count of `text`.
    start: usize,
    end: usize,
    // Scalar confidence; `new` initializes it to `Confidence::ONE`.
    confidence: Confidence,
    // Optional fields, all `None` until the matching builder method is called.
    normalized: Option<String>,
    provenance: Option<Provenance>,
    kb_id: Option<String>,
    canonical_id: Option<super::types::CanonicalId>,
    hierarchical_confidence: Option<HierarchicalConfidence>,
    visual_span: Option<Span>,
    discontinuous_span: Option<DiscontinuousSpan>,
    mention_type: Option<MentionType>,
}
2505
2506impl EntityBuilder {
2507    /// Create a new builder.
2508    #[must_use]
2509    pub fn new(text: impl Into<String>, entity_type: EntityType) -> Self {
2510        let text = text.into();
2511        let end = text.chars().count();
2512        Self {
2513            text,
2514            entity_type,
2515            start: 0,
2516            end,
2517            confidence: Confidence::ONE,
2518            normalized: None,
2519            provenance: None,
2520            kb_id: None,
2521            canonical_id: None,
2522            hierarchical_confidence: None,
2523            visual_span: None,
2524            discontinuous_span: None,
2525            mention_type: None,
2526        }
2527    }
2528
2529    /// Set span offsets.
2530    #[must_use]
2531    pub const fn span(mut self, start: usize, end: usize) -> Self {
2532        self.start = start;
2533        self.end = end;
2534        self
2535    }
2536
2537    /// Set confidence score.
2538    #[must_use]
2539    pub fn confidence(mut self, confidence: impl Into<Confidence>) -> Self {
2540        self.confidence = confidence.into();
2541        self
2542    }
2543
2544    /// Set hierarchical confidence.
2545    #[must_use]
2546    pub fn hierarchical_confidence(mut self, confidence: HierarchicalConfidence) -> Self {
2547        self.confidence = Confidence::new(confidence.as_f64());
2548        self.hierarchical_confidence = Some(confidence);
2549        self
2550    }
2551
2552    /// Set normalized form.
2553    #[must_use]
2554    pub fn normalized(mut self, normalized: impl Into<String>) -> Self {
2555        self.normalized = Some(normalized.into());
2556        self
2557    }
2558
2559    /// Set provenance.
2560    #[must_use]
2561    pub fn provenance(mut self, provenance: Provenance) -> Self {
2562        self.provenance = Some(provenance);
2563        self
2564    }
2565
2566    /// Set knowledge base ID.
2567    #[must_use]
2568    pub fn kb_id(mut self, kb_id: impl Into<String>) -> Self {
2569        self.kb_id = Some(kb_id.into());
2570        self
2571    }
2572
2573    /// Set canonical (coreference) ID.
2574    #[must_use]
2575    pub const fn canonical_id(mut self, canonical_id: u64) -> Self {
2576        self.canonical_id = Some(super::types::CanonicalId::new(canonical_id));
2577        self
2578    }
2579
2580    /// Set visual span.
2581    #[must_use]
2582    pub fn visual_span(mut self, span: Span) -> Self {
2583        self.visual_span = Some(span);
2584        self
2585    }
2586
2587    /// Set discontinuous span for non-contiguous entities.
2588    ///
2589    /// This automatically updates `start` and `end` to the bounding range.
2590    #[must_use]
2591    pub fn discontinuous_span(mut self, span: DiscontinuousSpan) -> Self {
2592        // Update start/end to bounding range
2593        if let Some(bounding) = span.bounding_range() {
2594            self.start = bounding.start;
2595            self.end = bounding.end;
2596        }
2597        self.discontinuous_span = Some(span);
2598        self
2599    }
2600
2601    /// Set mention type classification.
2602    #[must_use]
2603    pub fn mention_type(mut self, mention_type: MentionType) -> Self {
2604        self.mention_type = Some(mention_type);
2605        self
2606    }
2607
2608    /// Build the entity.
2609    #[must_use]
2610    pub fn build(self) -> Entity {
2611        Entity {
2612            text: self.text,
2613            entity_type: self.entity_type,
2614            start: self.start,
2615            end: self.end,
2616            confidence: self.confidence,
2617            normalized: self.normalized,
2618            provenance: self.provenance,
2619            kb_id: self.kb_id,
2620            canonical_id: self.canonical_id,
2621            hierarchical_confidence: self.hierarchical_confidence,
2622            visual_span: self.visual_span,
2623            discontinuous_span: self.discontinuous_span,
2624            mention_type: self.mention_type,
2625        }
2626    }
2627}
2628
2629// ============================================================================
2630// Relation (for Knowledge Graph Construction)
2631// ============================================================================
2632
2633/// A relation between two entities, forming a knowledge graph triple.
2634///
2635/// In the GLiNER bi-encoder paradigm, relations are detected just like entities:
2636/// the relation trigger text ("CEO of", "located in") is matched against
2637/// relation type labels in the same latent space.
2638///
2639/// # Structure
2640///
2641/// ```text
2642/// Triple: (Head, Relation, Tail)
2643///
2644/// "Marie Curie worked at the Sorbonne"
2645///  ^^^^^^^^^^^ ~~~~~~~~~ ^^^^^^^^
2646///  Head        Rel       Tail
2647///  (Person)  (Employment)  (Organization)
2648/// ```
2649///
2650/// # TPLinker/Joint Extraction
2651///
2652/// For joint extraction, relations are extracted in a single pass with entities.
2653/// The `trigger_span` captures the text that indicates the relation.
2654#[derive(Debug, Clone, Serialize, Deserialize)]
2655pub struct Relation {
2656    /// The source entity (head of the triple)
2657    pub head: Entity,
2658    /// The target entity (tail of the triple)
2659    pub tail: Entity,
2660    /// Relation type label (e.g., "EMPLOYMENT", "LOCATED_IN", "FOUNDED_BY")
2661    pub relation_type: String,
2662    /// Optional trigger span: the text that indicates this relation
2663    /// For "CEO of", this would be the span covering "CEO of"
2664    pub trigger_span: Option<(usize, usize)>,
2665    /// Confidence score for this relation (0.0-1.0).
2666    pub confidence: Confidence,
2667}
2668
2669impl Relation {
2670    /// Create a new relation between two entities.
2671    #[must_use]
2672    pub fn new(
2673        head: Entity,
2674        tail: Entity,
2675        relation_type: impl Into<String>,
2676        confidence: impl Into<Confidence>,
2677    ) -> Self {
2678        Self {
2679            head,
2680            tail,
2681            relation_type: relation_type.into(),
2682            trigger_span: None,
2683            confidence: confidence.into(),
2684        }
2685    }
2686
2687    /// Create a relation with an explicit trigger span.
2688    #[must_use]
2689    pub fn with_trigger(
2690        head: Entity,
2691        tail: Entity,
2692        relation_type: impl Into<String>,
2693        trigger_start: usize,
2694        trigger_end: usize,
2695        confidence: impl Into<Confidence>,
2696    ) -> Self {
2697        Self {
2698            head,
2699            tail,
2700            relation_type: relation_type.into(),
2701            trigger_span: Some((trigger_start, trigger_end)),
2702            confidence: confidence.into(),
2703        }
2704    }
2705
2706    /// Convert to a triple string representation (for debugging/display).
2707    #[must_use]
2708    pub fn as_triple(&self) -> String {
2709        format!(
2710            "({}, {}, {})",
2711            self.head.text, self.relation_type, self.tail.text
2712        )
2713    }
2714
2715    /// Check if the head and tail entities are adjacent (within n tokens).
2716    /// Useful for filtering spurious long-distance relations.
2717    #[must_use]
2718    pub fn span_distance(&self) -> usize {
2719        if self.head.end <= self.tail.start {
2720            self.tail.start.saturating_sub(self.head.end)
2721        } else if self.tail.end <= self.head.start {
2722            self.head.start.saturating_sub(self.tail.end)
2723        } else {
2724            0 // Overlapping spans
2725        }
2726    }
2727}
2728
#[cfg(test)]
mod tests {
    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in test code
    use super::*;

    // An inverted (start > end) span passed to the constructor is reordered.
    #[test]
    fn entity_new_swaps_inverted_span() {
        let entity = Entity::new("test", EntityType::Person, 10, 5, 0.9);
        assert_eq!(entity.start(), 5);
        assert_eq!(entity.end(), 10);
    }

    // The same reordering applies when the entity comes from JSON.
    #[test]
    fn entity_deserialize_swaps_inverted_span() {
        let payload = r#"{"text":"test","entity_type":"PER","start":10,"end":5,"confidence":0.9}"#;
        let entity: Entity = serde_json::from_str(payload).unwrap();
        assert_eq!(entity.start(), 5);
        assert_eq!(entity.end(), 10);
    }

    // Serialize then deserialize and compare the core fields.
    #[test]
    fn entity_serde_round_trip() {
        let before = Entity::new("Berlin", EntityType::Location, 10, 16, 0.95);
        let encoded = serde_json::to_string(&before).unwrap();
        let after: Entity = serde_json::from_str(&encoded).unwrap();
        assert_eq!(after.text, before.text);
        assert_eq!(after.entity_type, before.entity_type);
        assert_eq!(after.start(), before.start());
        assert_eq!(after.end(), before.end());
        assert!((after.confidence.value() - before.confidence.value()).abs() < f64::EPSILON);
    }

    // Label -> type -> label is stable for the original built-in types.
    #[test]
    fn test_entity_type_roundtrip() {
        let builtin_types = [
            EntityType::Person,
            EntityType::Organization,
            EntityType::Location,
            EntityType::Date,
            EntityType::Money,
            EntityType::Percent,
        ];

        for entity_type in builtin_types {
            let label = entity_type.as_label();
            let reparsed = EntityType::from_label(label);
            assert_eq!(entity_type, reparsed);
        }
    }

    #[test]
    fn test_entity_overlap() {
        let first_name = Entity::new("John", EntityType::Person, 0, 4, 0.9);
        let last_name = Entity::new("Smith", EntityType::Person, 5, 10, 0.9);
        let full_name = Entity::new("John Smith", EntityType::Person, 0, 10, 0.9);

        // Disjoint spans do not overlap.
        assert!(!first_name.overlaps(&last_name));
        // A span overlaps the span that contains it ...
        assert!(first_name.overlaps(&full_name));
        // ... and a containing span overlaps what it contains.
        assert!(full_name.overlaps(&last_name));
    }

    // Out-of-range confidence inputs end up at 1.0 and 0.0 respectively.
    #[test]
    fn test_confidence_clamping() {
        let too_high = Entity::new("test", EntityType::Person, 0, 4, 1.5);
        assert!((too_high.confidence - 1.0).abs() < f64::EPSILON);

        let too_low = Entity::new("test", EntityType::Person, 0, 4, -0.5);
        assert!(too_low.confidence.abs() < f64::EPSILON);
    }

    #[test]
    fn test_entity_categories() {
        // Person / Organization / Location: ML-only categories.
        assert_eq!(EntityType::Person.category(), EntityCategory::Agent);
        assert_eq!(
            EntityType::Organization.category(),
            EntityCategory::Organization
        );
        assert_eq!(EntityType::Location.category(), EntityCategory::Place);
        assert!(EntityType::Person.requires_ml());
        assert!(!EntityType::Person.pattern_detectable());

        // Date / Time: temporal, found by patterns.
        assert_eq!(EntityType::Date.category(), EntityCategory::Temporal);
        assert_eq!(EntityType::Time.category(), EntityCategory::Temporal);
        assert!(EntityType::Date.pattern_detectable());
        assert!(!EntityType::Date.requires_ml());

        // Money / Percent: numeric, found by patterns.
        assert_eq!(EntityType::Money.category(), EntityCategory::Numeric);
        assert_eq!(EntityType::Percent.category(), EntityCategory::Numeric);
        assert!(EntityType::Money.pattern_detectable());

        // Email / Url / Phone: contact, found by patterns.
        assert_eq!(EntityType::Email.category(), EntityCategory::Contact);
        assert_eq!(EntityType::Url.category(), EntityCategory::Contact);
        assert_eq!(EntityType::Phone.category(), EntityCategory::Contact);
        assert!(EntityType::Email.pattern_detectable());
    }

    // Label round-trip for the later-added built-in types.
    #[test]
    fn test_new_types_roundtrip() {
        let added_types = [
            EntityType::Time,
            EntityType::Email,
            EntityType::Url,
            EntityType::Phone,
            EntityType::Quantity,
            EntityType::Cardinal,
            EntityType::Ordinal,
        ];

        for entity_type in added_types {
            let label = entity_type.as_label();
            let reparsed = EntityType::from_label(label);
            assert_eq!(entity_type, reparsed, "Roundtrip failed for {}", label);
        }
    }

    #[test]
    fn test_custom_entity_type() {
        // A custom type in the Agent category needs ML.
        let disease = EntityType::custom("DISEASE", EntityCategory::Agent);
        assert_eq!(disease.as_label(), "DISEASE");
        assert!(disease.requires_ml());

        // A custom type in the Misc category needs neither ML nor patterns.
        let product_id = EntityType::custom("PRODUCT_ID", EntityCategory::Misc);
        assert_eq!(product_id.as_label(), "PRODUCT_ID");
        assert!(!product_id.requires_ml());
        assert!(!product_id.pattern_detectable());
    }

    #[test]
    fn test_entity_normalization() {
        let mut date = Entity::new("Jan 15", EntityType::Date, 0, 6, 0.95);
        // Before normalization the raw text is the fallback.
        assert!(date.normalized.is_none());
        assert_eq!(date.normalized_or_text(), "Jan 15");

        // Afterwards the normalized form wins.
        date.set_normalized("2024-01-15");
        assert_eq!(date.normalized.as_deref(), Some("2024-01-15"));
        assert_eq!(date.normalized_or_text(), "2024-01-15");
    }

    #[test]
    fn test_entity_helpers() {
        let person = Entity::new("John", EntityType::Person, 0, 4, 0.9);
        assert!(person.is_named());
        assert!(!person.is_structured());
        assert_eq!(person.category(), EntityCategory::Agent);

        let amount = Entity::new("$100", EntityType::Money, 0, 4, 0.95);
        assert!(!amount.is_named());
        assert!(amount.is_structured());
        assert_eq!(amount.category(), EntityCategory::Numeric);
    }

    #[test]
    fn test_knowledge_linking() {
        let mut curie = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
        assert!(!curie.is_linked());
        assert!(!curie.has_coreference());

        // "Q7186" is a Wikidata ID.
        curie.link_to_kb("Q7186");
        assert!(curie.is_linked());
        assert_eq!(curie.kb_id.as_deref(), Some("Q7186"));

        curie.set_canonical(42);
        assert!(curie.has_coreference());
        assert_eq!(
            curie.canonical_id,
            Some(crate::core::types::CanonicalId::new(42))
        );
    }

    #[test]
    fn test_relation_creation() {
        let person = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
        let university = Entity::new("Sorbonne", EntityType::Organization, 24, 32, 0.90);

        // Plain constructor: no trigger span recorded.
        let plain = Relation::new(person.clone(), university.clone(), "WORKED_AT", 0.85);
        assert_eq!(plain.relation_type, "WORKED_AT");
        assert_eq!(plain.as_triple(), "(Marie Curie, WORKED_AT, Sorbonne)");
        assert!(plain.trigger_span.is_none());

        // Constructor with an explicit trigger span.
        let triggered = Relation::with_trigger(person, university, "EMPLOYMENT", 13, 19, 0.85);
        assert_eq!(triggered.trigger_span, Some((13, 19)));
    }

    #[test]
    fn test_relation_span_distance() {
        // Head covers 0-11 and tail covers 24-32, so the gap is 24 - 11 = 13.
        let person = Entity::new("Marie Curie", EntityType::Person, 0, 11, 0.95);
        let university = Entity::new("Sorbonne", EntityType::Organization, 24, 32, 0.90);
        let employment = Relation::new(person, university, "WORKED_AT", 0.85);
        assert_eq!(employment.span_distance(), 13);
    }

    #[test]
    fn test_relation_category() {
        // A custom type created in the Relation category reports that category
        // and requires ML.
        let ceo_of = EntityType::custom("CEO_OF", EntityCategory::Relation);
        assert_eq!(ceo_of.category(), EntityCategory::Relation);
        assert!(ceo_of.category().is_relation());
        assert!(ceo_of.requires_ml());
    }

    // ========================================================================
    // Span Tests
    // ========================================================================

    #[test]
    fn test_span_text() {
        let text_span = Span::text(10, 20);
        assert!(text_span.is_text());
        assert!(!text_span.is_visual());
        assert_eq!(text_span.text_offsets(), Some((10, 20)));
        assert_eq!(text_span.len(), 10);
        assert!(!text_span.is_empty());
    }

    #[test]
    fn test_span_bbox() {
        let visual_span = Span::bbox(0.1, 0.2, 0.3, 0.4);
        assert!(!visual_span.is_text());
        assert!(visual_span.is_visual());
        assert_eq!(visual_span.text_offsets(), None);
        // A pure bounding box has no text length.
        assert_eq!(visual_span.len(), 0);
    }

    #[test]
    fn test_span_bbox_with_page() {
        match Span::bbox_on_page(0.1, 0.2, 0.3, 0.4, 5) {
            Span::BoundingBox { page, .. } => assert_eq!(page, Some(5)),
            _ => panic!("Expected BoundingBox"),
        }
    }

    #[test]
    fn test_span_hybrid() {
        let hybrid_span = Span::Hybrid {
            start: 10,
            end: 20,
            bbox: Box::new(Span::bbox(0.1, 0.2, 0.3, 0.4)),
        };
        // A hybrid span counts as both textual and visual.
        assert!(hybrid_span.is_text());
        assert!(hybrid_span.is_visual());
        assert_eq!(hybrid_span.text_offsets(), Some((10, 20)));
        assert_eq!(hybrid_span.len(), 10);
    }

    // ========================================================================
    // Hierarchical Confidence Tests
    // ========================================================================

    #[test]
    fn test_hierarchical_confidence_new() {
        let scores = HierarchicalConfidence::new(0.9, 0.8, 0.7);
        assert!((scores.linkage - 0.9).abs() < f64::EPSILON);
        assert!((scores.type_score - 0.8).abs() < f64::EPSILON);
        assert!((scores.boundary - 0.7).abs() < f64::EPSILON);
    }

    #[test]
    fn test_hierarchical_confidence_clamping() {
        let scores = HierarchicalConfidence::new(1.5, -0.5, 0.5);
        assert_eq!(scores.linkage, 1.0);
        assert_eq!(scores.type_score, 0.0);
        assert_eq!(scores.boundary, 0.5);
    }

    #[test]
    fn test_hierarchical_confidence_from_single() {
        let scores = HierarchicalConfidence::from_single(0.8);
        assert!((scores.linkage - 0.8).abs() < f64::EPSILON);
        assert!((scores.type_score - 0.8).abs() < f64::EPSILON);
        assert!((scores.boundary - 0.8).abs() < f64::EPSILON);
    }

    #[test]
    fn test_hierarchical_confidence_combined() {
        let all_ones = HierarchicalConfidence::new(1.0, 1.0, 1.0);
        assert!((all_ones.combined() - 1.0).abs() < f64::EPSILON);

        let all_point_eight = HierarchicalConfidence::new(0.8, 0.8, 0.8);
        assert!((all_point_eight.combined() - 0.8).abs() < 0.001);

        // Geometric mean: (0.5 * 0.5 * 0.5)^(1/3) = 0.5
        let all_halves = HierarchicalConfidence::new(0.5, 0.5, 0.5);
        assert!((all_halves.combined() - 0.5).abs() < 0.001);
    }

    #[test]
    fn test_hierarchical_confidence_threshold() {
        let scores = HierarchicalConfidence::new(0.9, 0.8, 0.7);
        assert!(scores.passes_threshold(0.5, 0.5, 0.5));
        // Thresholds equal to the scores still pass.
        assert!(scores.passes_threshold(0.9, 0.8, 0.7));
        // Linkage threshold above the linkage score.
        assert!(!scores.passes_threshold(0.95, 0.8, 0.7));
        // Type threshold above the type score.
        assert!(!scores.passes_threshold(0.9, 0.85, 0.7));
    }

    #[test]
    fn test_hierarchical_confidence_from_f64() {
        let scores: HierarchicalConfidence = 0.85_f64.into();
        assert!((scores.linkage - 0.85).abs() < 0.001);
    }

    // ========================================================================
    // RaggedBatch Tests
    // ========================================================================

    #[test]
    fn test_ragged_batch_from_sequences() {
        let sequences = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
        let ragged = RaggedBatch::from_sequences(&sequences);

        assert_eq!(ragged.batch_size(), 3);
        assert_eq!(ragged.total_tokens(), 9);
        assert_eq!(ragged.max_seq_len, 4);
        assert_eq!(ragged.cumulative_offsets, vec![0, 3, 5, 9]);
    }

    #[test]
    fn test_ragged_batch_doc_range() {
        let sequences = vec![vec![1, 2, 3], vec![4, 5]];
        let ragged = RaggedBatch::from_sequences(&sequences);

        assert_eq!(ragged.doc_range(0), Some(0..3));
        assert_eq!(ragged.doc_range(1), Some(3..5));
        // Index past the last document.
        assert_eq!(ragged.doc_range(2), None);
    }

    #[test]
    fn test_ragged_batch_doc_tokens() {
        let sequences = vec![vec![1, 2, 3], vec![4, 5]];
        let ragged = RaggedBatch::from_sequences(&sequences);

        assert_eq!(ragged.doc_tokens(0), Some(&[1, 2, 3][..]));
        assert_eq!(ragged.doc_tokens(1), Some(&[4, 5][..]));
    }

    #[test]
    fn test_ragged_batch_padding_savings() {
        // Three docs of 3, 2 and 4 tokens; the longest is 4.
        // Padded size is 3 * 4 = 12 against 9 real tokens,
        // so the expected savings are 1 - 9/12 = 0.25.
        let sequences = vec![vec![1, 2, 3], vec![4, 5], vec![6, 7, 8, 9]];
        let ragged = RaggedBatch::from_sequences(&sequences);
        let saved_fraction = ragged.padding_savings();
        assert!((saved_fraction - 0.25).abs() < 0.001);
    }

    // ========================================================================
    // SpanCandidate Tests
    // ========================================================================

    #[test]
    fn test_span_candidate() {
        let candidate = SpanCandidate::new(0, 5, 10);
        assert_eq!(candidate.doc_idx, 0);
        assert_eq!(candidate.start, 5);
        assert_eq!(candidate.end, 10);
        assert_eq!(candidate.width(), 5);
    }

    #[test]
    fn test_generate_span_candidates() {
        // One document with three tokens.
        let sequences = vec![vec![1, 2, 3]];
        let ragged = RaggedBatch::from_sequences(&sequences);
        let spans = generate_span_candidates(&ragged, 2);

        // max_width = 2 yields the spans (0,1), (0,2), (1,2), (1,3), (2,3).
        assert_eq!(spans.len(), 5);

        // Every candidate stays inside the document and within the width cap.
        for candidate in &spans {
            assert_eq!(candidate.doc_idx, 0);
            assert!(candidate.end as usize <= 3);
            assert!(candidate.width() as usize <= 2);
        }
    }

    #[test]
    fn test_generate_filtered_candidates() {
        let sequences = vec![vec![1, 2, 3]];
        let ragged = RaggedBatch::from_sequences(&sequences);

        // max_width = 2 gives five candidates; only the first two mask
        // entries clear the 0.5 threshold.
        let linkage_mask = vec![0.9, 0.9, 0.1, 0.1, 0.1];
        let kept = generate_filtered_candidates(&ragged, 2, &linkage_mask, 0.5);

        assert_eq!(kept.len(), 2);
    }

    // ========================================================================
    // EntityBuilder Tests
    // ========================================================================

    #[test]
    fn test_entity_builder_basic() {
        let built = Entity::builder("John", EntityType::Person)
            .span(0, 4)
            .confidence(0.95)
            .build();

        assert_eq!(built.text, "John");
        assert_eq!(built.entity_type, EntityType::Person);
        assert_eq!(built.start(), 0);
        assert_eq!(built.end(), 4);
        assert!((built.confidence - 0.95).abs() < f64::EPSILON);
    }

    #[test]
    fn test_entity_builder_full() {
        let built = Entity::builder("Marie Curie", EntityType::Person)
            .span(0, 11)
            .confidence(0.95)
            .kb_id("Q7186")
            .canonical_id(42)
            .normalized("Marie Salomea Skłodowska Curie")
            .provenance(Provenance::ml("bert", 0.95))
            .build();

        assert_eq!(built.text, "Marie Curie");
        assert_eq!(built.kb_id.as_deref(), Some("Q7186"));
        assert_eq!(
            built.canonical_id,
            Some(crate::core::types::CanonicalId::new(42))
        );
        assert_eq!(
            built.normalized.as_deref(),
            Some("Marie Salomea Skłodowska Curie")
        );
        assert!(built.provenance.is_some());
    }

    #[test]
    fn test_entity_builder_hierarchical() {
        let scores = HierarchicalConfidence::new(0.9, 0.8, 0.7);
        let built = Entity::builder("test", EntityType::Person)
            .span(0, 4)
            .hierarchical_confidence(scores)
            .build();

        assert!(built.hierarchical_confidence.is_some());
        assert!((built.linkage_confidence() - 0.9).abs() < 0.001);
        assert!((built.type_confidence() - 0.8).abs() < 0.001);
        assert!((built.boundary_confidence() - 0.7).abs() < 0.001);
    }

    #[test]
    fn test_entity_builder_visual() {
        let region = Span::bbox(0.1, 0.2, 0.3, 0.4);
        let built = Entity::builder("receipt item", EntityType::Money)
            .visual_span(region)
            .confidence(0.9)
            .build();

        assert!(built.is_visual());
        assert!(built.visual_span.is_some());
    }

    // ========================================================================
    // Entity Helper Method Tests
    // ========================================================================

    #[test]
    fn test_entity_hierarchical_confidence_helpers() {
        let mut subject = Entity::new("test", EntityType::Person, 0, 4, 0.8);

        // With no hierarchical scores, each helper reports the main confidence.
        assert!((subject.linkage_confidence() - 0.8).abs() < 0.001);
        assert!((subject.type_confidence() - 0.8).abs() < 0.001);
        assert!((subject.boundary_confidence() - 0.8).abs() < 0.001);

        // Once hierarchical scores are set, each helper reports its own score.
        subject.set_hierarchical_confidence(HierarchicalConfidence::new(0.95, 0.85, 0.75));
        assert!((subject.linkage_confidence() - 0.95).abs() < 0.001);
        assert!((subject.type_confidence() - 0.85).abs() < 0.001);
        assert!((subject.boundary_confidence() - 0.75).abs() < 0.001);
    }

    #[test]
    fn test_entity_from_visual() {
        let total = Entity::from_visual(
            "receipt total",
            EntityType::Money,
            Span::bbox(0.5, 0.8, 0.2, 0.05),
            0.92,
        );

        assert!(total.is_visual());
        // Text offsets are both zero for an entity created from a visual span.
        assert_eq!(total.start(), 0);
        assert_eq!(total.end(), 0);
        assert!((total.confidence - 0.92).abs() < f64::EPSILON);
    }

    #[test]
    fn test_entity_span_helpers() {
        let subject = Entity::new("test", EntityType::Person, 10, 20, 0.9);
        assert_eq!(subject.text_span(), (10, 20));
        assert_eq!(subject.span_len(), 10);
    }

    // ========================================================================
    // Provenance Tests
    // ========================================================================

    #[test]
    fn test_provenance_pattern() {
        let origin = Provenance::pattern("EMAIL");
        assert_eq!(origin.method, ExtractionMethod::Pattern);
        assert_eq!(origin.pattern.as_deref(), Some("EMAIL"));
        // Pattern matches are deterministic, hence a raw confidence of 1.0.
        assert_eq!(origin.raw_confidence, Some(Confidence::new(1.0)));
    }

    #[test]
    fn test_provenance_ml() {
        let origin = Provenance::ml("bert-ner", 0.87);
        assert_eq!(origin.method, ExtractionMethod::Neural);
        assert_eq!(origin.source.as_ref(), "bert-ner");
        assert_eq!(origin.raw_confidence, Some(Confidence::new(0.87)));
    }

    #[test]
    fn test_provenance_with_version() {
        let origin = Provenance::ml("gliner", 0.92).with_version("v2.1.0");

        assert_eq!(origin.model_version.as_deref(), Some("v2.1.0"));
        assert_eq!(origin.source.as_ref(), "gliner");
    }

    #[test]
    fn test_provenance_with_timestamp() {
        let origin = Provenance::pattern("DATE").with_timestamp("2024-01-15T10:30:00Z");

        assert_eq!(origin.timestamp.as_deref(), Some("2024-01-15T10:30:00Z"));
    }

    #[test]
    fn test_provenance_builder_chain() {
        let origin = Provenance::ml("modernbert-ner", 0.95)
            .with_version("v1.0.0")
            .with_timestamp("2024-11-27T12:00:00Z");

        assert_eq!(origin.method, ExtractionMethod::Neural);
        assert_eq!(origin.source.as_ref(), "modernbert-ner");
        assert_eq!(origin.raw_confidence, Some(Confidence::new(0.95)));
        assert_eq!(origin.model_version.as_deref(), Some("v1.0.0"));
        assert_eq!(origin.timestamp.as_deref(), Some("2024-11-27T12:00:00Z"));
    }

    #[test]
    fn test_provenance_serialization() {
        let origin = Provenance::ml("test", 0.9)
            .with_version("v1.0")
            .with_timestamp("2024-01-01");

        let encoded = serde_json::to_string(&origin).unwrap();
        assert!(encoded.contains("model_version"));
        assert!(encoded.contains("v1.0"));

        let decoded: Provenance = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded.model_version.as_deref(), Some("v1.0"));
        assert_eq!(decoded.timestamp.as_deref(), Some("2024-01-01"));
    }

    #[test]
    fn entity_serde_roundtrip_no_temporal_fields() {
        let berlin = Entity::new("Berlin", EntityType::Location, 0, 6, 0.95);
        let encoded = serde_json::to_string(&berlin).unwrap();
        // Fields that were removed must not show up in the output.
        assert!(!encoded.contains("valid_from"));
        assert!(!encoded.contains("valid_until"));
        assert!(!encoded.contains("phi_features"));
        // The output still deserializes back to the same entity data.
        let decoded: Entity = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded.text, "Berlin");
        assert_eq!(decoded.start(), 0);
        assert_eq!(decoded.end(), 6);
    }

    // Keys that are no longer part of `Entity` are tolerated on input.
    #[test]
    fn entity_deserialize_ignores_unknown_fields() {
        let payload = r#"{"text":"Berlin","entity_type":"LOC","start":0,"end":6,"confidence":0.95,"valid_from":null,"phi_features":null}"#;
        let berlin: Entity = serde_json::from_str(payload).unwrap();
        assert_eq!(berlin.text, "Berlin");
    }
}
3322
3323#[cfg(test)]
3324mod proptests {
3325    #![allow(clippy::unwrap_used)] // unwrap() is acceptable in property tests
3326    use super::*;
3327    use proptest::prelude::*;
3328
3329    proptest! {
3330        #[test]
3331        fn confidence_always_clamped(conf in -10.0f64..10.0) {
3332            let e = Entity::new("test", EntityType::Person, 0, 4, conf);
3333            prop_assert!(e.confidence >= 0.0);
3334            prop_assert!(e.confidence <= 1.0);
3335        }
3336
3337        #[test]
3338        fn entity_type_roundtrip(label in "[A-Z]{3,10}") {
3339            let et = EntityType::from_label(&label);
3340            let back = EntityType::from_label(et.as_label());
3341            // Custom types may round-trip to themselves or normalize
3342            let is_custom = matches!(back, EntityType::Custom { .. });
3343            prop_assert!(is_custom || back == et);
3344        }
3345
3346        #[test]
3347        fn overlap_is_symmetric(
3348            s1 in 0usize..100,
3349            len1 in 1usize..50,
3350            s2 in 0usize..100,
3351            len2 in 1usize..50,
3352        ) {
3353            let e1 = Entity::new("a", EntityType::Person, s1, s1 + len1, 1.0);
3354            let e2 = Entity::new("b", EntityType::Person, s2, s2 + len2, 1.0);
3355            prop_assert_eq!(e1.overlaps(&e2), e2.overlaps(&e1));
3356        }
3357
3358        #[test]
3359        fn overlap_ratio_bounded(
3360            s1 in 0usize..100,
3361            len1 in 1usize..50,
3362            s2 in 0usize..100,
3363            len2 in 1usize..50,
3364        ) {
3365            let e1 = Entity::new("a", EntityType::Person, s1, s1 + len1, 1.0);
3366            let e2 = Entity::new("b", EntityType::Person, s2, s2 + len2, 1.0);
3367            let ratio = e1.overlap_ratio(&e2);
3368            prop_assert!(ratio >= 0.0);
3369            prop_assert!(ratio <= 1.0);
3370        }
3371
3372        #[test]
3373        fn self_overlap_ratio_is_one(s in 0usize..100, len in 1usize..50) {
3374            let e = Entity::new("test", EntityType::Person, s, s + len, 1.0);
3375            let ratio = e.overlap_ratio(&e);
3376            prop_assert!((ratio - 1.0).abs() < 1e-10);
3377        }
3378
3379        #[test]
3380        fn hierarchical_confidence_always_clamped(
3381            linkage in -2.0f32..2.0,
3382            type_score in -2.0f32..2.0,
3383            boundary in -2.0f32..2.0,
3384        ) {
3385            let hc = HierarchicalConfidence::new(linkage, type_score, boundary);
3386            prop_assert!(hc.linkage >= 0.0 && hc.linkage <= 1.0);
3387            prop_assert!(hc.type_score >= 0.0 && hc.type_score <= 1.0);
3388            prop_assert!(hc.boundary >= 0.0 && hc.boundary <= 1.0);
3389            prop_assert!(hc.combined() >= 0.0 && hc.combined() <= 1.0);
3390        }
3391
3392        #[test]
3393        fn span_candidate_width_consistent(
3394            doc in 0u32..10,
3395            start in 0u32..100,
3396            end in 1u32..100,
3397        ) {
3398            let actual_end = start.max(end);
3399            let sc = SpanCandidate::new(doc, start, actual_end);
3400            prop_assert_eq!(sc.width(), actual_end.saturating_sub(start));
3401        }
3402
3403        #[test]
3404        fn ragged_batch_preserves_tokens(
3405            seq_lens in proptest::collection::vec(1usize..10, 1..5),
3406        ) {
3407            // Create sequences with sequential token IDs
3408            let mut counter = 0u32;
3409            let seqs: Vec<Vec<u32>> = seq_lens.iter().map(|&len| {
3410                let seq: Vec<u32> = (counter..counter + len as u32).collect();
3411                counter += len as u32;
3412                seq
3413            }).collect();
3414
3415            let batch = RaggedBatch::from_sequences(&seqs);
3416
3417            // Verify batch properties
3418            prop_assert_eq!(batch.batch_size(), seqs.len());
3419            prop_assert_eq!(batch.total_tokens(), seq_lens.iter().sum::<usize>());
3420
3421            // Verify each doc can be retrieved correctly
3422            for (i, seq) in seqs.iter().enumerate() {
3423                let doc_tokens = batch.doc_tokens(i).unwrap();
3424                prop_assert_eq!(doc_tokens, seq.as_slice());
3425            }
3426        }
3427
3428        #[test]
3429        fn span_text_offsets_consistent(start in 0usize..100, len in 0usize..50) {
3430            let end = start + len;
3431            let span = Span::text(start, end);
3432            let (s, e) = span.text_offsets().unwrap();
3433            prop_assert_eq!(s, start);
3434            prop_assert_eq!(e, end);
3435            prop_assert_eq!(span.len(), len);
3436        }
3437
3438        // =================================================================
3439        // Property tests for core type invariants
3440        // =================================================================
3441
3442        /// Entity with start < end always passes the span validity check in validate().
3443        #[test]
3444        fn entity_span_validity(
3445            start in 0usize..10000,
3446            len in 1usize..500,
3447            conf in 0.0f64..=1.0,
3448        ) {
3449            let end = start + len;
3450            // Build a source text long enough to cover the span
3451            let text_content: String = "x".repeat(end);
3452            let entity_text: String = text_content.chars().skip(start).take(len).collect();
3453            let e = Entity::new(&entity_text, EntityType::Person, start, end, conf);
3454            let issues = e.validate(&text_content);
3455            // No InvalidSpan or SpanOutOfBounds issues
3456            for issue in &issues {
3457                match issue {
3458                    ValidationIssue::InvalidSpan { .. } => {
3459                        prop_assert!(false, "start < end should never produce InvalidSpan");
3460                    }
3461                    ValidationIssue::SpanOutOfBounds { .. } => {
3462                        prop_assert!(false, "span within text should never produce SpanOutOfBounds");
3463                    }
3464                    _ => {} // TextMismatch or others are fine to check separately
3465                }
3466            }
3467        }
3468
3469        /// EntityType::from_label(et.as_label()) == et for all standard (non-Custom) types.
3470        #[test]
3471        fn entity_type_label_roundtrip_standard(
3472            idx in 0usize..13,
3473        ) {
3474            let standard_types = [
3475                EntityType::Person,
3476                EntityType::Organization,
3477                EntityType::Location,
3478                EntityType::Date,
3479                EntityType::Time,
3480                EntityType::Money,
3481                EntityType::Percent,
3482                EntityType::Quantity,
3483                EntityType::Cardinal,
3484                EntityType::Ordinal,
3485                EntityType::Email,
3486                EntityType::Url,
3487                EntityType::Phone,
3488            ];
3489            let et = &standard_types[idx];
3490            let label = et.as_label();
3491            let roundtripped = EntityType::from_label(label);
3492            prop_assert_eq!(&roundtripped, et,
3493                "from_label(as_label()) must roundtrip for {:?} (label={:?})", et, label);
3494        }
3495
3496        /// Span containment: if span A contains span B, then A.start <= B.start && A.end >= B.end.
3497        #[test]
3498        fn span_containment_property(
3499            a_start in 0usize..5000,
3500            a_len in 1usize..5000,
3501            b_offset in 0usize..5000,
3502            b_len in 1usize..5000,
3503        ) {
3504            let a_end = a_start + a_len;
3505            let b_start = a_start + (b_offset % a_len); // B starts within A
3506            let b_end_candidate = b_start + b_len;
3507
3508            // Only test the containment invariant when B is actually inside A
3509            if b_start >= a_start && b_end_candidate <= a_end {
3510                // B is contained in A
3511                prop_assert!(a_start <= b_start);
3512                prop_assert!(a_end >= b_end_candidate);
3513
3514                // Also verify via Entity overlap: A must overlap B if A contains B
3515                let ea = Entity::new("a", EntityType::Person, a_start, a_end, 1.0);
3516                let eb = Entity::new("b", EntityType::Person, b_start, b_end_candidate, 1.0);
3517                prop_assert!(ea.overlaps(&eb),
3518                    "containing span must overlap contained span");
3519            }
3520        }
3521
3522        /// Serde roundtrip preserves all fields of Entity.
3523        #[test]
3524        fn entity_serde_roundtrip(
3525            start in 0usize..10000,
3526            len in 1usize..500,
3527            conf in 0.0f64..=1.0,
3528            type_idx in 0usize..5,
3529        ) {
3530            let end = start + len;
3531            let types = [
3532                EntityType::Person,
3533                EntityType::Organization,
3534                EntityType::Location,
3535                EntityType::Date,
3536                EntityType::Email,
3537            ];
3538            let et = types[type_idx].clone();
3539            let text = format!("entity_{}", start);
3540            let e = Entity::new(&text, et, start, end, conf);
3541
3542            let json = serde_json::to_string(&e).unwrap();
3543            let e2: Entity = serde_json::from_str(&json).unwrap();
3544
3545            prop_assert_eq!(&e.text, &e2.text);
3546            prop_assert_eq!(&e.entity_type, &e2.entity_type);
3547            prop_assert_eq!(e.start(), e2.start());
3548            prop_assert_eq!(e.end(), e2.end());
3549            // f64 roundtrip through JSON: compare with tolerance
3550            prop_assert!((e.confidence - e2.confidence).abs() < 1e-10,
3551                "confidence roundtrip: {} vs {}", e.confidence, e2.confidence);
3552            prop_assert_eq!(&e.normalized, &e2.normalized);
3553            prop_assert_eq!(&e.kb_id, &e2.kb_id);
3554        }
3555
3556        /// DiscontinuousSpan: total_len() == sum of merged segment lengths,
3557        /// and merged segments are non-overlapping and sorted.
3558        #[test]
3559        fn discontinuous_span_total_length(
3560            segments in proptest::collection::vec(
3561                (0usize..5000, 1usize..500),
3562                1..6
3563            ),
3564        ) {
3565            let ranges: Vec<std::ops::Range<usize>> = segments.iter()
3566                .map(|&(start, len)| start..start + len)
3567                .collect();
3568            let span = DiscontinuousSpan::new(ranges);
3569            // After merging, total_len must equal sum of the stored segments.
3570            let expected_sum: usize = span.segments().iter().map(|r| r.end - r.start).sum();
3571            prop_assert_eq!(span.total_len(), expected_sum,
3572                "total_len must equal sum of merged segment lengths");
3573            // Verify no overlaps in stored segments.
3574            for w in span.segments().windows(2) {
3575                prop_assert!(w[0].end <= w[1].start,
3576                    "segments must not overlap: {:?} vs {:?}", w[0], w[1]);
3577            }
3578        }
3579    }
3580
3581    // ========================================================================
3582    // EntityCategory Tests
3583    // ========================================================================
3584
3585    #[test]
3586    fn test_entity_category_requires_ml() {
3587        assert!(EntityCategory::Agent.requires_ml());
3588        assert!(EntityCategory::Organization.requires_ml());
3589        assert!(EntityCategory::Place.requires_ml());
3590        assert!(EntityCategory::Creative.requires_ml());
3591        assert!(EntityCategory::Relation.requires_ml());
3592
3593        assert!(!EntityCategory::Temporal.requires_ml());
3594        assert!(!EntityCategory::Numeric.requires_ml());
3595        assert!(!EntityCategory::Contact.requires_ml());
3596        assert!(!EntityCategory::Misc.requires_ml());
3597    }
3598
3599    #[test]
3600    fn test_entity_category_pattern_detectable() {
3601        assert!(EntityCategory::Temporal.pattern_detectable());
3602        assert!(EntityCategory::Numeric.pattern_detectable());
3603        assert!(EntityCategory::Contact.pattern_detectable());
3604
3605        assert!(!EntityCategory::Agent.pattern_detectable());
3606        assert!(!EntityCategory::Organization.pattern_detectable());
3607        assert!(!EntityCategory::Place.pattern_detectable());
3608        assert!(!EntityCategory::Creative.pattern_detectable());
3609        assert!(!EntityCategory::Relation.pattern_detectable());
3610        assert!(!EntityCategory::Misc.pattern_detectable());
3611    }
3612
3613    #[test]
3614    fn test_entity_category_is_relation() {
3615        assert!(EntityCategory::Relation.is_relation());
3616
3617        assert!(!EntityCategory::Agent.is_relation());
3618        assert!(!EntityCategory::Organization.is_relation());
3619        assert!(!EntityCategory::Place.is_relation());
3620        assert!(!EntityCategory::Temporal.is_relation());
3621        assert!(!EntityCategory::Numeric.is_relation());
3622        assert!(!EntityCategory::Contact.is_relation());
3623        assert!(!EntityCategory::Creative.is_relation());
3624        assert!(!EntityCategory::Misc.is_relation());
3625    }
3626
3627    #[test]
3628    fn test_entity_category_as_str() {
3629        assert_eq!(EntityCategory::Agent.as_str(), "agent");
3630        assert_eq!(EntityCategory::Organization.as_str(), "organization");
3631        assert_eq!(EntityCategory::Place.as_str(), "place");
3632        assert_eq!(EntityCategory::Creative.as_str(), "creative");
3633        assert_eq!(EntityCategory::Temporal.as_str(), "temporal");
3634        assert_eq!(EntityCategory::Numeric.as_str(), "numeric");
3635        assert_eq!(EntityCategory::Contact.as_str(), "contact");
3636        assert_eq!(EntityCategory::Relation.as_str(), "relation");
3637        assert_eq!(EntityCategory::Misc.as_str(), "misc");
3638    }
3639
3640    #[test]
3641    fn test_entity_category_display() {
3642        assert_eq!(format!("{}", EntityCategory::Agent), "agent");
3643        assert_eq!(format!("{}", EntityCategory::Temporal), "temporal");
3644        assert_eq!(format!("{}", EntityCategory::Relation), "relation");
3645    }
3646
3647    // ========================================================================
3648    // EntityType serde tests (N20: flat string serialization)
3649    // ========================================================================
3650
3651    #[test]
3652    fn test_entity_type_serializes_to_flat_string() {
3653        assert_eq!(
3654            serde_json::to_string(&EntityType::Person).unwrap(),
3655            r#""PER""#
3656        );
3657        assert_eq!(
3658            serde_json::to_string(&EntityType::Organization).unwrap(),
3659            r#""ORG""#
3660        );
3661        assert_eq!(
3662            serde_json::to_string(&EntityType::Location).unwrap(),
3663            r#""LOC""#
3664        );
3665        assert_eq!(
3666            serde_json::to_string(&EntityType::Date).unwrap(),
3667            r#""DATE""#
3668        );
3669        assert_eq!(
3670            serde_json::to_string(&EntityType::Money).unwrap(),
3671            r#""MONEY""#
3672        );
3673    }
3674
3675    #[test]
3676    fn test_custom_entity_type_serializes_flat() {
3677        let misc = EntityType::custom("MISC", EntityCategory::Misc);
3678        assert_eq!(serde_json::to_string(&misc).unwrap(), r#""MISC""#);
3679
3680        let disease = EntityType::custom("DISEASE", EntityCategory::Agent);
3681        assert_eq!(serde_json::to_string(&disease).unwrap(), r#""DISEASE""#);
3682    }
3683
3684    #[test]
3685    fn test_entity_type_deserializes_from_flat_string() {
3686        let per: EntityType = serde_json::from_str(r#""PER""#).unwrap();
3687        assert_eq!(per, EntityType::Person);
3688
3689        let org: EntityType = serde_json::from_str(r#""ORG""#).unwrap();
3690        assert_eq!(org, EntityType::Organization);
3691
3692        let misc: EntityType = serde_json::from_str(r#""MISC""#).unwrap();
3693        assert_eq!(misc, EntityType::custom("MISC", EntityCategory::Misc));
3694    }
3695
3696    #[test]
3697    fn test_entity_type_deserializes_backward_compat_custom() {
3698        // Old format: {"Custom":{"name":"MISC","category":"Misc"}}
3699        let json = r#"{"Custom":{"name":"MISC","category":"Misc"}}"#;
3700        let et: EntityType = serde_json::from_str(json).unwrap();
3701        assert_eq!(et, EntityType::custom("MISC", EntityCategory::Misc));
3702    }
3703
3704    #[test]
3705    fn test_entity_type_deserializes_backward_compat_other() {
3706        // Old format: {"Other":"foo"} -- now routes to Custom with Misc category
3707        let json = r#"{"Other":"foo"}"#;
3708        let et: EntityType = serde_json::from_str(json).unwrap();
3709        assert_eq!(et, EntityType::custom("foo", EntityCategory::Misc));
3710    }
3711
3712    #[test]
3713    fn test_entity_type_serde_roundtrip() {
3714        let types = vec![
3715            EntityType::Person,
3716            EntityType::Organization,
3717            EntityType::Location,
3718            EntityType::Date,
3719            EntityType::Time,
3720            EntityType::Money,
3721            EntityType::Percent,
3722            EntityType::Quantity,
3723            EntityType::Cardinal,
3724            EntityType::Ordinal,
3725            EntityType::Email,
3726            EntityType::Url,
3727            EntityType::Phone,
3728            EntityType::custom("MISC", EntityCategory::Misc),
3729            EntityType::custom("DISEASE", EntityCategory::Agent),
3730        ];
3731
3732        for t in &types {
3733            let json = serde_json::to_string(t).unwrap();
3734            let back: EntityType = serde_json::from_str(&json).unwrap();
3735            // All variants roundtrip through from_label, so Custom types
3736            // survive as Custom (not as a built-in variant).
3737            assert_eq!(
3738                t.as_label(),
3739                back.as_label(),
3740                "roundtrip failed for {:?}",
3741                t
3742            );
3743        }
3744    }
3745}
anno_core/core/entity.rs

anno_core/core/
entity.rs