Skip to main content

anno/
schema.rs

//! Schema harmonization for multi-dataset NER training.
//!
//! # The Schema Misalignment Problem
//!
//! Different NER datasets use incompatible label schemas:
//!
//! ```text
//! ┌─────────────────────────────────────────────────────────────────────┐
//! │ Dataset        │ "Marie Curie"  │ "Paris"      │ "Americans"        │
//! ├─────────────────────────────────────────────────────────────────────┤
//! │ CoNLL-2003     │ PER            │ LOC          │ MISC               │
//! │ OntoNotes 5.0  │ PERSON         │ GPE          │ NORP               │
//! │ Science Corpus │ SCIENTIST      │ LOCATION     │ —                  │
//! │ MultiNERD      │ PER            │ LOC          │ —                  │
//! └─────────────────────────────────────────────────────────────────────┘
//! ```
//!
//! **Naive concatenation causes ~30% F1 degradation** (CyberNER 2025, ESNERA 2025).
//!
//! # Solution: Canonical Ontology
//!
//! This module defines a single source of truth for entity types, with explicit
//! mappings from each dataset schema. Information loss is documented and intentional.
//!
//! # Usage
//!
//! ```rust
//! use anno::schema::{CanonicalType, DatasetSchema, SchemaMapper};
//!
//! // Create mapper for OntoNotes
//! let mapper = SchemaMapper::for_dataset(DatasetSchema::OntoNotes);
//!
//! // Map dataset-specific label to canonical type
//! let canonical = mapper.to_canonical("NORP");
//! assert_eq!(canonical.name(), "GROUP");  // Not "ORG"!
//!
//! // Check information loss
//! let loss = mapper.information_loss("FAC");
//! assert!(loss.is_some());  // FAC → Facility carries a note about man-made structures
//! ```
//!
//! # Research References
//!
//! - CyberNER (2025): Schema harmonization for cyber threat NER
//! - ESNERA (2025): Entity schema normalization for evaluation
//! - OntoNotes 5.0 Guidelines: <https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf>
47
48use crate::{EntityCategory, EntityType};
49use std::collections::HashMap;
50
51// =============================================================================
52// Canonical Entity Types
53// =============================================================================
54
/// Entity type of the unified (canonical) schema.
///
/// Dataset-specific labels are translated into one of these variants by
/// `SchemaMapper`; a label the mapper does not know becomes [`CanonicalType::Misc`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CanonicalType {
    // --- Agents: individuals and groups of people ---
    /// A single person (e.g. `PER`, `PERSON`, `person`, `Actor`, `Director`,
    /// `Character`, `politician`).
    Person,
    /// A group of people defined by nationality, religion or politics
    /// (OntoNotes `NORP`, WNUT-17 `group`). Deliberately distinct from
    /// [`CanonicalType::Organization`].
    Group,

    // --- Organizations ---
    /// A formal organization (e.g. `ORG`, `organization`, `corporation`,
    /// `political_party`, `Restaurant_Name`).
    Organization,

    // --- Locations, kept apart where the source schema distinguishes them ---
    /// Geopolitical entity (e.g. OntoNotes `GPE`, CrossNER `country`).
    GeopoliticalEntity,
    /// Natural location such as mountains or rivers (OntoNotes `LOC`).
    NaturalLocation,
    /// Man-made facility such as buildings or bridges (OntoNotes `FAC`,
    /// FewNERD `building`).
    Facility,
    /// Generic location, used when the source schema makes no finer distinction
    /// (e.g. CoNLL-2003 `LOC`, `location`).
    Location,

    // --- Temporal ---
    /// Date expression.
    Date,
    /// Time expression.
    Time,

    // --- Numeric ---
    /// Monetary value.
    Money,
    /// Percentage.
    Percent,
    /// Quantity with a unit.
    Quantity,
    /// Cardinal number.
    Cardinal,
    /// Ordinal number.
    Ordinal,

    // --- Creative / legal ---
    /// Creative work (e.g. `WORK_OF_ART`, `Title`, `Song`, `art`, `creative-work`).
    CreativeWork,
    /// Product (e.g. `PRODUCT`, `product`; MultiNERD `VEHI` is folded in here).
    Product,
    /// Event (e.g. `EVENT`, `EVE`, `election`).
    Event,
    /// Law or legal document.
    Law,
    /// Language.
    Language,

    // --- Biomedical ---
    /// Disease or medical condition.
    Disease,
    /// Chemical compound.
    Chemical,
    /// Gene.
    Gene,
    /// Drug.
    Drug,

    // --- Other domain-specific types ---
    /// Animal.
    Animal,
    /// Plant.
    Plant,
    /// Food item.
    Food,

    // --- Fallback ---
    /// Miscellaneous: explicit `MISC`-style labels and anything unmapped.
    Misc,
}
133
134impl CanonicalType {
135    /// Get the canonical name.
136    #[must_use]
137    pub fn name(&self) -> &'static str {
138        match self {
139            Self::Person => "PERSON",
140            Self::Group => "GROUP",
141            Self::Organization => "ORG",
142            Self::GeopoliticalEntity => "GPE",
143            Self::NaturalLocation => "LOC",
144            Self::Facility => "FAC",
145            Self::Location => "LOCATION",
146            Self::Date => "DATE",
147            Self::Time => "TIME",
148            Self::Money => "MONEY",
149            Self::Percent => "PERCENT",
150            Self::Quantity => "QUANTITY",
151            Self::Cardinal => "CARDINAL",
152            Self::Ordinal => "ORDINAL",
153            Self::CreativeWork => "WORK_OF_ART",
154            Self::Product => "PRODUCT",
155            Self::Event => "EVENT",
156            Self::Law => "LAW",
157            Self::Language => "LANGUAGE",
158            Self::Disease => "DISEASE",
159            Self::Chemical => "CHEMICAL",
160            Self::Gene => "GENE",
161            Self::Drug => "DRUG",
162            Self::Animal => "ANIMAL",
163            Self::Plant => "PLANT",
164            Self::Food => "FOOD",
165            Self::Misc => "MISC",
166        }
167    }
168
169    /// Get the category for this canonical type.
170    #[must_use]
171    pub fn category(&self) -> EntityCategory {
172        match self {
173            Self::Person | Self::Group => EntityCategory::Agent,
174            Self::Organization => EntityCategory::Organization,
175            Self::GeopoliticalEntity | Self::NaturalLocation | Self::Facility | Self::Location => {
176                EntityCategory::Place
177            }
178            Self::Date | Self::Time => EntityCategory::Temporal,
179            Self::Money | Self::Percent | Self::Quantity | Self::Cardinal | Self::Ordinal => {
180                EntityCategory::Numeric
181            }
182            Self::CreativeWork | Self::Product | Self::Event | Self::Law | Self::Language => {
183                EntityCategory::Creative
184            }
185            Self::Disease | Self::Chemical | Self::Gene | Self::Drug => EntityCategory::Agent,
186            Self::Animal | Self::Plant | Self::Food => EntityCategory::Misc,
187            Self::Misc => EntityCategory::Misc,
188        }
189    }
190
191    /// Convert to the legacy EntityType for compatibility.
192    #[must_use]
193    pub fn to_entity_type(&self) -> EntityType {
194        match self {
195            Self::Person => EntityType::Person,
196            Self::Group => EntityType::custom("GROUP", EntityCategory::Agent),
197            Self::Organization => EntityType::Organization,
198            Self::GeopoliticalEntity => EntityType::custom("GPE", EntityCategory::Place),
199            Self::NaturalLocation => EntityType::Location,
200            Self::Facility => EntityType::custom("FAC", EntityCategory::Place),
201            Self::Location => EntityType::Location,
202            Self::Date => EntityType::Date,
203            Self::Time => EntityType::Time,
204            Self::Money => EntityType::Money,
205            Self::Percent => EntityType::Percent,
206            Self::Quantity => EntityType::Quantity,
207            Self::Cardinal => EntityType::Cardinal,
208            Self::Ordinal => EntityType::Ordinal,
209            Self::CreativeWork => EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
210            Self::Product => EntityType::custom("PRODUCT", EntityCategory::Misc),
211            Self::Event => EntityType::custom("EVENT", EntityCategory::Misc),
212            Self::Law => EntityType::custom("LAW", EntityCategory::Misc),
213            Self::Language => EntityType::custom("LANGUAGE", EntityCategory::Misc),
214            Self::Disease => EntityType::custom("DISEASE", EntityCategory::Agent),
215            Self::Chemical => EntityType::custom("CHEMICAL", EntityCategory::Misc),
216            Self::Gene => EntityType::custom("GENE", EntityCategory::Misc),
217            Self::Drug => EntityType::custom("DRUG", EntityCategory::Misc),
218            Self::Animal => EntityType::custom("ANIMAL", EntityCategory::Misc),
219            Self::Plant => EntityType::custom("PLANT", EntityCategory::Misc),
220            Self::Food => EntityType::custom("FOOD", EntityCategory::Misc),
221            Self::Misc => EntityType::Other("MISC".to_string()),
222        }
223    }
224}
225
226// =============================================================================
227// Dataset Schemas
228// =============================================================================
229
/// Dataset label schemas for which a ready-made mapping exists.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DatasetSchema {
    /// CoNLL-2003: `PER`, `LOC`, `ORG`, `MISC`.
    CoNLL2003,
    /// OntoNotes 5.0: 18 types, including `GPE`, `NORP` and `FAC`.
    OntoNotes,
    /// MultiNERD: 15 types.
    MultiNERD,
    /// FewNERD: 8 coarse + 66 fine types.
    FewNERD,
    /// CrossNER: domain-specific types.
    CrossNER,
    /// BC5CDR: `Chemical`, `Disease`.
    BC5CDR,
    /// NCBI Disease: `Disease` only.
    NCBIDisease,
    /// MIT Movie: `Actor`, `Director`, `Title`, etc.
    MITMovie,
    /// MIT Restaurant: `Restaurant_Name`, `Cuisine`, etc.
    MITRestaurant,
    /// WNUT-17: `person`, `location`, `corporation`, `product`,
    /// `creative-work`, `group`.
    WNUT17,
}
254
255impl DatasetSchema {
256    /// Get the entity labels used by this dataset.
257    #[must_use]
258    pub fn labels(&self) -> &'static [&'static str] {
259        match self {
260            Self::CoNLL2003 => &["PER", "LOC", "ORG", "MISC"],
261            Self::OntoNotes => &[
262                "PERSON",
263                "NORP",
264                "FAC",
265                "ORG",
266                "GPE",
267                "LOC",
268                "PRODUCT",
269                "EVENT",
270                "WORK_OF_ART",
271                "LAW",
272                "LANGUAGE",
273                "DATE",
274                "TIME",
275                "PERCENT",
276                "MONEY",
277                "QUANTITY",
278                "ORDINAL",
279                "CARDINAL",
280            ],
281            Self::MultiNERD => &[
282                "PER", "LOC", "ORG", "ANIM", "BIO", "CEL", "DIS", "EVE", "FOOD", "INST", "MEDIA",
283                "MYTH", "PLANT", "TIME", "VEHI",
284            ],
285            Self::FewNERD => &[
286                "person",
287                "organization",
288                "location",
289                "building",
290                "art",
291                "product",
292                "event",
293                "other",
294            ],
295            Self::CrossNER => &[
296                "politician",
297                "election",
298                "political_party",
299                "country",
300                "location",
301                "organization",
302                "person",
303                "misc",
304            ],
305            Self::BC5CDR => &["Chemical", "Disease"],
306            Self::NCBIDisease => &["Disease"],
307            Self::MITMovie => &[
308                "Actor",
309                "Director",
310                "Genre",
311                "Title",
312                "Year",
313                "Song",
314                "Character",
315                "Plot",
316                "Rating",
317            ],
318            Self::MITRestaurant => &[
319                "Amenity",
320                "Cuisine",
321                "Dish",
322                "Hours",
323                "Location",
324                "Price",
325                "Rating",
326                "Restaurant_Name",
327            ],
328            Self::WNUT17 => &[
329                "person",
330                "location",
331                "corporation",
332                "product",
333                "creative-work",
334                "group",
335            ],
336        }
337    }
338}
339
340// =============================================================================
341// Information Loss Tracking
342// =============================================================================
343
344/// Documents information lost during schema mapping.
345#[derive(Debug, Clone)]
346pub struct InformationLoss {
347    /// The original fine-grained label
348    pub original: String,
349    /// The coarse canonical type it maps to
350    pub canonical: CanonicalType,
351    /// What semantic information is lost
352    pub lost_semantics: &'static str,
353}
354
355// =============================================================================
356// Schema Mapper
357// =============================================================================
358
359/// Maps dataset-specific labels to canonical types.
360#[derive(Debug, Clone)]
361pub struct SchemaMapper {
362    /// The source dataset schema
363    pub source_schema: DatasetSchema,
364    /// Label → CanonicalType mapping
365    mappings: HashMap<String, CanonicalType>,
366    /// Label → InformationLoss (if any)
367    losses: HashMap<String, InformationLoss>,
368}
369
370impl SchemaMapper {
371    /// Create a mapper for a specific dataset.
372    #[must_use]
373    pub fn for_dataset(schema: DatasetSchema) -> Self {
374        let mut mapper = Self {
375            source_schema: schema,
376            mappings: HashMap::new(),
377            losses: HashMap::new(),
378        };
379
380        match schema {
381            DatasetSchema::CoNLL2003 => {
382                mapper.add("PER", CanonicalType::Person);
383                mapper.add("LOC", CanonicalType::Location);
384                mapper.add("ORG", CanonicalType::Organization);
385                mapper.add("MISC", CanonicalType::Misc);
386            }
387            DatasetSchema::OntoNotes => {
388                // Person types
389                mapper.add("PERSON", CanonicalType::Person);
390
391                // CRITICAL: NORP is NOT Organization!
392                mapper.add_with_loss(
393                    "NORP",
394                    CanonicalType::Group,
395                    "Nationalities/religions/politics - distinct from formal organizations",
396                );
397
398                // Location types - preserve distinctions
399                mapper.add("GPE", CanonicalType::GeopoliticalEntity);
400                mapper.add_with_loss(
401                    "LOC",
402                    CanonicalType::NaturalLocation,
403                    "Natural locations (mountains, rivers)",
404                );
405                mapper.add_with_loss(
406                    "FAC",
407                    CanonicalType::Facility,
408                    "Man-made structures (buildings, bridges)",
409                );
410
411                // Organization
412                mapper.add("ORG", CanonicalType::Organization);
413
414                // Temporal
415                mapper.add("DATE", CanonicalType::Date);
416                mapper.add("TIME", CanonicalType::Time);
417
418                // Numeric
419                mapper.add("MONEY", CanonicalType::Money);
420                mapper.add("PERCENT", CanonicalType::Percent);
421                mapper.add("QUANTITY", CanonicalType::Quantity);
422                mapper.add("CARDINAL", CanonicalType::Cardinal);
423                mapper.add("ORDINAL", CanonicalType::Ordinal);
424
425                // Creative/Legal
426                mapper.add("PRODUCT", CanonicalType::Product);
427                mapper.add("EVENT", CanonicalType::Event);
428                mapper.add("WORK_OF_ART", CanonicalType::CreativeWork);
429                mapper.add("LAW", CanonicalType::Law);
430                mapper.add("LANGUAGE", CanonicalType::Language);
431            }
432            DatasetSchema::MultiNERD => {
433                mapper.add("PER", CanonicalType::Person);
434                mapper.add("LOC", CanonicalType::Location);
435                mapper.add("ORG", CanonicalType::Organization);
436                mapper.add("ANIM", CanonicalType::Animal);
437                mapper.add_with_loss("BIO", CanonicalType::Misc, "Biological entities");
438                mapper.add_with_loss("CEL", CanonicalType::Misc, "Celestial bodies");
439                mapper.add("DIS", CanonicalType::Disease);
440                mapper.add("EVE", CanonicalType::Event);
441                mapper.add("FOOD", CanonicalType::Food);
442                mapper.add_with_loss("INST", CanonicalType::Misc, "Instruments");
443                mapper.add_with_loss("MEDIA", CanonicalType::CreativeWork, "Media works");
444                mapper.add_with_loss("MYTH", CanonicalType::Misc, "Mythological entities");
445                mapper.add("PLANT", CanonicalType::Plant);
446                mapper.add("TIME", CanonicalType::Time);
447                mapper.add_with_loss("VEHI", CanonicalType::Product, "Vehicles");
448            }
449            DatasetSchema::FewNERD => {
450                mapper.add("person", CanonicalType::Person);
451                mapper.add("organization", CanonicalType::Organization);
452                mapper.add("location", CanonicalType::Location);
453                mapper.add_with_loss("building", CanonicalType::Facility, "Buildings/structures");
454                mapper.add("art", CanonicalType::CreativeWork);
455                mapper.add("product", CanonicalType::Product);
456                mapper.add("event", CanonicalType::Event);
457                mapper.add("other", CanonicalType::Misc);
458            }
459            DatasetSchema::CrossNER => {
460                mapper.add_with_loss("politician", CanonicalType::Person, "Political role lost");
461                mapper.add_with_loss(
462                    "election",
463                    CanonicalType::Event,
464                    "Election specificity lost",
465                );
466                mapper.add_with_loss(
467                    "political_party",
468                    CanonicalType::Organization,
469                    "Political nature lost",
470                );
471                mapper.add("country", CanonicalType::GeopoliticalEntity);
472                mapper.add("location", CanonicalType::Location);
473                mapper.add("organization", CanonicalType::Organization);
474                mapper.add("person", CanonicalType::Person);
475                mapper.add("misc", CanonicalType::Misc);
476            }
477            DatasetSchema::BC5CDR => {
478                mapper.add("Chemical", CanonicalType::Chemical);
479                mapper.add("Disease", CanonicalType::Disease);
480            }
481            DatasetSchema::NCBIDisease => {
482                mapper.add("Disease", CanonicalType::Disease);
483            }
484            DatasetSchema::MITMovie => {
485                mapper.add_with_loss("Actor", CanonicalType::Person, "Acting role lost");
486                mapper.add_with_loss("Director", CanonicalType::Person, "Directing role lost");
487                mapper.add_with_loss("Character", CanonicalType::Person, "Fictional status lost");
488                mapper.add("Title", CanonicalType::CreativeWork);
489                mapper.add("Year", CanonicalType::Date);
490                mapper.add_with_loss("Song", CanonicalType::CreativeWork, "Song vs film lost");
491                mapper.add_with_loss("Genre", CanonicalType::Misc, "Genre semantics lost");
492                mapper.add_with_loss("Plot", CanonicalType::Misc, "Plot description lost");
493                mapper.add_with_loss("Rating", CanonicalType::Misc, "Rating semantics lost");
494            }
495            DatasetSchema::MITRestaurant => {
496                mapper.add("Restaurant_Name", CanonicalType::Organization);
497                mapper.add("Location", CanonicalType::Location);
498                mapper.add_with_loss("Cuisine", CanonicalType::Misc, "Cuisine type lost");
499                mapper.add_with_loss("Dish", CanonicalType::Food, "Dish specifics lost");
500                mapper.add("Price", CanonicalType::Money);
501                mapper.add_with_loss("Amenity", CanonicalType::Misc, "Amenity type lost");
502                mapper.add("Hours", CanonicalType::Time);
503                mapper.add_with_loss("Rating", CanonicalType::Misc, "Rating semantics lost");
504            }
505            DatasetSchema::WNUT17 => {
506                mapper.add("person", CanonicalType::Person);
507                mapper.add("location", CanonicalType::Location);
508                mapper.add("corporation", CanonicalType::Organization);
509                mapper.add("product", CanonicalType::Product);
510                mapper.add("creative-work", CanonicalType::CreativeWork);
511                mapper.add("group", CanonicalType::Group);
512            }
513        }
514
515        mapper
516    }
517
518    /// Add a simple mapping (no information loss).
519    fn add(&mut self, label: &str, canonical: CanonicalType) {
520        self.mappings.insert(label.to_uppercase(), canonical);
521    }
522
523    /// Add a mapping with documented information loss.
524    fn add_with_loss(
525        &mut self,
526        label: &str,
527        canonical: CanonicalType,
528        lost_semantics: &'static str,
529    ) {
530        let upper = label.to_uppercase();
531        self.mappings.insert(upper.clone(), canonical);
532        self.losses.insert(
533            upper.clone(),
534            InformationLoss {
535                original: label.to_string(),
536                canonical,
537                lost_semantics,
538            },
539        );
540    }
541
542    /// Map a dataset label to canonical type.
543    #[must_use]
544    pub fn to_canonical(&self, label: &str) -> CanonicalType {
545        self.mappings
546            .get(&label.to_uppercase())
547            .copied()
548            .unwrap_or(CanonicalType::Misc)
549    }
550
551    /// Get information loss for a label (if any).
552    #[must_use]
553    pub fn information_loss(&self, label: &str) -> Option<&InformationLoss> {
554        self.losses.get(&label.to_uppercase())
555    }
556
557    /// Map to EntityType for compatibility with existing code.
558    #[must_use]
559    pub fn to_entity_type(&self, label: &str) -> EntityType {
560        self.to_canonical(label).to_entity_type()
561    }
562
563    /// Get all mappings that have information loss.
564    pub fn all_losses(&self) -> impl Iterator<Item = &InformationLoss> {
565        self.losses.values()
566    }
567
568    /// Calculate label overlap with another schema.
569    ///
570    /// Used to detect if "zero-shot" evaluation is actually fair.
571    /// High overlap (>80%) means evaluation inflates scores.
572    #[must_use]
573    pub fn label_overlap(&self, other: &SchemaMapper) -> f64 {
574        let self_canonicals: std::collections::HashSet<_> =
575            self.mappings.values().copied().collect();
576        let other_canonicals: std::collections::HashSet<_> =
577            other.mappings.values().copied().collect();
578
579        let intersection = self_canonicals.intersection(&other_canonicals).count();
580        let union = self_canonicals.union(&other_canonicals).count();
581
582        if union == 0 {
583            0.0
584        } else {
585            intersection as f64 / union as f64
586        }
587    }
588}
589
590// =============================================================================
591// Unified Mapping Function (replaces all the ad-hoc ones)
592// =============================================================================
593
594/// Unified label mapping - THE SINGLE SOURCE OF TRUTH.
595///
596/// Replaces:
597/// - `EntityType::from_label()` (partial)
598/// - `map_entity_type()` in loader.rs
599/// - `map_label_to_entity_type()` in datasets.rs
600/// - `string_to_entity_type()` in bio_adapter.rs
601///
602/// # Arguments
603/// * `label` - The entity type label from any dataset
604/// * `schema` - Optional source schema for precise mapping
605///
606/// # Returns
607/// The canonical EntityType
608#[must_use]
609pub fn map_to_canonical(label: &str, schema: Option<DatasetSchema>) -> EntityType {
610    let label = label
611        .strip_prefix("B-")
612        .or_else(|| label.strip_prefix("I-"))
613        .or_else(|| label.strip_prefix("E-"))
614        .or_else(|| label.strip_prefix("S-"))
615        .or_else(|| label.strip_prefix("L-"))
616        .or_else(|| label.strip_prefix("U-"))
617        .unwrap_or(label);
618
619    if let Some(schema) = schema {
620        SchemaMapper::for_dataset(schema).to_entity_type(label)
621    } else {
622        // Fallback: use heuristic mapping
623        map_label_heuristic(label)
624    }
625}
626
627/// Heuristic mapping when schema is unknown.
628fn map_label_heuristic(label: &str) -> EntityType {
629    match label.to_uppercase().as_str() {
630        // Person types
631        "PER" | "PERSON" | "ACTOR" | "DIRECTOR" | "CHARACTER" | "POLITICIAN" => EntityType::Person,
632
633        // NORP - distinct from ORG!
634        "NORP" | "GROUP" | "NATIONALITY" | "RELIGION" => {
635            EntityType::custom("GROUP", EntityCategory::Agent)
636        }
637
638        // Organization types
639        "ORG" | "ORGANIZATION" | "ORGANISATION" | "CORPORATION" | "COMPANY" | "POLITICAL_PARTY"
640        | "RESTAURANT_NAME" => EntityType::Organization,
641
642        // Location types - preserve GPE/FAC when possible
643        "GPE" | "COUNTRY" | "CITY" | "STATE" => EntityType::custom("GPE", EntityCategory::Place),
644        "FAC" | "FACILITY" | "BUILDING" => EntityType::custom("FAC", EntityCategory::Place),
645        "LOC" | "LOCATION" | "GEO" => EntityType::Location,
646
647        // Temporal
648        "DATE" | "YEAR" => EntityType::Date,
649        "TIME" | "HOURS" => EntityType::Time,
650
651        // Numeric
652        "MONEY" | "PRICE" | "CURRENCY" => EntityType::Money,
653        "PERCENT" | "PERCENTAGE" => EntityType::Percent,
654        "QUANTITY" => EntityType::Quantity,
655        "CARDINAL" => EntityType::Cardinal,
656        "ORDINAL" => EntityType::Ordinal,
657
658        // Creative/Legal
659        "PRODUCT" | "PROD" => EntityType::custom("PRODUCT", EntityCategory::Misc),
660        "EVENT" | "EVE" | "ELECTION" => EntityType::custom("EVENT", EntityCategory::Misc),
661        "WORK_OF_ART" | "CREATIVE-WORK" | "TITLE" | "SONG" | "ART" | "MEDIA" | "BOOK" => {
662            EntityType::custom("WORK_OF_ART", EntityCategory::Creative)
663        }
664        "LAW" => EntityType::custom("LAW", EntityCategory::Misc),
665        "LANGUAGE" => EntityType::custom("LANGUAGE", EntityCategory::Misc),
666
667        // Historical/Official types (CHisIEC - Ancient Chinese)
668        "OFI" | "OFFICIAL" | "POSITION" | "TITLE_OFFICE" => {
669            EntityType::custom("OFFICIAL", EntityCategory::Misc)
670        }
671
672        // Biomedical
673        "DISEASE" | "DIS" => EntityType::custom("DISEASE", EntityCategory::Agent),
674        "CHEMICAL" => EntityType::custom("CHEMICAL", EntityCategory::Misc),
675        "GENE" => EntityType::custom("GENE", EntityCategory::Misc),
676        "DRUG" => EntityType::custom("DRUG", EntityCategory::Misc),
677
678        // Other domain types
679        "ANIM" | "ANIMAL" => EntityType::custom("ANIMAL", EntityCategory::Misc),
680        "PLANT" => EntityType::custom("PLANT", EntityCategory::Misc),
681        "FOOD" | "DISH" | "CUISINE" => EntityType::custom("FOOD", EntityCategory::Misc),
682        "VEHI" | "VEHICLE" => EntityType::custom("VEHICLE", EntityCategory::Misc),
683
684        // Contact
685        "EMAIL" => EntityType::Email,
686        "URL" | "URI" => EntityType::Url,
687        "PHONE" | "TELEPHONE" => EntityType::Phone,
688
689        // Misc fallback
690        "MISC" | "MISCELLANEOUS" | "O" | "OTHER" => EntityType::Other("MISC".to_string()),
691
692        // Unknown - preserve original
693        other => EntityType::Other(other.to_string()),
694    }
695}
696
697// =============================================================================
698// Coarse Schema for Training
699// =============================================================================
700
/// Coarse-grained label set for training on several datasets at once.
///
/// Collapsing every dataset onto these six buckets avoids label conflicts
/// when datasets are concatenated.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CoarseType {
    /// Any person entity.
    Person,
    /// Any organization entity.
    Organization,
    /// Any location entity.
    Location,
    /// Any temporal entity (dates and times).
    DateTime,
    /// Any numeric entity.
    Numeric,
    /// Everything that fits none of the buckets above.
    Other,
}
719
720impl CoarseType {
721    /// Map from canonical type.
722    #[must_use]
723    pub fn from_canonical(ct: CanonicalType) -> Self {
724        match ct {
725            CanonicalType::Person | CanonicalType::Group => Self::Person,
726            CanonicalType::Organization => Self::Organization,
727            CanonicalType::GeopoliticalEntity
728            | CanonicalType::NaturalLocation
729            | CanonicalType::Facility
730            | CanonicalType::Location => Self::Location,
731            CanonicalType::Date | CanonicalType::Time => Self::DateTime,
732            CanonicalType::Money
733            | CanonicalType::Percent
734            | CanonicalType::Quantity
735            | CanonicalType::Cardinal
736            | CanonicalType::Ordinal => Self::Numeric,
737            _ => Self::Other,
738        }
739    }
740
741    /// Map from any label.
742    #[must_use]
743    pub fn from_label(label: &str) -> Self {
744        let canonical = SchemaMapper::for_dataset(DatasetSchema::OntoNotes).to_canonical(label);
745        Self::from_canonical(canonical)
746    }
747}
748
749// =============================================================================
750// Tests
751// =============================================================================
752
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the mapper most tests exercise.
    fn ontonotes() -> SchemaMapper {
        SchemaMapper::for_dataset(DatasetSchema::OntoNotes)
    }

    #[test]
    fn test_norp_is_not_organization() {
        let mapper = ontonotes();
        let (norp, org) = (mapper.to_canonical("NORP"), mapper.to_canonical("ORG"));

        assert_eq!(norp, CanonicalType::Group);
        assert_eq!(org, CanonicalType::Organization);
        assert_ne!(norp, org, "NORP should NOT map to Organization!");
    }

    #[test]
    fn test_location_distinctions_preserved() {
        let mapper = ontonotes();
        let expectations = [
            ("GPE", CanonicalType::GeopoliticalEntity),
            ("LOC", CanonicalType::NaturalLocation),
            ("FAC", CanonicalType::Facility),
        ];

        for (label, expected) in expectations {
            assert_eq!(mapper.to_canonical(label), expected);
        }
    }

    #[test]
    fn test_information_loss_documented() {
        let mapper = ontonotes();

        let fac_loss = mapper.information_loss("FAC");
        assert!(fac_loss.is_some());

        // The note must mention structures or buildings.
        let note = fac_loss.unwrap().lost_semantics.to_lowercase();
        assert!(note.contains("structure") || note.contains("building"));
    }

    #[test]
    fn test_conll_to_ontonotes_overlap() {
        let conll = SchemaMapper::for_dataset(DatasetSchema::CoNLL2003);

        // CoNLL has 4 types, OntoNotes has 18 - expect low overlap
        let overlap = conll.label_overlap(&ontonotes());
        assert!(overlap < 0.5);
    }

    #[test]
    fn test_unified_mapping_strips_bio() {
        assert_eq!(map_to_canonical("B-PER", None), EntityType::Person);
        assert_eq!(map_to_canonical("I-ORG", None), EntityType::Organization);
    }

    #[test]
    fn test_coarse_schema() {
        let cases = [
            (CanonicalType::Person, CoarseType::Person),
            (CanonicalType::Group, CoarseType::Person),
            (CanonicalType::GeopoliticalEntity, CoarseType::Location),
        ];

        for (canonical, coarse) in cases {
            assert_eq!(CoarseType::from_canonical(canonical), coarse);
        }
    }
}