Skip to main content

anno_core/core/
ontology.rs

1//! Cross-schema entity type normalization.
2//!
3//! Normalizes labels across NER schemas: CoNLL `PER` = OntoNotes `PERSON` = spaCy `PERSON`.
4//! Strips BIO prefixes automatically (`B-PER` → `PER`).
5//!
6//! # Example
7//!
8//! ```rust
9//! use anno_core::core::ontology::{normalize, is_known, CoreType};
10//!
11//! assert_eq!(normalize("B-PER"), Some(CoreType::Person));
12//! assert_eq!(normalize("PERSON"), Some(CoreType::Person));
13//! assert_eq!(normalize("personne"), Some(CoreType::Person)); // French
14//! assert!(is_known("ORG"));
15//! ```
16//!
//! For domain-specific types, use [`TypeMapper`](super::TypeMapper) instead of extending
//! the ontology.
//!
//! # Non-goals
//!
//! - **OWL/RDF complexity**: Wrong abstraction level for runtime performance
20//!
21//! The goal is **practical interoperability**, not ontological completeness.
22
23use std::collections::HashMap;
24use std::sync::RwLock;
25
26use serde::{Deserialize, Serialize};
27
28// =============================================================================
29// Core Label Normalization
30// =============================================================================
31
32/// Canonical entity type label.
33///
34/// This is the normalized form that all aliases map to.
35/// Keep this list small (8-15 core types) per research recommendations.
36#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
37#[non_exhaustive]
38pub enum CoreType {
39    // === Named Entities (ML-required) ===
40    /// Person names
41    Person,
42    /// Organizations, companies, agencies
43    Organization,
44    /// Locations, places, geopolitical entities
45    Location,
46    /// Miscellaneous named entities
47    Misc,
48
49    // === Temporal (pattern-detectable) ===
50    /// Date expressions
51    Date,
52    /// Time expressions
53    Time,
54
55    // === Numeric (pattern-detectable) ===
56    /// Monetary values
57    Money,
58    /// Percentages
59    Percent,
60    /// Quantities with units
61    Quantity,
62    /// Cardinal numbers
63    Cardinal,
64    /// Ordinal numbers (first, second)
65    Ordinal,
66
67    // === Contact (pattern-detectable) ===
68    /// Email addresses
69    Email,
70    /// URLs/URIs
71    Url,
72    /// Phone numbers
73    Phone,
74
75    // === Extended (OntoNotes-style) ===
76    /// Nationalities, religious/political groups
77    Norp,
78    /// Facilities (buildings, airports)
79    Facility,
80    /// Products
81    Product,
82    /// Events
83    Event,
84    /// Works of art
85    WorkOfArt,
86    /// Laws
87    Law,
88    /// Languages
89    Language,
90
91    // === Domain Extensions ===
92    /// Domain-specific type (registered at runtime)
93    Domain(&'static str),
94}
95
96impl CoreType {
97    /// Get canonical label string.
98    pub fn as_label(&self) -> &'static str {
99        match self {
100            CoreType::Person => "PER",
101            CoreType::Organization => "ORG",
102            CoreType::Location => "LOC",
103            CoreType::Misc => "MISC",
104            CoreType::Date => "DATE",
105            CoreType::Time => "TIME",
106            CoreType::Money => "MONEY",
107            CoreType::Percent => "PERCENT",
108            CoreType::Quantity => "QUANTITY",
109            CoreType::Cardinal => "CARDINAL",
110            CoreType::Ordinal => "ORDINAL",
111            CoreType::Email => "EMAIL",
112            CoreType::Url => "URL",
113            CoreType::Phone => "PHONE",
114            CoreType::Norp => "NORP",
115            CoreType::Facility => "FAC",
116            CoreType::Product => "PRODUCT",
117            CoreType::Event => "EVENT",
118            CoreType::WorkOfArt => "WORK_OF_ART",
119            CoreType::Law => "LAW",
120            CoreType::Language => "LANGUAGE",
121            CoreType::Domain(s) => s,
122        }
123    }
124
125    /// Is this type pattern-detectable (vs ML-required)?
126    pub fn is_pattern_detectable(&self) -> bool {
127        matches!(
128            self,
129            CoreType::Date
130                | CoreType::Time
131                | CoreType::Money
132                | CoreType::Percent
133                | CoreType::Quantity
134                | CoreType::Cardinal
135                | CoreType::Ordinal
136                | CoreType::Email
137                | CoreType::Url
138                | CoreType::Phone
139        )
140    }
141}
142
143impl std::fmt::Display for CoreType {
144    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
145        write!(f, "{}", self.as_label())
146    }
147}
148
149// =============================================================================
150// Label Normalizer
151// =============================================================================
152
153/// Normalizes entity type labels across schemas and languages.
154///
155/// This is a simple alias table, not a complex ontology graph.
156/// Add aliases as needed; the lookup is O(1) via HashMap.
157///
158/// # Example
159///
160/// ```rust
161/// use anno_core::core::ontology::{LabelNormalizer, CoreType};
162///
163/// let norm = LabelNormalizer::default();
164///
165/// // Different schemas resolve to same type
166/// assert_eq!(norm.normalize("PER"), Some(CoreType::Person));
167/// assert_eq!(norm.normalize("PERSON"), Some(CoreType::Person));
168/// assert_eq!(norm.normalize("B-PER"), Some(CoreType::Person)); // BIO prefix stripped
169///
170/// // Cross-lingual
171/// assert_eq!(norm.normalize("personne"), Some(CoreType::Person)); // French
172/// ```
173pub struct LabelNormalizer {
174    aliases: RwLock<HashMap<String, CoreType>>,
175}
176
177impl Default for LabelNormalizer {
178    fn default() -> Self {
179        let norm = Self {
180            aliases: RwLock::new(HashMap::new()),
181        };
182        norm.register_core_aliases();
183        norm
184    }
185}
186
187impl LabelNormalizer {
188    /// Create empty normalizer (no aliases registered).
189    pub fn new() -> Self {
190        Self {
191            aliases: RwLock::new(HashMap::new()),
192        }
193    }
194
195    /// Register an alias for a core type.
196    pub fn register(&self, alias: &str, core_type: CoreType) {
197        let mut aliases = self.aliases.write().expect("LabelNormalizer lock poisoned");
198        aliases.insert(alias.to_lowercase(), core_type);
199    }
200
201    /// Register multiple aliases for a core type.
202    pub fn register_many(&self, aliases: &[&str], core_type: CoreType) {
203        for alias in aliases {
204            self.register(alias, core_type);
205        }
206    }
207
208    /// Normalize a label to its core type.
209    ///
210    /// Handles BIO/BIOES prefixes automatically.
211    pub fn normalize(&self, label: &str) -> Option<CoreType> {
212        // Strip BIO/BIOES prefix
213        let label = label
214            .strip_prefix("B-")
215            .or_else(|| label.strip_prefix("I-"))
216            .or_else(|| label.strip_prefix("E-"))
217            .or_else(|| label.strip_prefix("S-"))
218            .or_else(|| label.strip_prefix("L-"))
219            .or_else(|| label.strip_prefix("U-"))
220            .unwrap_or(label);
221
222        let aliases = self.aliases.read().expect("LabelNormalizer lock poisoned");
223        aliases.get(&label.to_lowercase()).copied()
224    }
225
226    /// Check if a label is known.
227    pub fn is_known(&self, label: &str) -> bool {
228        self.normalize(label).is_some()
229    }
230
231    /// Register all core type aliases.
232    fn register_core_aliases(&self) {
233        // === Person ===
234        self.register_many(
235            &[
236                "per",
237                "person",
238                "personne", // French
239                "persona",  // Spanish/Italian
240                "person",   // German (same)
241                "pessoa",   // Portuguese
242                "человек",  // Russian
243                "人",       // Chinese
244                "人物",     // Japanese
245            ],
246            CoreType::Person,
247        );
248
249        // === Organization ===
250        self.register_many(
251            &[
252                "org",
253                "organization",
254                "organisation",   // British/French
255                "organización",   // Spanish
256                "organizzazione", // Italian
257                "organização",    // Portuguese
258                "組織",           // Japanese
259            ],
260            CoreType::Organization,
261        );
262
263        // === Location ===
264        self.register_many(
265            &[
266                "loc", "location", "gpe", // Geopolitical entity (OntoNotes)
267                "place", "lieu",  // French
268                "lugar", // Spanish
269                "ort",   // German
270                "地点",  // Chinese/Japanese
271            ],
272            CoreType::Location,
273        );
274
275        // === Misc ===
276        self.register_many(&["misc", "miscellaneous", "other", "o"], CoreType::Misc);
277
278        // === Temporal ===
279        self.register_many(&["date", "datum", "fecha", "日期"], CoreType::Date);
280        self.register_many(&["time", "zeit", "hora", "時間"], CoreType::Time);
281
282        // === Numeric ===
283        self.register_many(
284            &["money", "currency", "argent", "geld", "dinero"],
285            CoreType::Money,
286        );
287        self.register_many(&["percent", "percentage"], CoreType::Percent);
288        self.register_many(&["quantity", "qty"], CoreType::Quantity);
289        self.register_many(&["cardinal", "number"], CoreType::Cardinal);
290        self.register_many(&["ordinal"], CoreType::Ordinal);
291
292        // === Contact ===
293        self.register_many(&["email", "e-mail", "correo"], CoreType::Email);
294        self.register_many(&["url", "uri", "link", "enlace"], CoreType::Url);
295        self.register_many(&["phone", "telephone", "tel", "telefon"], CoreType::Phone);
296
297        // === OntoNotes Extended ===
298        self.register_many(&["norp", "nationality"], CoreType::Norp);
299        self.register_many(&["fac", "facility", "building"], CoreType::Facility);
300        self.register_many(&["product", "produkt", "producto"], CoreType::Product);
301        self.register_many(&["event", "ereignis", "evento"], CoreType::Event);
302        self.register_many(
303            &["work_of_art", "creative-work", "artwork"],
304            CoreType::WorkOfArt,
305        );
306        self.register_many(&["law", "legal", "ley", "gesetz"], CoreType::Law);
307        self.register_many(
308            &["language", "sprache", "idioma", "langue"],
309            CoreType::Language,
310        );
311    }
312
313    /// Register biomedical domain types.
314    ///
315    /// These are kept separate because they're domain-specific
316    /// and shouldn't pollute the core type namespace.
317    pub fn register_biomedical(&self) {
318        // Map biomedical types to domain extensions
319        // In practice, you'd use TypeMapper for domain-specific handling
320        self.register("gene", CoreType::Domain("GENE"));
321        self.register("dna", CoreType::Domain("GENE"));
322        self.register("protein", CoreType::Domain("PROTEIN"));
323        self.register("disease", CoreType::Domain("DISEASE"));
324        self.register("chemical", CoreType::Domain("CHEMICAL"));
325        self.register("drug", CoreType::Domain("DRUG"));
326        self.register("cell_line", CoreType::Domain("CELL_LINE"));
327        self.register("cell_type", CoreType::Domain("CELL_TYPE"));
328        self.register("species", CoreType::Domain("SPECIES"));
329        self.register("anatomy", CoreType::Domain("ANATOMY"));
330    }
331
332    /// Register legal domain types.
333    pub fn register_legal(&self) {
334        self.register("case_ref", CoreType::Domain("CASE_REF"));
335        self.register("citation", CoreType::Domain("CITATION"));
336        self.register("court", CoreType::Domain("COURT"));
337        self.register("statute", CoreType::Domain("STATUTE"));
338        self.register("judge", CoreType::Domain("JUDGE"));
339    }
340
341    /// Get all known aliases (for debugging/documentation).
342    pub fn all_aliases(&self) -> Vec<(String, CoreType)> {
343        let aliases = self.aliases.read().expect("LabelNormalizer lock poisoned");
344        aliases.iter().map(|(k, v)| (k.clone(), *v)).collect()
345    }
346}
347
348// =============================================================================
349// Global Instance
350// =============================================================================
351
352use once_cell::sync::Lazy;
353
354/// Global label normalizer with all core aliases pre-registered.
355pub static NORMALIZER: Lazy<LabelNormalizer> = Lazy::new(LabelNormalizer::default);
356
357/// Convenience function to normalize a label using the global normalizer.
358pub fn normalize(label: &str) -> Option<CoreType> {
359    NORMALIZER.normalize(label)
360}
361
362/// Check if a label is known.
363pub fn is_known(label: &str) -> bool {
364    NORMALIZER.is_known(label)
365}
366
367// =============================================================================
368// Optional: External Ontology Links (for KB linking, not NER)
369// =============================================================================
370
371/// External identifier for entity linking (not for NER type classification).
372///
373/// These are useful for linking detected entities to knowledge bases,
374/// but should NOT be used for type hierarchies in NER.
375#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
376pub enum ExternalId {
377    /// Wikidata Q-item (e.g., Q5 for "human")
378    Wikidata(String),
379    /// DBpedia resource URI
380    DBpedia(String),
381    /// UMLS concept (medical)
382    Umls(String),
383    /// Custom external identifier
384    Custom {
385        /// The namespace/source of the identifier (e.g., "freebase", "geonames")
386        source: String,
387        /// The actual identifier value
388        id: String,
389    },
390}
391
392impl ExternalId {
393    /// Create a Wikidata reference.
394    pub fn wikidata(qid: &str) -> Self {
395        ExternalId::Wikidata(qid.to_string())
396    }
397
398    /// Create a DBpedia reference.
399    pub fn dbpedia(resource: &str) -> Self {
400        ExternalId::DBpedia(resource.to_string())
401    }
402
403    /// Get the full IRI/URI.
404    pub fn to_iri(&self) -> String {
405        match self {
406            ExternalId::Wikidata(q) => format!("http://www.wikidata.org/entity/{}", q),
407            ExternalId::DBpedia(r) => format!("http://dbpedia.org/resource/{}", r),
408            ExternalId::Umls(c) => format!("https://uts.nlm.nih.gov/uts/umls/concept/{}", c),
409            ExternalId::Custom { source, id } => format!("{}:{}", source, id),
410        }
411    }
412}
413
414/// Well-known external IDs for core types.
415///
416/// These are provided for entity linking tasks, not for NER classification.
417pub mod external_ids {
418    use super::ExternalId;
419
420    /// Wikidata ID for person (Q5 - human).
421    pub fn person() -> ExternalId {
422        ExternalId::wikidata("Q5")
423    }
424
425    /// Wikidata ID for organization (Q43229).
426    pub fn organization() -> ExternalId {
427        ExternalId::wikidata("Q43229")
428    }
429
430    /// Wikidata ID for location (Q618123 - geographical feature).
431    pub fn location() -> ExternalId {
432        ExternalId::wikidata("Q618123")
433    }
434
435    /// Wikidata ID for date (Q205892 - calendar date).
436    pub fn date() -> ExternalId {
437        ExternalId::wikidata("Q205892")
438    }
439
440    /// Wikidata ID for money (Q1368 - currency).
441    pub fn money() -> ExternalId {
442        ExternalId::wikidata("Q1368")
443    }
444}
445
446// =============================================================================
447// Tests
448// =============================================================================
449
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every `(label, expected)` pair resolves through a
    /// default-constructed normalizer.
    fn assert_all_normalize(cases: &[(&str, CoreType)]) {
        let normalizer = LabelNormalizer::default();
        for &(label, expected) in cases {
            assert_eq!(
                normalizer.normalize(label),
                Some(expected),
                "label: {}",
                label
            );
        }
    }

    #[test]
    fn test_normalize_conll_labels() {
        assert_all_normalize(&[
            ("PER", CoreType::Person),
            ("ORG", CoreType::Organization),
            ("LOC", CoreType::Location),
            ("MISC", CoreType::Misc),
        ]);
    }

    #[test]
    fn test_normalize_ontonotes_labels() {
        assert_all_normalize(&[
            ("PERSON", CoreType::Person),
            ("GPE", CoreType::Location),
            ("NORP", CoreType::Norp),
            ("FAC", CoreType::Facility),
        ]);
    }

    #[test]
    fn test_bio_prefix_stripping() {
        assert_all_normalize(&[
            ("B-PER", CoreType::Person),
            ("I-PER", CoreType::Person),
            ("E-ORG", CoreType::Organization),
            ("S-LOC", CoreType::Location),
        ]);
    }

    #[test]
    fn test_cross_lingual() {
        assert_all_normalize(&[
            // French
            ("personne", CoreType::Person),
            ("lieu", CoreType::Location),
            // Spanish
            ("persona", CoreType::Person),
            ("lugar", CoreType::Location),
            // German
            ("ort", CoreType::Location),
        ]);
    }

    #[test]
    fn test_case_insensitive() {
        assert_all_normalize(&[
            ("per", CoreType::Person),
            ("PER", CoreType::Person),
            ("Per", CoreType::Person),
            ("PERSON", CoreType::Person),
            ("person", CoreType::Person),
        ]);
    }

    #[test]
    fn test_biomedical_registration() {
        let normalizer = LabelNormalizer::default();
        normalizer.register_biomedical();

        for label in ["gene", "protein", "disease"] {
            assert!(normalizer.is_known(label), "label: {}", label);
        }

        // Domain types resolve to the Domain variant carrying the upper-case tag.
        assert_eq!(normalizer.normalize("gene"), Some(CoreType::Domain("GENE")));
    }

    #[test]
    fn test_pattern_detectable() {
        for detectable in [CoreType::Date, CoreType::Email, CoreType::Money] {
            assert!(detectable.is_pattern_detectable());
        }

        for ml_only in [CoreType::Person, CoreType::Organization] {
            assert!(!ml_only.is_pattern_detectable());
        }
    }

    #[test]
    fn test_global_normalizer() {
        // Exercise the free functions backed by the global instance.
        assert_eq!(normalize("PER"), Some(CoreType::Person));
        assert!(is_known("ORG"));
        assert!(!is_known("UNKNOWN_TYPE_XYZ"));
    }

    #[test]
    fn test_external_ids() {
        let person = external_ids::person();
        assert_eq!(person.to_iri(), "http://www.wikidata.org/entity/Q5");

        let resource = ExternalId::dbpedia("Person");
        assert_eq!(resource.to_iri(), "http://dbpedia.org/resource/Person");
    }
}