1use std::collections::HashMap;
24use std::sync::RwLock;
25
26use serde::{Deserialize, Serialize};
27
/// Canonical entity type that dataset-specific labels are normalized to.
///
/// The unit variants form the fixed core inventory. [`CoreType::Domain`]
/// carries a static label for types outside that inventory, such as the ones
/// added by `LabelNormalizer::register_biomedical`.
///
/// NOTE(review): `Domain` holds a `&'static str`, so the derived
/// `Deserialize` impl presumably only works for `'static` input — confirm
/// this is intended.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum CoreType {
    /// Labelled `"PER"`.
    Person,
    /// Labelled `"ORG"`.
    Organization,
    /// Labelled `"LOC"`.
    Location,
    /// Labelled `"MISC"`.
    Misc,

    /// Labelled `"DATE"`; reported as pattern-detectable.
    Date,
    /// Labelled `"TIME"`; reported as pattern-detectable.
    Time,

    /// Labelled `"MONEY"`; reported as pattern-detectable.
    Money,
    /// Labelled `"PERCENT"`; reported as pattern-detectable.
    Percent,
    /// Labelled `"QUANTITY"`; reported as pattern-detectable.
    Quantity,
    /// Labelled `"CARDINAL"`; reported as pattern-detectable.
    Cardinal,
    /// Labelled `"ORDINAL"`; reported as pattern-detectable.
    Ordinal,

    /// Labelled `"EMAIL"`; reported as pattern-detectable.
    Email,
    /// Labelled `"URL"`; reported as pattern-detectable.
    Url,
    /// Labelled `"PHONE"`; reported as pattern-detectable.
    Phone,

    /// Labelled `"NORP"`.
    Norp,
    /// Labelled `"FAC"`.
    Facility,
    /// Labelled `"PRODUCT"`.
    Product,
    /// Labelled `"EVENT"`.
    Event,
    /// Labelled `"WORK_OF_ART"`.
    WorkOfArt,
    /// Labelled `"LAW"`.
    Law,
    /// Labelled `"LANGUAGE"`.
    Language,

    /// Type outside the core inventory; the wrapped string is its label.
    Domain(&'static str),
}
95
96impl CoreType {
97 pub fn as_label(&self) -> &'static str {
99 match self {
100 CoreType::Person => "PER",
101 CoreType::Organization => "ORG",
102 CoreType::Location => "LOC",
103 CoreType::Misc => "MISC",
104 CoreType::Date => "DATE",
105 CoreType::Time => "TIME",
106 CoreType::Money => "MONEY",
107 CoreType::Percent => "PERCENT",
108 CoreType::Quantity => "QUANTITY",
109 CoreType::Cardinal => "CARDINAL",
110 CoreType::Ordinal => "ORDINAL",
111 CoreType::Email => "EMAIL",
112 CoreType::Url => "URL",
113 CoreType::Phone => "PHONE",
114 CoreType::Norp => "NORP",
115 CoreType::Facility => "FAC",
116 CoreType::Product => "PRODUCT",
117 CoreType::Event => "EVENT",
118 CoreType::WorkOfArt => "WORK_OF_ART",
119 CoreType::Law => "LAW",
120 CoreType::Language => "LANGUAGE",
121 CoreType::Domain(s) => s,
122 }
123 }
124
125 pub fn is_pattern_detectable(&self) -> bool {
127 matches!(
128 self,
129 CoreType::Date
130 | CoreType::Time
131 | CoreType::Money
132 | CoreType::Percent
133 | CoreType::Quantity
134 | CoreType::Cardinal
135 | CoreType::Ordinal
136 | CoreType::Email
137 | CoreType::Url
138 | CoreType::Phone
139 )
140 }
141}
142
143impl std::fmt::Display for CoreType {
144 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
145 write!(f, "{}", self.as_label())
146 }
147}
148
149pub struct LabelNormalizer {
174 aliases: RwLock<HashMap<String, CoreType>>,
175}
176
177impl Default for LabelNormalizer {
178 fn default() -> Self {
179 let norm = Self {
180 aliases: RwLock::new(HashMap::new()),
181 };
182 norm.register_core_aliases();
183 norm
184 }
185}
186
187impl LabelNormalizer {
188 pub fn new() -> Self {
190 Self {
191 aliases: RwLock::new(HashMap::new()),
192 }
193 }
194
195 pub fn register(&self, alias: &str, core_type: CoreType) {
197 let mut aliases = self.aliases.write().expect("LabelNormalizer lock poisoned");
198 aliases.insert(alias.to_lowercase(), core_type);
199 }
200
201 pub fn register_many(&self, aliases: &[&str], core_type: CoreType) {
203 for alias in aliases {
204 self.register(alias, core_type);
205 }
206 }
207
208 pub fn normalize(&self, label: &str) -> Option<CoreType> {
212 let label = label
214 .strip_prefix("B-")
215 .or_else(|| label.strip_prefix("I-"))
216 .or_else(|| label.strip_prefix("E-"))
217 .or_else(|| label.strip_prefix("S-"))
218 .or_else(|| label.strip_prefix("L-"))
219 .or_else(|| label.strip_prefix("U-"))
220 .unwrap_or(label);
221
222 let aliases = self.aliases.read().expect("LabelNormalizer lock poisoned");
223 aliases.get(&label.to_lowercase()).copied()
224 }
225
226 pub fn is_known(&self, label: &str) -> bool {
228 self.normalize(label).is_some()
229 }
230
231 fn register_core_aliases(&self) {
233 self.register_many(
235 &[
236 "per",
237 "person",
238 "personne", "persona", "person", "pessoa", "человек", "人", "人物", ],
246 CoreType::Person,
247 );
248
249 self.register_many(
251 &[
252 "org",
253 "organization",
254 "organisation", "organización", "organizzazione", "organização", "組織", ],
260 CoreType::Organization,
261 );
262
263 self.register_many(
265 &[
266 "loc", "location", "gpe", "place", "lieu", "lugar", "ort", "地点", ],
272 CoreType::Location,
273 );
274
275 self.register_many(&["misc", "miscellaneous", "other", "o"], CoreType::Misc);
277
278 self.register_many(&["date", "datum", "fecha", "日期"], CoreType::Date);
280 self.register_many(&["time", "zeit", "hora", "時間"], CoreType::Time);
281
282 self.register_many(
284 &["money", "currency", "argent", "geld", "dinero"],
285 CoreType::Money,
286 );
287 self.register_many(&["percent", "percentage"], CoreType::Percent);
288 self.register_many(&["quantity", "qty"], CoreType::Quantity);
289 self.register_many(&["cardinal", "number"], CoreType::Cardinal);
290 self.register_many(&["ordinal"], CoreType::Ordinal);
291
292 self.register_many(&["email", "e-mail", "correo"], CoreType::Email);
294 self.register_many(&["url", "uri", "link", "enlace"], CoreType::Url);
295 self.register_many(&["phone", "telephone", "tel", "telefon"], CoreType::Phone);
296
297 self.register_many(&["norp", "nationality"], CoreType::Norp);
299 self.register_many(&["fac", "facility", "building"], CoreType::Facility);
300 self.register_many(&["product", "produkt", "producto"], CoreType::Product);
301 self.register_many(&["event", "ereignis", "evento"], CoreType::Event);
302 self.register_many(
303 &["work_of_art", "creative-work", "artwork"],
304 CoreType::WorkOfArt,
305 );
306 self.register_many(&["law", "legal", "ley", "gesetz"], CoreType::Law);
307 self.register_many(
308 &["language", "sprache", "idioma", "langue"],
309 CoreType::Language,
310 );
311 }
312
313 pub fn register_biomedical(&self) {
318 self.register("gene", CoreType::Domain("GENE"));
321 self.register("dna", CoreType::Domain("GENE"));
322 self.register("protein", CoreType::Domain("PROTEIN"));
323 self.register("disease", CoreType::Domain("DISEASE"));
324 self.register("chemical", CoreType::Domain("CHEMICAL"));
325 self.register("drug", CoreType::Domain("DRUG"));
326 self.register("cell_line", CoreType::Domain("CELL_LINE"));
327 self.register("cell_type", CoreType::Domain("CELL_TYPE"));
328 self.register("species", CoreType::Domain("SPECIES"));
329 self.register("anatomy", CoreType::Domain("ANATOMY"));
330 }
331
332 pub fn register_legal(&self) {
334 self.register("case_ref", CoreType::Domain("CASE_REF"));
335 self.register("citation", CoreType::Domain("CITATION"));
336 self.register("court", CoreType::Domain("COURT"));
337 self.register("statute", CoreType::Domain("STATUTE"));
338 self.register("judge", CoreType::Domain("JUDGE"));
339 }
340
341 pub fn all_aliases(&self) -> Vec<(String, CoreType)> {
343 let aliases = self.aliases.read().expect("LabelNormalizer lock poisoned");
344 aliases.iter().map(|(k, v)| (k.clone(), *v)).collect()
345 }
346}
347
348use once_cell::sync::Lazy;
353
/// Shared normalizer, built from `LabelNormalizer::default` the first time
/// it is accessed.
pub static NORMALIZER: Lazy<LabelNormalizer> = Lazy::new(LabelNormalizer::default);
356
/// Resolves `label` with the shared [`NORMALIZER`].
///
/// Returns `None` when the label is not a registered alias.
pub fn normalize(label: &str) -> Option<CoreType> {
    NORMALIZER.normalize(label)
}
361
/// Returns `true` if the shared [`NORMALIZER`] can resolve `label`.
pub fn is_known(label: &str) -> bool {
    NORMALIZER.is_known(label)
}
366
/// Identifier of an entity or concept in an external knowledge source.
///
/// `ExternalId::to_iri` renders each variant as a string.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ExternalId {
    /// Wikidata identifier; rendered under `http://www.wikidata.org/entity/`.
    Wikidata(String),
    /// DBpedia resource name; rendered under `http://dbpedia.org/resource/`.
    DBpedia(String),
    /// UMLS identifier; rendered under
    /// `https://uts.nlm.nih.gov/uts/umls/concept/`.
    Umls(String),
    /// Identifier from any other source; rendered as `source:id`.
    Custom {
        /// Name of the source the identifier comes from.
        source: String,
        /// Identifier within that source.
        id: String,
    },
}
391
392impl ExternalId {
393 pub fn wikidata(qid: &str) -> Self {
395 ExternalId::Wikidata(qid.to_string())
396 }
397
398 pub fn dbpedia(resource: &str) -> Self {
400 ExternalId::DBpedia(resource.to_string())
401 }
402
403 pub fn to_iri(&self) -> String {
405 match self {
406 ExternalId::Wikidata(q) => format!("http://www.wikidata.org/entity/{}", q),
407 ExternalId::DBpedia(r) => format!("http://dbpedia.org/resource/{}", r),
408 ExternalId::Umls(c) => format!("https://uts.nlm.nih.gov/uts/umls/concept/{}", c),
409 ExternalId::Custom { source, id } => format!("{}:{}", source, id),
410 }
411 }
412}
413
414pub mod external_ids {
418 use super::ExternalId;
419
420 pub fn person() -> ExternalId {
422 ExternalId::wikidata("Q5")
423 }
424
425 pub fn organization() -> ExternalId {
427 ExternalId::wikidata("Q43229")
428 }
429
430 pub fn location() -> ExternalId {
432 ExternalId::wikidata("Q618123")
433 }
434
435 pub fn date() -> ExternalId {
437 ExternalId::wikidata("Q205892")
438 }
439
440 pub fn money() -> ExternalId {
442 ExternalId::wikidata("Q1368")
443 }
444}
445
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every `(label, expected)` pair resolves as stated.
    fn assert_normalizes(norm: &LabelNormalizer, cases: &[(&str, CoreType)]) {
        for &(label, expected) in cases {
            assert_eq!(norm.normalize(label), Some(expected));
        }
    }

    // Four-class label set: PER / ORG / LOC / MISC.
    #[test]
    fn test_normalize_conll_labels() {
        assert_normalizes(
            &LabelNormalizer::default(),
            &[
                ("PER", CoreType::Person),
                ("ORG", CoreType::Organization),
                ("LOC", CoreType::Location),
                ("MISC", CoreType::Misc),
            ],
        );
    }

    // Longer label names, including GPE folding into Location.
    #[test]
    fn test_normalize_ontonotes_labels() {
        assert_normalizes(
            &LabelNormalizer::default(),
            &[
                ("PERSON", CoreType::Person),
                ("GPE", CoreType::Location),
                ("NORP", CoreType::Norp),
                ("FAC", CoreType::Facility),
            ],
        );
    }

    // Tagging-scheme prefixes are removed before lookup.
    #[test]
    fn test_bio_prefix_stripping() {
        assert_normalizes(
            &LabelNormalizer::default(),
            &[
                ("B-PER", CoreType::Person),
                ("I-PER", CoreType::Person),
                ("E-ORG", CoreType::Organization),
                ("S-LOC", CoreType::Location),
            ],
        );
    }

    // Non-English aliases resolve to the same core types.
    #[test]
    fn test_cross_lingual() {
        assert_normalizes(
            &LabelNormalizer::default(),
            &[
                ("personne", CoreType::Person),
                ("lieu", CoreType::Location),
                ("persona", CoreType::Person),
                ("lugar", CoreType::Location),
                ("ort", CoreType::Location),
            ],
        );
    }

    // Any casing of a known alias resolves identically.
    #[test]
    fn test_case_insensitive() {
        let norm = LabelNormalizer::default();
        for spelling in ["per", "PER", "Per", "PERSON", "person"].iter() {
            assert_eq!(norm.normalize(spelling), Some(CoreType::Person));
        }
    }

    // Biomedical labels only exist after explicit registration.
    #[test]
    fn test_biomedical_registration() {
        let norm = LabelNormalizer::default();
        norm.register_biomedical();

        for label in ["gene", "protein", "disease"].iter() {
            assert!(norm.is_known(label));
        }

        if let Some(CoreType::Domain(tag)) = norm.normalize("gene") {
            assert_eq!(tag, "GENE");
        } else {
            panic!("Expected Domain type");
        }
    }

    #[test]
    fn test_pattern_detectable() {
        for ty in [CoreType::Date, CoreType::Email, CoreType::Money].iter() {
            assert!(ty.is_pattern_detectable());
        }
        for ty in [CoreType::Person, CoreType::Organization].iter() {
            assert!(!ty.is_pattern_detectable());
        }
    }

    // The free functions go through the shared NORMALIZER.
    #[test]
    fn test_global_normalizer() {
        assert_eq!(normalize("PER"), Some(CoreType::Person));
        assert!(is_known("ORG"));
        assert!(!is_known("UNKNOWN_TYPE_XYZ"));
    }

    #[test]
    fn test_external_ids() {
        let person_iri = external_ids::person().to_iri();
        assert_eq!(person_iri, "http://www.wikidata.org/entity/Q5");

        let dbpedia_iri = ExternalId::dbpedia("Person").to_iri();
        assert_eq!(dbpedia_iri, "http://dbpedia.org/resource/Person");
    }
}