1use crate::{EntityCategory, EntityType};
49use std::collections::HashMap;
50
/// Dataset-independent entity type that every source label is mapped onto.
///
/// The variants are deliberately finer-grained than most single datasets so that
/// distinctions such as "group of people" versus "formal organization" survive the mapping.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CanonicalType {
    // --- People and groups ---
    /// An individual person (tag `PERSON`).
    Person,
    /// A group of people that is not a formal organization (tag `GROUP`);
    /// OntoNotes `NORP` and WNUT17 `group` map here.
    Group,

    // --- Organizations ---
    /// A formal organization (tag `ORG`).
    Organization,

    // --- Places ---
    /// A geopolitical entity (tag `GPE`); OntoNotes `GPE` and CrossNER `country` map here.
    GeopoliticalEntity,
    /// A natural location such as a mountain or river (tag `LOC`).
    NaturalLocation,
    /// A man-made structure such as a building or bridge (tag `FAC`).
    Facility,
    /// A location with no finer distinction available (tag `LOCATION`).
    Location,

    // --- Temporal ---
    /// A date expression (tag `DATE`).
    Date,
    /// A time expression (tag `TIME`).
    Time,

    // --- Numeric ---
    /// A monetary amount (tag `MONEY`).
    Money,
    /// A percentage (tag `PERCENT`).
    Percent,
    /// A measured quantity (tag `QUANTITY`).
    Quantity,
    /// A cardinal number (tag `CARDINAL`).
    Cardinal,
    /// An ordinal number (tag `ORDINAL`).
    Ordinal,

    // --- Works, products and other named things ---
    /// A creative work (tag `WORK_OF_ART`).
    CreativeWork,
    /// A product (tag `PRODUCT`).
    Product,
    /// A named event (tag `EVENT`).
    Event,
    /// A law or legal document (tag `LAW`).
    Law,
    /// A language (tag `LANGUAGE`).
    Language,

    // --- Biomedical ---
    /// A disease (tag `DISEASE`).
    Disease,
    /// A chemical (tag `CHEMICAL`).
    Chemical,
    /// A gene (tag `GENE`).
    Gene,
    /// A drug (tag `DRUG`).
    Drug,

    // --- Living things and food ---
    /// An animal (tag `ANIMAL`).
    Animal,
    /// A plant (tag `PLANT`).
    Plant,
    /// A food item (tag `FOOD`).
    Food,

    // --- Fallback ---
    /// Anything that fits none of the other variants (tag `MISC`).
    Misc,
}
133
134impl CanonicalType {
135 #[must_use]
137 pub fn name(&self) -> &'static str {
138 match self {
139 Self::Person => "PERSON",
140 Self::Group => "GROUP",
141 Self::Organization => "ORG",
142 Self::GeopoliticalEntity => "GPE",
143 Self::NaturalLocation => "LOC",
144 Self::Facility => "FAC",
145 Self::Location => "LOCATION",
146 Self::Date => "DATE",
147 Self::Time => "TIME",
148 Self::Money => "MONEY",
149 Self::Percent => "PERCENT",
150 Self::Quantity => "QUANTITY",
151 Self::Cardinal => "CARDINAL",
152 Self::Ordinal => "ORDINAL",
153 Self::CreativeWork => "WORK_OF_ART",
154 Self::Product => "PRODUCT",
155 Self::Event => "EVENT",
156 Self::Law => "LAW",
157 Self::Language => "LANGUAGE",
158 Self::Disease => "DISEASE",
159 Self::Chemical => "CHEMICAL",
160 Self::Gene => "GENE",
161 Self::Drug => "DRUG",
162 Self::Animal => "ANIMAL",
163 Self::Plant => "PLANT",
164 Self::Food => "FOOD",
165 Self::Misc => "MISC",
166 }
167 }
168
169 #[must_use]
171 pub fn category(&self) -> EntityCategory {
172 match self {
173 Self::Person | Self::Group => EntityCategory::Agent,
174 Self::Organization => EntityCategory::Organization,
175 Self::GeopoliticalEntity | Self::NaturalLocation | Self::Facility | Self::Location => {
176 EntityCategory::Place
177 }
178 Self::Date | Self::Time => EntityCategory::Temporal,
179 Self::Money | Self::Percent | Self::Quantity | Self::Cardinal | Self::Ordinal => {
180 EntityCategory::Numeric
181 }
182 Self::CreativeWork | Self::Product | Self::Event | Self::Law | Self::Language => {
183 EntityCategory::Creative
184 }
185 Self::Disease | Self::Chemical | Self::Gene | Self::Drug => EntityCategory::Agent,
186 Self::Animal | Self::Plant | Self::Food => EntityCategory::Misc,
187 Self::Misc => EntityCategory::Misc,
188 }
189 }
190
191 #[must_use]
193 pub fn to_entity_type(&self) -> EntityType {
194 match self {
195 Self::Person => EntityType::Person,
196 Self::Group => EntityType::custom("GROUP", EntityCategory::Agent),
197 Self::Organization => EntityType::Organization,
198 Self::GeopoliticalEntity => EntityType::custom("GPE", EntityCategory::Place),
199 Self::NaturalLocation => EntityType::Location,
200 Self::Facility => EntityType::custom("FAC", EntityCategory::Place),
201 Self::Location => EntityType::Location,
202 Self::Date => EntityType::Date,
203 Self::Time => EntityType::Time,
204 Self::Money => EntityType::Money,
205 Self::Percent => EntityType::Percent,
206 Self::Quantity => EntityType::Quantity,
207 Self::Cardinal => EntityType::Cardinal,
208 Self::Ordinal => EntityType::Ordinal,
209 Self::CreativeWork => EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
210 Self::Product => EntityType::custom("PRODUCT", EntityCategory::Misc),
211 Self::Event => EntityType::custom("EVENT", EntityCategory::Misc),
212 Self::Law => EntityType::custom("LAW", EntityCategory::Misc),
213 Self::Language => EntityType::custom("LANGUAGE", EntityCategory::Misc),
214 Self::Disease => EntityType::custom("DISEASE", EntityCategory::Agent),
215 Self::Chemical => EntityType::custom("CHEMICAL", EntityCategory::Misc),
216 Self::Gene => EntityType::custom("GENE", EntityCategory::Misc),
217 Self::Drug => EntityType::custom("DRUG", EntityCategory::Misc),
218 Self::Animal => EntityType::custom("ANIMAL", EntityCategory::Misc),
219 Self::Plant => EntityType::custom("PLANT", EntityCategory::Misc),
220 Self::Food => EntityType::custom("FOOD", EntityCategory::Misc),
221 Self::Misc => EntityType::Other("MISC".to_string()),
222 }
223 }
224}
225
/// Label schemas of the datasets this module knows how to map.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DatasetSchema {
    /// CoNLL-2003 (`PER`, `LOC`, `ORG`, `MISC`).
    CoNLL2003,
    /// OntoNotes, with its 18 labels.
    OntoNotes,
    /// MultiNERD.
    MultiNERD,
    /// Few-NERD (the coarse labels listed by [`DatasetSchema::labels`]).
    FewNERD,
    /// CrossNER (the labels listed by [`DatasetSchema::labels`]).
    CrossNER,
    /// BC5CDR (`Chemical`, `Disease`).
    BC5CDR,
    /// NCBI Disease (`Disease`).
    NCBIDisease,
    /// MIT Movie.
    MITMovie,
    /// MIT Restaurant.
    MITRestaurant,
    /// WNUT-17.
    WNUT17,
}

impl DatasetSchema {
    /// Returns the labels of this schema, spelled and ordered as in the tables below.
    ///
    /// Labels carry no `B-`/`I-` style tagging prefix.
    #[must_use]
    pub fn labels(&self) -> &'static [&'static str] {
        const CONLL_2003: &[&str] = &["PER", "LOC", "ORG", "MISC"];
        const ONTONOTES: &[&str] = &[
            "PERSON",
            "NORP",
            "FAC",
            "ORG",
            "GPE",
            "LOC",
            "PRODUCT",
            "EVENT",
            "WORK_OF_ART",
            "LAW",
            "LANGUAGE",
            "DATE",
            "TIME",
            "PERCENT",
            "MONEY",
            "QUANTITY",
            "ORDINAL",
            "CARDINAL",
        ];
        const MULTINERD: &[&str] = &[
            "PER", "LOC", "ORG", "ANIM", "BIO", "CEL", "DIS", "EVE", "FOOD", "INST", "MEDIA",
            "MYTH", "PLANT", "TIME", "VEHI",
        ];
        const FEW_NERD: &[&str] = &[
            "person",
            "organization",
            "location",
            "building",
            "art",
            "product",
            "event",
            "other",
        ];
        const CROSS_NER: &[&str] = &[
            "politician",
            "election",
            "political_party",
            "country",
            "location",
            "organization",
            "person",
            "misc",
        ];
        const BC5CDR: &[&str] = &["Chemical", "Disease"];
        const NCBI_DISEASE: &[&str] = &["Disease"];
        const MIT_MOVIE: &[&str] = &[
            "Actor",
            "Director",
            "Genre",
            "Title",
            "Year",
            "Song",
            "Character",
            "Plot",
            "Rating",
        ];
        const MIT_RESTAURANT: &[&str] = &[
            "Amenity",
            "Cuisine",
            "Dish",
            "Hours",
            "Location",
            "Price",
            "Rating",
            "Restaurant_Name",
        ];
        const WNUT_17: &[&str] = &[
            "person",
            "location",
            "corporation",
            "product",
            "creative-work",
            "group",
        ];

        match *self {
            Self::CoNLL2003 => CONLL_2003,
            Self::OntoNotes => ONTONOTES,
            Self::MultiNERD => MULTINERD,
            Self::FewNERD => FEW_NERD,
            Self::CrossNER => CROSS_NER,
            Self::BC5CDR => BC5CDR,
            Self::NCBIDisease => NCBI_DISEASE,
            Self::MITMovie => MIT_MOVIE,
            Self::MITRestaurant => MIT_RESTAURANT,
            Self::WNUT17 => WNUT_17,
        }
    }
}
339
340#[derive(Debug, Clone)]
346pub struct InformationLoss {
347 pub original: String,
349 pub canonical: CanonicalType,
351 pub lost_semantics: &'static str,
353}
354
355#[derive(Debug, Clone)]
361pub struct SchemaMapper {
362 pub source_schema: DatasetSchema,
364 mappings: HashMap<String, CanonicalType>,
366 losses: HashMap<String, InformationLoss>,
368}
369
370impl SchemaMapper {
371 #[must_use]
373 pub fn for_dataset(schema: DatasetSchema) -> Self {
374 let mut mapper = Self {
375 source_schema: schema,
376 mappings: HashMap::new(),
377 losses: HashMap::new(),
378 };
379
380 match schema {
381 DatasetSchema::CoNLL2003 => {
382 mapper.add("PER", CanonicalType::Person);
383 mapper.add("LOC", CanonicalType::Location);
384 mapper.add("ORG", CanonicalType::Organization);
385 mapper.add("MISC", CanonicalType::Misc);
386 }
387 DatasetSchema::OntoNotes => {
388 mapper.add("PERSON", CanonicalType::Person);
390
391 mapper.add_with_loss(
393 "NORP",
394 CanonicalType::Group,
395 "Nationalities/religions/politics - distinct from formal organizations",
396 );
397
398 mapper.add("GPE", CanonicalType::GeopoliticalEntity);
400 mapper.add_with_loss(
401 "LOC",
402 CanonicalType::NaturalLocation,
403 "Natural locations (mountains, rivers)",
404 );
405 mapper.add_with_loss(
406 "FAC",
407 CanonicalType::Facility,
408 "Man-made structures (buildings, bridges)",
409 );
410
411 mapper.add("ORG", CanonicalType::Organization);
413
414 mapper.add("DATE", CanonicalType::Date);
416 mapper.add("TIME", CanonicalType::Time);
417
418 mapper.add("MONEY", CanonicalType::Money);
420 mapper.add("PERCENT", CanonicalType::Percent);
421 mapper.add("QUANTITY", CanonicalType::Quantity);
422 mapper.add("CARDINAL", CanonicalType::Cardinal);
423 mapper.add("ORDINAL", CanonicalType::Ordinal);
424
425 mapper.add("PRODUCT", CanonicalType::Product);
427 mapper.add("EVENT", CanonicalType::Event);
428 mapper.add("WORK_OF_ART", CanonicalType::CreativeWork);
429 mapper.add("LAW", CanonicalType::Law);
430 mapper.add("LANGUAGE", CanonicalType::Language);
431 }
432 DatasetSchema::MultiNERD => {
433 mapper.add("PER", CanonicalType::Person);
434 mapper.add("LOC", CanonicalType::Location);
435 mapper.add("ORG", CanonicalType::Organization);
436 mapper.add("ANIM", CanonicalType::Animal);
437 mapper.add_with_loss("BIO", CanonicalType::Misc, "Biological entities");
438 mapper.add_with_loss("CEL", CanonicalType::Misc, "Celestial bodies");
439 mapper.add("DIS", CanonicalType::Disease);
440 mapper.add("EVE", CanonicalType::Event);
441 mapper.add("FOOD", CanonicalType::Food);
442 mapper.add_with_loss("INST", CanonicalType::Misc, "Instruments");
443 mapper.add_with_loss("MEDIA", CanonicalType::CreativeWork, "Media works");
444 mapper.add_with_loss("MYTH", CanonicalType::Misc, "Mythological entities");
445 mapper.add("PLANT", CanonicalType::Plant);
446 mapper.add("TIME", CanonicalType::Time);
447 mapper.add_with_loss("VEHI", CanonicalType::Product, "Vehicles");
448 }
449 DatasetSchema::FewNERD => {
450 mapper.add("person", CanonicalType::Person);
451 mapper.add("organization", CanonicalType::Organization);
452 mapper.add("location", CanonicalType::Location);
453 mapper.add_with_loss("building", CanonicalType::Facility, "Buildings/structures");
454 mapper.add("art", CanonicalType::CreativeWork);
455 mapper.add("product", CanonicalType::Product);
456 mapper.add("event", CanonicalType::Event);
457 mapper.add("other", CanonicalType::Misc);
458 }
459 DatasetSchema::CrossNER => {
460 mapper.add_with_loss("politician", CanonicalType::Person, "Political role lost");
461 mapper.add_with_loss(
462 "election",
463 CanonicalType::Event,
464 "Election specificity lost",
465 );
466 mapper.add_with_loss(
467 "political_party",
468 CanonicalType::Organization,
469 "Political nature lost",
470 );
471 mapper.add("country", CanonicalType::GeopoliticalEntity);
472 mapper.add("location", CanonicalType::Location);
473 mapper.add("organization", CanonicalType::Organization);
474 mapper.add("person", CanonicalType::Person);
475 mapper.add("misc", CanonicalType::Misc);
476 }
477 DatasetSchema::BC5CDR => {
478 mapper.add("Chemical", CanonicalType::Chemical);
479 mapper.add("Disease", CanonicalType::Disease);
480 }
481 DatasetSchema::NCBIDisease => {
482 mapper.add("Disease", CanonicalType::Disease);
483 }
484 DatasetSchema::MITMovie => {
485 mapper.add_with_loss("Actor", CanonicalType::Person, "Acting role lost");
486 mapper.add_with_loss("Director", CanonicalType::Person, "Directing role lost");
487 mapper.add_with_loss("Character", CanonicalType::Person, "Fictional status lost");
488 mapper.add("Title", CanonicalType::CreativeWork);
489 mapper.add("Year", CanonicalType::Date);
490 mapper.add_with_loss("Song", CanonicalType::CreativeWork, "Song vs film lost");
491 mapper.add_with_loss("Genre", CanonicalType::Misc, "Genre semantics lost");
492 mapper.add_with_loss("Plot", CanonicalType::Misc, "Plot description lost");
493 mapper.add_with_loss("Rating", CanonicalType::Misc, "Rating semantics lost");
494 }
495 DatasetSchema::MITRestaurant => {
496 mapper.add("Restaurant_Name", CanonicalType::Organization);
497 mapper.add("Location", CanonicalType::Location);
498 mapper.add_with_loss("Cuisine", CanonicalType::Misc, "Cuisine type lost");
499 mapper.add_with_loss("Dish", CanonicalType::Food, "Dish specifics lost");
500 mapper.add("Price", CanonicalType::Money);
501 mapper.add_with_loss("Amenity", CanonicalType::Misc, "Amenity type lost");
502 mapper.add("Hours", CanonicalType::Time);
503 mapper.add_with_loss("Rating", CanonicalType::Misc, "Rating semantics lost");
504 }
505 DatasetSchema::WNUT17 => {
506 mapper.add("person", CanonicalType::Person);
507 mapper.add("location", CanonicalType::Location);
508 mapper.add("corporation", CanonicalType::Organization);
509 mapper.add("product", CanonicalType::Product);
510 mapper.add("creative-work", CanonicalType::CreativeWork);
511 mapper.add("group", CanonicalType::Group);
512 }
513 }
514
515 mapper
516 }
517
518 fn add(&mut self, label: &str, canonical: CanonicalType) {
520 self.mappings.insert(label.to_uppercase(), canonical);
521 }
522
523 fn add_with_loss(
525 &mut self,
526 label: &str,
527 canonical: CanonicalType,
528 lost_semantics: &'static str,
529 ) {
530 let upper = label.to_uppercase();
531 self.mappings.insert(upper.clone(), canonical);
532 self.losses.insert(
533 upper.clone(),
534 InformationLoss {
535 original: label.to_string(),
536 canonical,
537 lost_semantics,
538 },
539 );
540 }
541
542 #[must_use]
544 pub fn to_canonical(&self, label: &str) -> CanonicalType {
545 self.mappings
546 .get(&label.to_uppercase())
547 .copied()
548 .unwrap_or(CanonicalType::Misc)
549 }
550
551 #[must_use]
553 pub fn information_loss(&self, label: &str) -> Option<&InformationLoss> {
554 self.losses.get(&label.to_uppercase())
555 }
556
557 #[must_use]
559 pub fn to_entity_type(&self, label: &str) -> EntityType {
560 self.to_canonical(label).to_entity_type()
561 }
562
563 pub fn all_losses(&self) -> impl Iterator<Item = &InformationLoss> {
565 self.losses.values()
566 }
567
568 #[must_use]
573 pub fn label_overlap(&self, other: &SchemaMapper) -> f64 {
574 let self_canonicals: std::collections::HashSet<_> =
575 self.mappings.values().copied().collect();
576 let other_canonicals: std::collections::HashSet<_> =
577 other.mappings.values().copied().collect();
578
579 let intersection = self_canonicals.intersection(&other_canonicals).count();
580 let union = self_canonicals.union(&other_canonicals).count();
581
582 if union == 0 {
583 0.0
584 } else {
585 intersection as f64 / union as f64
586 }
587 }
588}
589
590#[must_use]
609pub fn map_to_canonical(label: &str, schema: Option<DatasetSchema>) -> EntityType {
610 let label = label
611 .strip_prefix("B-")
612 .or_else(|| label.strip_prefix("I-"))
613 .or_else(|| label.strip_prefix("E-"))
614 .or_else(|| label.strip_prefix("S-"))
615 .or_else(|| label.strip_prefix("L-"))
616 .or_else(|| label.strip_prefix("U-"))
617 .unwrap_or(label);
618
619 if let Some(schema) = schema {
620 SchemaMapper::for_dataset(schema).to_entity_type(label)
621 } else {
622 map_label_heuristic(label)
624 }
625}
626
627fn map_label_heuristic(label: &str) -> EntityType {
629 match label.to_uppercase().as_str() {
630 "PER" | "PERSON" | "ACTOR" | "DIRECTOR" | "CHARACTER" | "POLITICIAN" => EntityType::Person,
632
633 "NORP" | "GROUP" | "NATIONALITY" | "RELIGION" => {
635 EntityType::custom("GROUP", EntityCategory::Agent)
636 }
637
638 "ORG" | "ORGANIZATION" | "ORGANISATION" | "CORPORATION" | "COMPANY" | "POLITICAL_PARTY"
640 | "RESTAURANT_NAME" => EntityType::Organization,
641
642 "GPE" | "COUNTRY" | "CITY" | "STATE" => EntityType::custom("GPE", EntityCategory::Place),
644 "FAC" | "FACILITY" | "BUILDING" => EntityType::custom("FAC", EntityCategory::Place),
645 "LOC" | "LOCATION" | "GEO" => EntityType::Location,
646
647 "DATE" | "YEAR" => EntityType::Date,
649 "TIME" | "HOURS" => EntityType::Time,
650
651 "MONEY" | "PRICE" | "CURRENCY" => EntityType::Money,
653 "PERCENT" | "PERCENTAGE" => EntityType::Percent,
654 "QUANTITY" => EntityType::Quantity,
655 "CARDINAL" => EntityType::Cardinal,
656 "ORDINAL" => EntityType::Ordinal,
657
658 "PRODUCT" | "PROD" => EntityType::custom("PRODUCT", EntityCategory::Misc),
660 "EVENT" | "EVE" | "ELECTION" => EntityType::custom("EVENT", EntityCategory::Misc),
661 "WORK_OF_ART" | "CREATIVE-WORK" | "TITLE" | "SONG" | "ART" | "MEDIA" | "BOOK" => {
662 EntityType::custom("WORK_OF_ART", EntityCategory::Creative)
663 }
664 "LAW" => EntityType::custom("LAW", EntityCategory::Misc),
665 "LANGUAGE" => EntityType::custom("LANGUAGE", EntityCategory::Misc),
666
667 "OFI" | "OFFICIAL" | "POSITION" | "TITLE_OFFICE" => {
669 EntityType::custom("OFFICIAL", EntityCategory::Misc)
670 }
671
672 "DISEASE" | "DIS" => EntityType::custom("DISEASE", EntityCategory::Agent),
674 "CHEMICAL" => EntityType::custom("CHEMICAL", EntityCategory::Misc),
675 "GENE" => EntityType::custom("GENE", EntityCategory::Misc),
676 "DRUG" => EntityType::custom("DRUG", EntityCategory::Misc),
677
678 "ANIM" | "ANIMAL" => EntityType::custom("ANIMAL", EntityCategory::Misc),
680 "PLANT" => EntityType::custom("PLANT", EntityCategory::Misc),
681 "FOOD" | "DISH" | "CUISINE" => EntityType::custom("FOOD", EntityCategory::Misc),
682 "VEHI" | "VEHICLE" => EntityType::custom("VEHICLE", EntityCategory::Misc),
683
684 "EMAIL" => EntityType::Email,
686 "URL" | "URI" => EntityType::Url,
687 "PHONE" | "TELEPHONE" => EntityType::Phone,
688
689 "MISC" | "MISCELLANEOUS" | "O" | "OTHER" => EntityType::Other("MISC".to_string()),
691
692 other => EntityType::Other(other.to_string()),
694 }
695}
696
/// Coarse buckets that every [`CanonicalType`] can be reduced to.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CoarseType {
    /// Individual people and groups of people.
    Person,
    /// Formal organizations.
    Organization,
    /// Any kind of place (geopolitical, natural, facility, generic).
    Location,
    /// Dates and times.
    DateTime,
    /// Money, percentages, quantities, cardinals and ordinals.
    Numeric,
    /// Everything that fits none of the buckets above.
    Other,
}
719
720impl CoarseType {
721 #[must_use]
723 pub fn from_canonical(ct: CanonicalType) -> Self {
724 match ct {
725 CanonicalType::Person | CanonicalType::Group => Self::Person,
726 CanonicalType::Organization => Self::Organization,
727 CanonicalType::GeopoliticalEntity
728 | CanonicalType::NaturalLocation
729 | CanonicalType::Facility
730 | CanonicalType::Location => Self::Location,
731 CanonicalType::Date | CanonicalType::Time => Self::DateTime,
732 CanonicalType::Money
733 | CanonicalType::Percent
734 | CanonicalType::Quantity
735 | CanonicalType::Cardinal
736 | CanonicalType::Ordinal => Self::Numeric,
737 _ => Self::Other,
738 }
739 }
740
741 #[must_use]
743 pub fn from_label(label: &str) -> Self {
744 let canonical = SchemaMapper::for_dataset(DatasetSchema::OntoNotes).to_canonical(label);
745 Self::from_canonical(canonical)
746 }
747}
748
#[cfg(test)]
mod tests {
    use super::*;

    /// OntoNotes NORP (nationalities, religions, political groups) must stay separate from ORG.
    #[test]
    fn test_norp_is_not_organization() {
        let onto = SchemaMapper::for_dataset(DatasetSchema::OntoNotes);
        let (norp, org) = (onto.to_canonical("NORP"), onto.to_canonical("ORG"));

        assert_eq!(norp, CanonicalType::Group);
        assert_eq!(org, CanonicalType::Organization);
        assert_ne!(norp, org, "NORP should NOT map to Organization!");
    }

    /// The three OntoNotes place labels keep their own canonical types.
    #[test]
    fn test_location_distinctions_preserved() {
        let onto = SchemaMapper::for_dataset(DatasetSchema::OntoNotes);

        assert_eq!(
            onto.to_canonical("GPE"),
            CanonicalType::GeopoliticalEntity
        );
        assert_eq!(onto.to_canonical("LOC"), CanonicalType::NaturalLocation);
        assert_eq!(onto.to_canonical("FAC"), CanonicalType::Facility);
    }

    /// A lossy mapping exposes a note describing what was lost.
    #[test]
    fn test_information_loss_documented() {
        let onto = SchemaMapper::for_dataset(DatasetSchema::OntoNotes);

        let loss = onto
            .information_loss("FAC")
            .expect("FAC should have a recorded information loss");
        let note = loss.lost_semantics.to_lowercase();
        assert!(note.contains("structure") || note.contains("building"));
    }

    /// CoNLL-2003 and OntoNotes share fewer than half of their canonical types.
    #[test]
    fn test_conll_to_ontonotes_overlap() {
        let conll = SchemaMapper::for_dataset(DatasetSchema::CoNLL2003);
        let onto = SchemaMapper::for_dataset(DatasetSchema::OntoNotes);

        assert!(conll.label_overlap(&onto) < 0.5);
    }

    /// Tagging prefixes are removed before the heuristic lookup.
    #[test]
    fn test_unified_mapping_strips_bio() {
        assert_eq!(map_to_canonical("B-PER", None), EntityType::Person);
        assert_eq!(map_to_canonical("I-ORG", None), EntityType::Organization);
    }

    /// Canonical types collapse into the expected coarse buckets.
    #[test]
    fn test_coarse_schema() {
        assert_eq!(
            CoarseType::from_canonical(CanonicalType::Person),
            CoarseType::Person
        );
        assert_eq!(
            CoarseType::from_canonical(CanonicalType::Group),
            CoarseType::Person
        );
        assert_eq!(
            CoarseType::from_canonical(CanonicalType::GeopoliticalEntity),
            CoarseType::Location
        );
    }
}