use crate::{EntityCategory, EntityType};
use std::collections::HashMap;
/// Dataset-independent entity type that schema-specific labels are mapped onto.
///
/// `name()` yields each variant's tag string, `category()` its `EntityCategory`, and
/// `to_entity_type()` the matching `EntityType`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CanonicalType {
/// An individual person (tag `PERSON`).
Person,
/// A group of people (tag `GROUP`); OntoNotes `NORP` and WNUT17 `group` map here.
Group,
/// An organization (tag `ORG`).
Organization,
/// A geopolitical entity (tag `GPE`); OntoNotes `GPE` and CrossNER `country` map here.
GeopoliticalEntity,
/// A natural location such as a mountain or river (tag `LOC`); OntoNotes `LOC` maps here.
NaturalLocation,
/// A man-made structure such as a building or bridge (tag `FAC`).
Facility,
/// A location with no finer distinction (tag `LOCATION`); used by schemas that only
/// have a single location label.
Location,
/// A date (tag `DATE`).
Date,
/// A time (tag `TIME`).
Time,
/// A monetary amount (tag `MONEY`).
Money,
/// A percentage (tag `PERCENT`).
Percent,
/// A quantity (tag `QUANTITY`).
Quantity,
/// A cardinal number (tag `CARDINAL`).
Cardinal,
/// An ordinal number (tag `ORDINAL`).
Ordinal,
/// A creative work (tag `WORK_OF_ART`).
CreativeWork,
/// A product (tag `PRODUCT`).
Product,
/// An event (tag `EVENT`).
Event,
/// A law (tag `LAW`).
Law,
/// A language (tag `LANGUAGE`).
Language,
/// A disease (tag `DISEASE`).
Disease,
/// A chemical (tag `CHEMICAL`).
Chemical,
/// A gene (tag `GENE`).
Gene,
/// A drug (tag `DRUG`).
Drug,
/// An animal (tag `ANIMAL`).
Animal,
/// A plant (tag `PLANT`).
Plant,
/// A food item (tag `FOOD`).
Food,
/// Catch-all (tag `MISC`); also the fallback for labels a mapper does not know.
Misc,
}
impl CanonicalType {
#[must_use]
pub fn name(&self) -> &'static str {
match self {
Self::Person => "PERSON",
Self::Group => "GROUP",
Self::Organization => "ORG",
Self::GeopoliticalEntity => "GPE",
Self::NaturalLocation => "LOC",
Self::Facility => "FAC",
Self::Location => "LOCATION",
Self::Date => "DATE",
Self::Time => "TIME",
Self::Money => "MONEY",
Self::Percent => "PERCENT",
Self::Quantity => "QUANTITY",
Self::Cardinal => "CARDINAL",
Self::Ordinal => "ORDINAL",
Self::CreativeWork => "WORK_OF_ART",
Self::Product => "PRODUCT",
Self::Event => "EVENT",
Self::Law => "LAW",
Self::Language => "LANGUAGE",
Self::Disease => "DISEASE",
Self::Chemical => "CHEMICAL",
Self::Gene => "GENE",
Self::Drug => "DRUG",
Self::Animal => "ANIMAL",
Self::Plant => "PLANT",
Self::Food => "FOOD",
Self::Misc => "MISC",
}
}
#[must_use]
pub fn category(&self) -> EntityCategory {
match self {
Self::Person | Self::Group => EntityCategory::Agent,
Self::Organization => EntityCategory::Organization,
Self::GeopoliticalEntity | Self::NaturalLocation | Self::Facility | Self::Location => {
EntityCategory::Place
}
Self::Date | Self::Time => EntityCategory::Temporal,
Self::Money | Self::Percent | Self::Quantity | Self::Cardinal | Self::Ordinal => {
EntityCategory::Numeric
}
Self::CreativeWork | Self::Product | Self::Event | Self::Law | Self::Language => {
EntityCategory::Creative
}
Self::Disease | Self::Chemical | Self::Gene | Self::Drug => EntityCategory::Agent,
Self::Animal | Self::Plant | Self::Food => EntityCategory::Misc,
Self::Misc => EntityCategory::Misc,
}
}
#[must_use]
pub fn to_entity_type(&self) -> EntityType {
match self {
Self::Person => EntityType::Person,
Self::Group => EntityType::custom("GROUP", EntityCategory::Agent),
Self::Organization => EntityType::Organization,
Self::GeopoliticalEntity => EntityType::custom("GPE", EntityCategory::Place),
Self::NaturalLocation => EntityType::Location,
Self::Facility => EntityType::custom("FAC", EntityCategory::Place),
Self::Location => EntityType::Location,
Self::Date => EntityType::Date,
Self::Time => EntityType::Time,
Self::Money => EntityType::Money,
Self::Percent => EntityType::Percent,
Self::Quantity => EntityType::Quantity,
Self::Cardinal => EntityType::Cardinal,
Self::Ordinal => EntityType::Ordinal,
Self::CreativeWork => EntityType::custom("WORK_OF_ART", EntityCategory::Creative),
Self::Product => EntityType::custom("PRODUCT", EntityCategory::Misc),
Self::Event => EntityType::custom("EVENT", EntityCategory::Misc),
Self::Law => EntityType::custom("LAW", EntityCategory::Misc),
Self::Language => EntityType::custom("LANGUAGE", EntityCategory::Misc),
Self::Disease => EntityType::custom("DISEASE", EntityCategory::Agent),
Self::Chemical => EntityType::custom("CHEMICAL", EntityCategory::Misc),
Self::Gene => EntityType::custom("GENE", EntityCategory::Misc),
Self::Drug => EntityType::custom("DRUG", EntityCategory::Misc),
Self::Animal => EntityType::custom("ANIMAL", EntityCategory::Misc),
Self::Plant => EntityType::custom("PLANT", EntityCategory::Misc),
Self::Food => EntityType::custom("FOOD", EntityCategory::Misc),
Self::Misc => EntityType::custom("MISC", EntityCategory::Misc),
}
}
}
/// Identifies a dataset label scheme that a `SchemaMapper` can be built for.
///
/// The exact label strings of each scheme are available through [`DatasetSchema::labels`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DatasetSchema {
    /// Four labels: `PER`, `LOC`, `ORG`, `MISC`.
    CoNLL2003,
    /// Eighteen labels, from `PERSON` through `CARDINAL`.
    OntoNotes,
    /// Fifteen abbreviated labels such as `ANIM`, `CEL`, `VEHI`.
    MultiNERD,
    /// Eight lower-case coarse labels such as `person`, `building`, `other`.
    FewNERD,
    /// Eight lower-case labels including `politician`, `election`, `political_party`.
    CrossNER,
    /// Two labels: `Chemical` and `Disease`.
    BC5CDR,
    /// A single label: `Disease`.
    NCBIDisease,
    /// Nine movie-related labels such as `Actor`, `Title`, `Plot`.
    MITMovie,
    /// Eight restaurant-related labels such as `Cuisine`, `Dish`, `Restaurant_Name`.
    MITRestaurant,
    /// Six lower-case labels such as `corporation`, `creative-work`, `group`.
    WNUT17,
}

impl DatasetSchema {
    /// Returns this scheme's label strings, spelled, cased and ordered exactly as
    /// listed in the tables below.
    #[must_use]
    pub fn labels(&self) -> &'static [&'static str] {
        const CONLL_2003_LABELS: &[&str] = &["PER", "LOC", "ORG", "MISC"];
        const ONTONOTES_LABELS: &[&str] = &[
            "PERSON",
            "NORP",
            "FAC",
            "ORG",
            "GPE",
            "LOC",
            "PRODUCT",
            "EVENT",
            "WORK_OF_ART",
            "LAW",
            "LANGUAGE",
            "DATE",
            "TIME",
            "PERCENT",
            "MONEY",
            "QUANTITY",
            "ORDINAL",
            "CARDINAL",
        ];
        const MULTINERD_LABELS: &[&str] = &[
            "PER", "LOC", "ORG", "ANIM", "BIO", "CEL", "DIS", "EVE", "FOOD", "INST", "MEDIA",
            "MYTH", "PLANT", "TIME", "VEHI",
        ];
        const FEWNERD_LABELS: &[&str] = &[
            "person",
            "organization",
            "location",
            "building",
            "art",
            "product",
            "event",
            "other",
        ];
        const CROSSNER_LABELS: &[&str] = &[
            "politician",
            "election",
            "political_party",
            "country",
            "location",
            "organization",
            "person",
            "misc",
        ];
        const BC5CDR_LABELS: &[&str] = &["Chemical", "Disease"];
        const NCBI_DISEASE_LABELS: &[&str] = &["Disease"];
        const MIT_MOVIE_LABELS: &[&str] = &[
            "Actor",
            "Director",
            "Genre",
            "Title",
            "Year",
            "Song",
            "Character",
            "Plot",
            "Rating",
        ];
        const MIT_RESTAURANT_LABELS: &[&str] = &[
            "Amenity",
            "Cuisine",
            "Dish",
            "Hours",
            "Location",
            "Price",
            "Rating",
            "Restaurant_Name",
        ];
        const WNUT17_LABELS: &[&str] = &[
            "person",
            "location",
            "corporation",
            "product",
            "creative-work",
            "group",
        ];

        match *self {
            DatasetSchema::CoNLL2003 => CONLL_2003_LABELS,
            DatasetSchema::OntoNotes => ONTONOTES_LABELS,
            DatasetSchema::MultiNERD => MULTINERD_LABELS,
            DatasetSchema::FewNERD => FEWNERD_LABELS,
            DatasetSchema::CrossNER => CROSSNER_LABELS,
            DatasetSchema::BC5CDR => BC5CDR_LABELS,
            DatasetSchema::NCBIDisease => NCBI_DISEASE_LABELS,
            DatasetSchema::MITMovie => MIT_MOVIE_LABELS,
            DatasetSchema::MITRestaurant => MIT_RESTAURANT_LABELS,
            DatasetSchema::WNUT17 => WNUT17_LABELS,
        }
    }
}
/// Records what meaning is dropped when a dataset label is folded into a
/// `CanonicalType`.
#[derive(Debug, Clone)]
pub struct InformationLoss {
/// The dataset label as it was registered, in its original casing.
pub original: String,
/// The canonical type the label is mapped to.
pub canonical: CanonicalType,
/// Short human-readable description of the semantics that the mapping loses.
pub lost_semantics: &'static str,
}
/// Maps the labels of one `DatasetSchema` onto `CanonicalType` values and keeps
/// track of which mappings lose information.
#[derive(Debug, Clone)]
pub struct SchemaMapper {
/// The dataset scheme this mapper was built for.
pub source_schema: DatasetSchema,
// Label -> canonical type; keys are stored upper-cased.
mappings: HashMap<String, CanonicalType>,
// Label -> loss record, only for lossy mappings; keys are stored upper-cased.
losses: HashMap<String, InformationLoss>,
}
impl SchemaMapper {
    /// Builds the mapper for `schema`, registering every label of that scheme together
    /// with a loss note for each mapping that drops dataset-specific meaning.
    #[must_use]
    pub fn for_dataset(schema: DatasetSchema) -> Self {
        let mut mapper = Self {
            source_schema: schema,
            mappings: HashMap::new(),
            losses: HashMap::new(),
        };
        match schema {
            DatasetSchema::CoNLL2003 => {
                mapper.add("PER", CanonicalType::Person);
                mapper.add("LOC", CanonicalType::Location);
                mapper.add("ORG", CanonicalType::Organization);
                mapper.add("MISC", CanonicalType::Misc);
            }
            DatasetSchema::OntoNotes => {
                mapper.add("PERSON", CanonicalType::Person);
                mapper.add_with_loss(
                    "NORP",
                    CanonicalType::Group,
                    "Nationalities/religions/politics - distinct from formal organizations",
                );
                mapper.add("GPE", CanonicalType::GeopoliticalEntity);
                mapper.add_with_loss(
                    "LOC",
                    CanonicalType::NaturalLocation,
                    "Natural locations (mountains, rivers)",
                );
                mapper.add_with_loss(
                    "FAC",
                    CanonicalType::Facility,
                    "Man-made structures (buildings, bridges)",
                );
                mapper.add("ORG", CanonicalType::Organization);
                mapper.add("DATE", CanonicalType::Date);
                mapper.add("TIME", CanonicalType::Time);
                mapper.add("MONEY", CanonicalType::Money);
                mapper.add("PERCENT", CanonicalType::Percent);
                mapper.add("QUANTITY", CanonicalType::Quantity);
                mapper.add("CARDINAL", CanonicalType::Cardinal);
                mapper.add("ORDINAL", CanonicalType::Ordinal);
                mapper.add("PRODUCT", CanonicalType::Product);
                mapper.add("EVENT", CanonicalType::Event);
                mapper.add("WORK_OF_ART", CanonicalType::CreativeWork);
                mapper.add("LAW", CanonicalType::Law);
                mapper.add("LANGUAGE", CanonicalType::Language);
            }
            DatasetSchema::MultiNERD => {
                mapper.add("PER", CanonicalType::Person);
                mapper.add("LOC", CanonicalType::Location);
                mapper.add("ORG", CanonicalType::Organization);
                mapper.add("ANIM", CanonicalType::Animal);
                mapper.add_with_loss("BIO", CanonicalType::Misc, "Biological entities");
                mapper.add_with_loss("CEL", CanonicalType::Misc, "Celestial bodies");
                mapper.add("DIS", CanonicalType::Disease);
                mapper.add("EVE", CanonicalType::Event);
                mapper.add("FOOD", CanonicalType::Food);
                mapper.add_with_loss("INST", CanonicalType::Misc, "Instruments");
                mapper.add_with_loss("MEDIA", CanonicalType::CreativeWork, "Media works");
                mapper.add_with_loss("MYTH", CanonicalType::Misc, "Mythological entities");
                mapper.add("PLANT", CanonicalType::Plant);
                mapper.add("TIME", CanonicalType::Time);
                mapper.add_with_loss("VEHI", CanonicalType::Product, "Vehicles");
            }
            DatasetSchema::FewNERD => {
                mapper.add("person", CanonicalType::Person);
                mapper.add("organization", CanonicalType::Organization);
                mapper.add("location", CanonicalType::Location);
                mapper.add_with_loss("building", CanonicalType::Facility, "Buildings/structures");
                mapper.add("art", CanonicalType::CreativeWork);
                mapper.add("product", CanonicalType::Product);
                mapper.add("event", CanonicalType::Event);
                mapper.add("other", CanonicalType::Misc);
            }
            DatasetSchema::CrossNER => {
                mapper.add_with_loss("politician", CanonicalType::Person, "Political role lost");
                mapper.add_with_loss(
                    "election",
                    CanonicalType::Event,
                    "Election specificity lost",
                );
                mapper.add_with_loss(
                    "political_party",
                    CanonicalType::Organization,
                    "Political nature lost",
                );
                mapper.add("country", CanonicalType::GeopoliticalEntity);
                mapper.add("location", CanonicalType::Location);
                mapper.add("organization", CanonicalType::Organization);
                mapper.add("person", CanonicalType::Person);
                mapper.add("misc", CanonicalType::Misc);
            }
            DatasetSchema::BC5CDR => {
                mapper.add("Chemical", CanonicalType::Chemical);
                mapper.add("Disease", CanonicalType::Disease);
            }
            DatasetSchema::NCBIDisease => {
                mapper.add("Disease", CanonicalType::Disease);
            }
            DatasetSchema::MITMovie => {
                mapper.add_with_loss("Actor", CanonicalType::Person, "Acting role lost");
                mapper.add_with_loss("Director", CanonicalType::Person, "Directing role lost");
                mapper.add_with_loss("Character", CanonicalType::Person, "Fictional status lost");
                mapper.add("Title", CanonicalType::CreativeWork);
                mapper.add("Year", CanonicalType::Date);
                mapper.add_with_loss("Song", CanonicalType::CreativeWork, "Song vs film lost");
                mapper.add_with_loss("Genre", CanonicalType::Misc, "Genre semantics lost");
                mapper.add_with_loss("Plot", CanonicalType::Misc, "Plot description lost");
                mapper.add_with_loss("Rating", CanonicalType::Misc, "Rating semantics lost");
            }
            DatasetSchema::MITRestaurant => {
                mapper.add("Restaurant_Name", CanonicalType::Organization);
                mapper.add("Location", CanonicalType::Location);
                mapper.add_with_loss("Cuisine", CanonicalType::Misc, "Cuisine type lost");
                mapper.add_with_loss("Dish", CanonicalType::Food, "Dish specifics lost");
                mapper.add("Price", CanonicalType::Money);
                mapper.add_with_loss("Amenity", CanonicalType::Misc, "Amenity type lost");
                mapper.add("Hours", CanonicalType::Time);
                mapper.add_with_loss("Rating", CanonicalType::Misc, "Rating semantics lost");
            }
            DatasetSchema::WNUT17 => {
                mapper.add("person", CanonicalType::Person);
                mapper.add("location", CanonicalType::Location);
                mapper.add("corporation", CanonicalType::Organization);
                mapper.add("product", CanonicalType::Product);
                mapper.add("creative-work", CanonicalType::CreativeWork);
                mapper.add("group", CanonicalType::Group);
            }
        }
        mapper
    }

    /// Registers a mapping that loses no information.
    ///
    /// The key is stored upper-cased; lookups upper-case their argument too, which
    /// makes them case-insensitive.
    fn add(&mut self, label: &str, canonical: CanonicalType) {
        self.mappings.insert(label.to_uppercase(), canonical);
    }

    /// Registers a mapping and records what meaning it drops.
    ///
    /// Both maps are keyed by the upper-cased label, while the stored
    /// `InformationLoss::original` keeps the label's original casing.
    fn add_with_loss(
        &mut self,
        label: &str,
        canonical: CanonicalType,
        lost_semantics: &'static str,
    ) {
        let key = label.to_uppercase();
        self.mappings.insert(key.clone(), canonical);
        // Last use of `key`: move it rather than cloning a second time.
        self.losses.insert(
            key,
            InformationLoss {
                original: label.to_string(),
                canonical,
                lost_semantics,
            },
        );
    }

    /// Looks up the canonical type for `label`, ignoring case.
    ///
    /// Labels that were not registered for this schema yield `CanonicalType::Misc`.
    #[must_use]
    pub fn to_canonical(&self, label: &str) -> CanonicalType {
        self.mappings
            .get(&label.to_uppercase())
            .copied()
            .unwrap_or(CanonicalType::Misc)
    }

    /// Returns the loss record for `label` (case-insensitive), or `None` when the
    /// label is unknown or was registered as loss-free.
    #[must_use]
    pub fn information_loss(&self, label: &str) -> Option<&InformationLoss> {
        self.losses.get(&label.to_uppercase())
    }

    /// Maps `label` to its canonical type and converts that to an `EntityType`.
    #[must_use]
    pub fn to_entity_type(&self, label: &str) -> EntityType {
        self.to_canonical(label).to_entity_type()
    }

    /// Iterates over every recorded loss; the order is that of the underlying hash map
    /// and therefore unspecified.
    pub fn all_losses(&self) -> impl Iterator<Item = &InformationLoss> {
        self.losses.values()
    }

    /// Measures how much the canonical types used by `self` and `other` overlap.
    ///
    /// The result is the Jaccard index of the two sets of canonical types:
    /// `|A ∩ B| / |A ∪ B|`, in `0.0..=1.0`. Two mappers with no mappings at all
    /// yield `0.0`.
    #[must_use]
    pub fn label_overlap(&self, other: &SchemaMapper) -> f64 {
        let self_canonicals: std::collections::HashSet<_> =
            self.mappings.values().copied().collect();
        let other_canonicals: std::collections::HashSet<_> =
            other.mappings.values().copied().collect();
        let intersection = self_canonicals.intersection(&other_canonicals).count();
        let union = self_canonicals.union(&other_canonicals).count();
        if union == 0 {
            0.0
        } else {
            intersection as f64 / union as f64
        }
    }
}
/// Maps a possibly tag-prefixed label to an `EntityType`.
///
/// At most one leading span-tag prefix (`B-`, `I-`, `E-`, `S-`, `L-` or `U-`, matched
/// exactly and case-sensitively) is removed first. With `Some(schema)` the remainder is
/// resolved through that dataset's `SchemaMapper`; with `None` it goes through the
/// schema-agnostic heuristic table.
#[must_use]
pub fn map_to_canonical(label: &str, schema: Option<DatasetSchema>) -> EntityType {
    const TAG_PREFIXES: [&str; 6] = ["B-", "I-", "E-", "S-", "L-", "U-"];

    // The first prefix that matches wins; a label without any prefix is used as-is.
    let bare = TAG_PREFIXES
        .iter()
        .find_map(|prefix| label.strip_prefix(*prefix))
        .unwrap_or(label);

    match schema {
        Some(dataset) => SchemaMapper::for_dataset(dataset).to_entity_type(bare),
        None => map_label_heuristic(bare),
    }
}
fn map_label_heuristic(label: &str) -> EntityType {
match label.to_uppercase().as_str() {
"PER" | "PERSON" | "ACTOR" | "DIRECTOR" | "CHARACTER" | "POLITICIAN" => EntityType::Person,
"NORP" | "GROUP" | "NATIONALITY" | "RELIGION" => {
EntityType::custom("GROUP", EntityCategory::Agent)
}
"ORG" | "ORGANIZATION" | "ORGANISATION" | "CORPORATION" | "COMPANY" | "POLITICAL_PARTY"
| "RESTAURANT_NAME" => EntityType::Organization,
"GPE" | "COUNTRY" | "CITY" | "STATE" => EntityType::custom("GPE", EntityCategory::Place),
"FAC" | "FACILITY" | "BUILDING" => EntityType::custom("FAC", EntityCategory::Place),
"LOC" | "LOCATION" | "GEO" => EntityType::Location,
"DATE" | "YEAR" => EntityType::Date,
"TIME" | "HOURS" => EntityType::Time,
"MONEY" | "PRICE" | "CURRENCY" => EntityType::Money,
"PERCENT" | "PERCENTAGE" => EntityType::Percent,
"QUANTITY" => EntityType::Quantity,
"CARDINAL" => EntityType::Cardinal,
"ORDINAL" => EntityType::Ordinal,
"PRODUCT" | "PROD" => EntityType::custom("PRODUCT", EntityCategory::Misc),
"EVENT" | "EVE" | "ELECTION" => EntityType::custom("EVENT", EntityCategory::Misc),
"WORK_OF_ART" | "CREATIVE-WORK" | "TITLE" | "SONG" | "ART" | "MEDIA" | "BOOK" => {
EntityType::custom("WORK_OF_ART", EntityCategory::Creative)
}
"LAW" => EntityType::custom("LAW", EntityCategory::Misc),
"LANGUAGE" => EntityType::custom("LANGUAGE", EntityCategory::Misc),
"OFI" | "OFFICIAL" | "POSITION" | "TITLE_OFFICE" => {
EntityType::custom("OFFICIAL", EntityCategory::Misc)
}
"DISEASE" | "DIS" => EntityType::custom("DISEASE", EntityCategory::Agent),
"CHEMICAL" => EntityType::custom("CHEMICAL", EntityCategory::Misc),
"GENE" => EntityType::custom("GENE", EntityCategory::Misc),
"DRUG" => EntityType::custom("DRUG", EntityCategory::Misc),
"ANIM" | "ANIMAL" => EntityType::custom("ANIMAL", EntityCategory::Misc),
"PLANT" => EntityType::custom("PLANT", EntityCategory::Misc),
"FOOD" | "DISH" | "CUISINE" => EntityType::custom("FOOD", EntityCategory::Misc),
"VEHI" | "VEHICLE" => EntityType::custom("VEHICLE", EntityCategory::Misc),
"EMAIL" => EntityType::Email,
"URL" | "URI" => EntityType::Url,
"PHONE" | "TELEPHONE" => EntityType::Phone,
"MISC" | "MISCELLANEOUS" | "O" | "OTHER" => {
EntityType::custom("MISC", EntityCategory::Misc)
}
other => EntityType::custom(other, EntityCategory::Misc),
}
}
/// Coarse grouping of `CanonicalType` values into six buckets.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CoarseType {
/// `CanonicalType::Person` and `CanonicalType::Group`.
Person,
/// `CanonicalType::Organization`.
Organization,
/// `GeopoliticalEntity`, `NaturalLocation`, `Facility` and `Location`.
Location,
/// `CanonicalType::Date` and `CanonicalType::Time`.
DateTime,
/// `Money`, `Percent`, `Quantity`, `Cardinal` and `Ordinal`.
Numeric,
/// Every canonical type not covered by the buckets above.
Other,
}
impl CoarseType {
#[must_use]
pub fn from_canonical(ct: CanonicalType) -> Self {
match ct {
CanonicalType::Person | CanonicalType::Group => Self::Person,
CanonicalType::Organization => Self::Organization,
CanonicalType::GeopoliticalEntity
| CanonicalType::NaturalLocation
| CanonicalType::Facility
| CanonicalType::Location => Self::Location,
CanonicalType::Date | CanonicalType::Time => Self::DateTime,
CanonicalType::Money
| CanonicalType::Percent
| CanonicalType::Quantity
| CanonicalType::Cardinal
| CanonicalType::Ordinal => Self::Numeric,
_ => Self::Other,
}
}
#[must_use]
pub fn from_label(label: &str) -> Self {
let canonical = SchemaMapper::for_dataset(DatasetSchema::OntoNotes).to_canonical(label);
Self::from_canonical(canonical)
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// OntoNotes `NORP` must map to `Group` and stay distinct from `ORG`.
    #[test]
    fn test_norp_is_not_organization() {
        let onto = SchemaMapper::for_dataset(DatasetSchema::OntoNotes);
        let (norp, org) = (onto.to_canonical("NORP"), onto.to_canonical("ORG"));
        assert_eq!(norp, CanonicalType::Group);
        assert_eq!(org, CanonicalType::Organization);
        assert_ne!(norp, org, "NORP should NOT map to Organization!");
    }

    /// The three OntoNotes place labels keep their separate canonical types.
    #[test]
    fn test_location_distinctions_preserved() {
        let onto = SchemaMapper::for_dataset(DatasetSchema::OntoNotes);
        let expectations = [
            ("GPE", CanonicalType::GeopoliticalEntity),
            ("LOC", CanonicalType::NaturalLocation),
            ("FAC", CanonicalType::Facility),
        ];
        for &(label, expected) in expectations.iter() {
            assert_eq!(onto.to_canonical(label), expected);
        }
    }

    /// A lossy mapping carries a description of what is lost.
    #[test]
    fn test_information_loss_documented() {
        let onto = SchemaMapper::for_dataset(DatasetSchema::OntoNotes);
        let fac_loss = onto.information_loss("FAC");
        assert!(fac_loss.is_some());
        let description = fac_loss.unwrap().lost_semantics.to_lowercase();
        let mentions_structures = ["structure", "building"]
            .iter()
            .any(|keyword| description.contains(*keyword));
        assert!(mentions_structures);
    }

    /// CoNLL-2003 and OntoNotes share fewer than half of their canonical types.
    #[test]
    fn test_conll_to_ontonotes_overlap() {
        let conll = SchemaMapper::for_dataset(DatasetSchema::CoNLL2003);
        let onto = SchemaMapper::for_dataset(DatasetSchema::OntoNotes);
        assert!(conll.label_overlap(&onto) < 0.5);
    }

    /// Span-tag prefixes are removed before the heuristic lookup.
    #[test]
    fn test_unified_mapping_strips_bio() {
        assert_eq!(map_to_canonical("B-PER", None), EntityType::Person);
        assert_eq!(map_to_canonical("I-ORG", None), EntityType::Organization);
    }

    /// Spot-checks the canonical -> coarse collapse.
    #[test]
    fn test_coarse_schema() {
        let expectations = [
            (CanonicalType::Person, CoarseType::Person),
            (CanonicalType::Group, CoarseType::Person),
            (CanonicalType::GeopoliticalEntity, CoarseType::Location),
        ];
        for &(canonical, coarse) in expectations.iter() {
            assert_eq!(CoarseType::from_canonical(canonical), coarse);
        }
    }
}