use crate::{CanonicalId, CoreferenceResolver, Gender};
use crate::{Entity, EntityType};
use std::collections::HashMap;
/// Tuning switches for [`SimpleCorefResolver`].
///
/// Most flags enable or disable one matching rule ("sieve") inside
/// `find_matching_cluster`.
#[derive(Debug, Clone)]
pub struct CorefConfig {
/// Scales the pronoun antecedent search: `resolve_pronoun` inspects at most
/// ten times this many preceding mentions.
pub max_pronoun_lookback: usize,
/// Enables the `names_match` substring / last-word sieve (skipped for
/// `EntityType::Custom` mentions).
pub fuzzy_matching: bool,
/// NOTE(review): not read anywhere in this file — presumably consumed by
/// callers that post-process clusters; confirm.
pub include_singletons: bool,
/// Lets `infer_gender` fall back to the built-in first-name table when the
/// text is not a gendered pronoun.
pub use_name_gazetteer: bool,
/// Enables the `is_acronym_match` sieve.
pub acronym_matching: bool,
/// Enables the `is_relaxed_head_match` sieve (skipped for
/// `EntityType::Custom` mentions).
pub relaxed_head_match: bool,
/// Enables the `is_proper_containment` sieve (skipped for
/// `EntityType::Custom` mentions).
pub proper_containment: bool,
/// Enables the `is_precise_construct` sieve (same-type mentions at most two
/// offsets apart).
pub precise_constructs: bool,
/// Enables the `is_strict_head_match` sieve.
pub strict_head_match: bool,
/// Enables the `is_proper_head_word_match` sieve (skipped for
/// `EntityType::Custom` mentions).
pub proper_head_word_match: bool,
}
impl Default for CorefConfig {
fn default() -> Self {
Self {
max_pronoun_lookback: 3,
fuzzy_matching: true,
include_singletons: true,
use_name_gazetteer: true,
acronym_matching: true,
relaxed_head_match: true,
proper_containment: true,
precise_constructs: true,
strict_head_match: true,
proper_head_word_match: true,
}
}
}
/// Rule-based coreference resolver that assigns a `canonical_id` to each
/// entity by running a fixed sequence of matching rules.
#[derive(Debug, Clone)]
pub struct SimpleCorefResolver {
/// Switches controlling which matching rules run.
config: CorefConfig,
}
/// A resolver built from [`CorefConfig::default`].
impl Default for SimpleCorefResolver {
    fn default() -> Self {
        let config = CorefConfig::default();
        Self { config }
    }
}
impl SimpleCorefResolver {
#[must_use]
pub fn new(config: CorefConfig) -> Self {
Self { config }
}
/// Assigns a cluster id (`canonical_id`) to every entity that lacks one.
///
/// Entities are processed in input order. Each one is either linked to the
/// cluster of an earlier mention (see `find_matching_cluster`) or opens a
/// fresh cluster numbered from `CanonicalId::ZERO` upwards. Entities that
/// already carry a `canonical_id` are returned untouched.
///
/// Returns a copy of `entities` with `canonical_id` filled in.
#[must_use]
pub fn resolve_entities(&self, entities: &[Entity]) -> Vec<Entity> {
    let mut resolved = entities.to_vec();
    let mut next_cluster_id = CanonicalId::ZERO;
    // Canonical name ("label:lowercased text") -> cluster of the latest
    // mention with that name.
    let mut canonical_to_cluster: HashMap<String, CanonicalId> = HashMap::new();

    for i in 0..resolved.len() {
        // NOTE(review): pre-assigned ids are kept as-is, but the counter
        // still starts at ZERO, so a freshly minted id could coincide with a
        // pre-assigned one — confirm callers never mix the two.
        if resolved[i].canonical_id.is_some() {
            continue;
        }

        let cluster_id = self
            .find_matching_cluster(&resolved[i], &resolved[..i], &canonical_to_cluster)
            .unwrap_or_else(|| {
                let id = next_cluster_id;
                next_cluster_id += 1;
                id
            });
        resolved[i].canonical_id = Some(cluster_id);

        // Pronouns are resolved through `resolve_pronoun` and never looked up
        // by name, so registering them only creates bogus keys: "he" would
        // acronym-match "Harry Evans" and fuzzy-match "Xi He", merging those
        // names into whatever cluster the pronoun last pointed at.
        if self.is_pronoun(&resolved[i].text) {
            continue;
        }

        let canonical = self.canonical_form(&resolved[i].text, &resolved[i].entity_type);
        canonical_to_cluster.insert(canonical, cluster_id);
    }

    resolved
}
#[must_use]
pub fn resolve(&self, entities: &[Entity]) -> Vec<Entity> {
self.resolve_entities(entities)
}
/// Resolves `entities` and converts the result into coreference chains via
/// `crate::metrics::coref::entities_to_chains`.
///
/// Only compiled when the `analysis` feature is enabled.
#[cfg(feature = "analysis")]
#[must_use]
pub fn resolve_to_chains(&self, entities: &[Entity]) -> Vec<crate::metrics::coref::CorefChain> {
    crate::metrics::coref::entities_to_chains(&self.resolve_entities(entities))
}
fn find_matching_cluster(
&self,
entity: &Entity,
previous: &[Entity],
canonical_map: &HashMap<String, CanonicalId>,
) -> Option<CanonicalId> {
if self.is_pronoun(&entity.text) {
return self.resolve_pronoun(entity, previous);
}
let canonical = self.canonical_form(&entity.text, &entity.entity_type);
if let Some(&cluster_id) = canonical_map.get(&canonical) {
return Some(cluster_id);
}
if self.config.precise_constructs {
for prev in previous.iter().rev() {
if let Some(cluster_id) = prev.canonical_id {
if self.is_precise_construct(entity, prev) {
return Some(cluster_id);
}
}
}
}
if self.config.acronym_matching {
for (other_canonical, &cluster_id) in canonical_map {
if self.is_acronym_match(&canonical, other_canonical) {
return Some(cluster_id);
}
}
}
if self.config.strict_head_match {
for prev in previous.iter().rev() {
if let Some(cluster_id) = prev.canonical_id {
if self.is_strict_head_match(entity, prev) {
return Some(cluster_id);
}
}
}
}
let is_wildcard_type = matches!(entity.entity_type, EntityType::Custom { .. });
if !is_wildcard_type {
if self.config.proper_head_word_match {
for prev in previous.iter().rev() {
if let Some(cluster_id) = prev.canonical_id {
if self.is_proper_head_word_match(entity, prev) {
return Some(cluster_id);
}
}
}
}
if self.config.relaxed_head_match {
for prev in previous.iter().rev() {
if let Some(cluster_id) = prev.canonical_id {
if self.is_relaxed_head_match(entity, prev) {
return Some(cluster_id);
}
}
}
}
if self.config.proper_containment {
for prev in previous.iter().rev() {
if let Some(cluster_id) = prev.canonical_id {
if self.is_proper_containment(entity, prev) {
return Some(cluster_id);
}
}
}
}
if self.config.fuzzy_matching {
for (other_canonical, &cluster_id) in canonical_map {
if self.names_match(&canonical, other_canonical) {
return Some(cluster_id);
}
}
}
}
None
}
/// Links a pronoun to the nearest suitable antecedent among `previous`.
///
/// Candidates are scanned nearest-first within a bounded window. A candidate
/// is skipped when it is itself a pronoun, when its entity type cannot take
/// this pronoun (`pronoun_compatible`), or when both genders are known and
/// incompatible. Returns the `canonical_id` of the first surviving candidate,
/// or `None` when the window holds none.
fn resolve_pronoun(&self, pronoun: &Entity, previous: &[Entity]) -> Option<CanonicalId> {
    // NOTE(review): presumably approximates "mentions per lookback unit";
    // the factor is inherited unchanged — confirm the intended unit.
    const CANDIDATES_PER_LOOKBACK_UNIT: usize = 10;

    // Saturate rather than multiply: a very large lookback (e.g. `usize::MAX`
    // used as "unlimited") would otherwise overflow, panicking in debug
    // builds and wrapping in release builds.
    let window = self
        .config
        .max_pronoun_lookback
        .saturating_mul(CANDIDATES_PER_LOOKBACK_UNIT);

    let pronoun_gender = self.infer_gender(&pronoun.text);
    for candidate in previous.iter().rev().take(window) {
        if self.is_pronoun(&candidate.text) {
            continue;
        }
        if !self.pronoun_compatible(&pronoun.text, &candidate.entity_type) {
            continue;
        }
        // Gender only vetoes a candidate when it is known on both sides.
        let candidate_gender = self.infer_gender(&candidate.text);
        if let (Some(pg), Some(cg)) = (pronoun_gender, candidate_gender) {
            if !pg.is_compatible(&cg) {
                continue;
            }
        }
        return candidate.canonical_id;
    }
    None
}
/// Returns `true` when `text`, compared case-insensitively, is one of the
/// recognised third-person pronouns (traditional or neopronoun forms).
///
/// The text is not trimmed, so surrounding whitespace prevents a match.
pub(crate) fn is_pronoun(&self, text: &str) -> bool {
    const PRONOUNS: &[&str] = &[
        // he / she
        "he", "she", "him", "her", "his", "hers", "himself", "herself",
        // they
        "they", "them", "their", "theirs", "themselves", "themself",
        // it
        "it", "its", "itself",
        // xe
        "xe", "xem", "xyr", "xyrs", "xemself",
        // ze
        "ze", "hir", "zir", "hirs", "zirs", "hirself", "zirself",
        // ey
        "ey", "em", "eir", "eirs", "emself",
        // fae
        "fae", "faer", "faers", "faeself", "faerself",
    ];
    let lowered = text.to_lowercase();
    PRONOUNS.contains(&lowered.as_str())
}
/// Returns `true` when `pronoun` may refer to an entity of `entity_type`.
///
/// * Persons accept every pronoun recognised by `is_pronoun`.
/// * Organizations accept the "it" and "they" families.
/// * Every other type, locations included, accepts only the "it" family.
///
/// Matching is case-insensitive.
fn pronoun_compatible(&self, pronoun: &str, entity_type: &EntityType) -> bool {
    match entity_type {
        // The person list used to be a hand-maintained copy of the pronoun
        // table and had drifted: "faerself" was recognised as a pronoun but
        // rejected here, so it could never be resolved. Delegating keeps the
        // two in sync.
        EntityType::Person => self.is_pronoun(pronoun),
        EntityType::Organization => matches!(
            pronoun.to_lowercase().as_str(),
            "it" | "they" | "its" | "their" | "theirs" | "itself" | "themselves"
        ),
        _ => matches!(pronoun.to_lowercase().as_str(), "it" | "its" | "itself"),
    }
}
/// Best-effort gender for a mention.
///
/// Tries `Gender::from_pronoun` first; if that yields nothing and
/// `use_name_gazetteer` is set, falls back to `gender_from_name`.
fn infer_gender(&self, text: &str) -> Option<Gender> {
    Gender::from_pronoun(text).or_else(|| {
        if self.config.use_name_gazetteer {
            gender_from_name(text)
        } else {
            None
        }
    })
}
/// Builds the lookup key for a mention: `"<type label>:<text>"`, where the
/// text is lowercased and then stripped of surrounding whitespace.
fn canonical_form(&self, text: &str, entity_type: &EntityType) -> String {
    let lowered = text.to_lowercase();
    let label = entity_type.as_label();
    format!("{}:{}", label, lowered.trim())
}
/// Returns `true` when one canonical name is the acronym of the other.
///
/// Both arguments are `"label:text"` keys. The labels must be equal, exactly
/// one side must be a single word, and that word must have at least two
/// characters, one per word of the other side, each being that word's first
/// character.
fn is_acronym_match(&self, name1: &str, name2: &str) -> bool {
    let (label_a, body_a) = name1.split_once(':').unwrap_or(("", name1));
    let (label_b, body_b) = name2.split_once(':').unwrap_or(("", name2));
    if label_a != label_b {
        return false;
    }

    let tokens_a: Vec<&str> = body_a.split_whitespace().collect();
    let tokens_b: Vec<&str> = body_b.split_whitespace().collect();

    // Decide which side plays the acronym and which the expansion.
    let (short_form, expansion) = match (tokens_a.len(), tokens_b.len()) {
        (1, many) if many > 1 => (body_a, &tokens_b),
        (many, 1) if many > 1 => (body_b, &tokens_a),
        _ => return false,
    };

    let letter_count = short_form.chars().count();
    if letter_count < 2 || letter_count != expansion.len() {
        return false;
    }

    short_form
        .chars()
        .zip(expansion.iter())
        .all(|(letter, word)| word.starts_with(letter))
}
/// Returns `true` for two same-type, non-pronoun mentions whose spans do not
/// overlap and lie at most two offsets apart (e.g. an appositive such as
/// "Barack Obama, the president").
fn is_precise_construct(&self, a: &Entity, b: &Entity) -> bool {
    let same_type = a.entity_type == b.entity_type;
    if !same_type || self.is_pronoun(&a.text) || self.is_pronoun(&b.text) {
        return false;
    }

    let (a_start, a_end) = (a.start(), a.end());
    let (b_start, b_end) = (b.start(), b.end());

    if a_start >= b_end {
        // `a` follows `b`.
        a_start - b_end <= 2
    } else if b_start >= a_end {
        // `b` follows `a`.
        b_start - a_end <= 2
    } else {
        // Overlapping spans never qualify.
        false
    }
}
/// Returns `true` when two same-type mentions share a head word (their last
/// whitespace-separated token, compared ASCII case-insensitively).
///
/// Rejected when one span lies inside the other, or when both genders can be
/// inferred and are incompatible.
fn is_strict_head_match(&self, a: &Entity, b: &Entity) -> bool {
    if a.entity_type != b.entity_type {
        return false;
    }

    let (a_start, a_end) = (a.start(), a.end());
    let (b_start, b_end) = (b.start(), b.end());
    let a_inside_b = b_start <= a_start && a_end <= b_end;
    let b_inside_a = a_start <= b_start && b_end <= a_end;
    if a_inside_b || b_inside_a {
        return false;
    }

    if !Self::head_word(&a.text).eq_ignore_ascii_case(Self::head_word(&b.text)) {
        return false;
    }

    // An unknown gender on either side never blocks the match.
    if let (Some(ga), Some(gb)) = (self.infer_gender(&a.text), self.infer_gender(&b.text)) {
        ga.is_compatible(&gb)
    } else {
        true
    }
}
/// Returns `true` when two same-type, multi-word mentions are linked by a
/// head word: the last word of either mention occurs (ASCII
/// case-insensitively) among the words of the other.
fn is_proper_head_word_match(&self, a: &Entity, b: &Entity) -> bool {
    if a.entity_type != b.entity_type {
        return false;
    }

    let tokens_a: Vec<&str> = a.text.split_whitespace().collect();
    let tokens_b: Vec<&str> = b.text.split_whitespace().collect();
    let both_multiword = tokens_a.len() >= 2 && tokens_b.len() >= 2;
    if !both_multiword {
        return false;
    }

    let has_word =
        |tokens: &[&str], head: &str| tokens.iter().any(|token| token.eq_ignore_ascii_case(head));

    has_word(tokens_b.as_slice(), Self::head_word(&a.text))
        || has_word(tokens_a.as_slice(), Self::head_word(&b.text))
}
/// Returns the last whitespace-separated token of `text`, or `text` itself
/// when it contains no tokens (empty or whitespace-only input).
fn head_word(text: &str) -> &str {
    match text.split_whitespace().next_back() {
        Some(last_token) => last_token,
        None => text,
    }
}
/// Returns `true` when two same-type mentions each have at least two words
/// and end in the same word (ASCII case-insensitive).
fn is_relaxed_head_match(&self, a: &Entity, b: &Entity) -> bool {
    if a.entity_type != b.entity_type {
        return false;
    }

    let tokens_a: Vec<&str> = a.text.split_whitespace().collect();
    let tokens_b: Vec<&str> = b.text.split_whitespace().collect();

    // `[_, .., last]` only matches slices holding two or more tokens.
    match (tokens_a.as_slice(), tokens_b.as_slice()) {
        ([_, .., last_a], [_, .., last_b]) => last_a.eq_ignore_ascii_case(last_b),
        _ => false,
    }
}
/// Returns `true` when the words of the shorter mention appear, in order and
/// contiguously, inside the longer mention of the same type.
///
/// "Shorter" is decided by trimmed byte length; equal-length or
/// case-insensitively equal texts are rejected, as is a shorter mention that
/// has as many words as the longer one. Words are compared lowercased.
fn is_proper_containment(&self, a: &Entity, b: &Entity) -> bool {
    if a.entity_type != b.entity_type {
        return false;
    }

    let (left, right) = (a.text.trim(), b.text.trim());
    if left.is_empty() || right.is_empty() || left.eq_ignore_ascii_case(right) {
        return false;
    }

    let (inner, outer) = match left.len().cmp(&right.len()) {
        std::cmp::Ordering::Less => (left, right),
        std::cmp::Ordering::Greater => (right, left),
        std::cmp::Ordering::Equal => return false,
    };

    let outer_lower = outer.to_lowercase();
    let inner_lower = inner.to_lowercase();
    let outer_tokens: Vec<&str> = outer_lower.split_whitespace().collect();
    let inner_tokens: Vec<&str> = inner_lower.split_whitespace().collect();

    // The length guards run first, so `windows` never receives a zero size.
    let needle_len = inner_tokens.len();
    needle_len != 0
        && needle_len < outer_tokens.len()
        && outer_tokens
            .windows(needle_len)
            .any(|run| run == inner_tokens.as_slice())
}
/// Fuzzy comparison of two canonical names (`"label:text"` keys).
///
/// The labels must be equal. The texts then match when any of these holds:
/// * they are identical;
/// * the shorter is a substring of the longer, has at least five characters
///   and covers more than 30% of the longer one's characters;
/// * the shorter equals a whole word of the longer;
/// * one side is a single word equal to the last word of the multi-word
///   other side.
fn names_match(&self, name1: &str, name2: &str) -> bool {
    let (label_a, body_a) = name1.split_once(':').unwrap_or(("", name1));
    let (label_b, body_b) = name2.split_once(':').unwrap_or(("", name2));
    if label_a != label_b {
        return false;
    }
    if body_a == body_b {
        return true;
    }

    // On a byte-length tie the first argument counts as the shorter one.
    let (small, large) = if body_a.len() > body_b.len() {
        (body_b, body_a)
    } else {
        (body_a, body_b)
    };

    let substring_hit = large.contains(small) && {
        let small_chars = small.chars().count();
        let large_chars = large.chars().count();
        let substantial =
            small_chars >= 5 && small_chars as f64 / large_chars as f64 > 0.3;
        substantial || large.split_whitespace().any(|token| token == small)
    };
    if substring_hit {
        return true;
    }

    let tokens_a: Vec<&str> = body_a.split_whitespace().collect();
    let tokens_b: Vec<&str> = body_b.split_whitespace().collect();
    match (tokens_a.as_slice(), tokens_b.as_slice()) {
        ([_, .., tail], [only]) | ([only], [_, .., tail]) => tail == only,
        _ => false,
    }
}
}
/// Looks up the first whitespace-separated word of `text`, lowercased, in a
/// small built-in table of given names.
///
/// Returns `None` for empty input or a name that is not in the table.
pub(crate) fn gender_from_name(text: &str) -> Option<Gender> {
    const MASCULINE: &[&str] = &[
        "james", "john", "robert", "michael", "david", "william", "richard", "joseph",
        "thomas", "charles", "christopher", "daniel", "matthew", "anthony", "mark",
        "donald", "steven", "paul", "andrew", "joshua", "kenneth", "kevin", "brian",
        "george", "timothy", "ronald", "edward", "jason", "jeffrey", "ryan", "jacob",
        "gary", "nicholas", "eric", "jonathan", "stephen", "larry", "justin", "scott",
        "brandon", "benjamin", "samuel", "raymond", "gregory", "frank", "alexander",
        "patrick", "jack", "dennis", "peter", "bob", "jim", "tom", "mike", "bill",
        "joe", "dan", "matt", "steve", "chris", "nick", "ben", "sam", "jake", "adam",
        "henry", "nathan", "philip", "carl", "ahmed", "ahmad", "mohammed", "muhammad",
        "omar", "ali", "hassan", "hussein", "khalid", "ibrahim",
    ];
    const FEMININE: &[&str] = &[
        "mary", "patricia", "jennifer", "linda", "barbara", "elizabeth", "susan",
        "jessica", "sarah", "karen", "lisa", "nancy", "betty", "margaret", "sandra",
        "ashley", "dorothy", "kimberly", "emily", "donna", "michelle", "carol",
        "amanda", "melissa", "deborah", "stephanie", "rebecca", "sharon", "laura",
        "cynthia", "kathleen", "amy", "angela", "shirley", "anna", "brenda", "pamela",
        "emma", "nicole", "helen", "samantha", "katherine", "christine", "debra",
        "rachel", "carolyn", "janet", "catherine", "maria", "heather", "diane", "ruth",
        "julie", "olivia", "joyce", "virginia", "victoria", "kelly", "lauren",
        "christina", "joan", "evelyn", "judith", "alice", "ann", "anne", "jane",
        "jean", "marie", "rose", "grace", "fatima", "aisha", "maryam", "nour",
        "layla", "hana",
    ];

    let given_name = text.split_whitespace().next()?.to_lowercase();
    let given_name = given_name.as_str();

    if MASCULINE.contains(&given_name) {
        Some(Gender::Masculine)
    } else if FEMININE.contains(&given_name) {
        Some(Gender::Feminine)
    } else {
        None
    }
}
impl CoreferenceResolver for SimpleCorefResolver {
fn resolve(&self, entities: &[Entity]) -> Vec<Entity> {
self.resolve_entities(entities)
}
fn name(&self) -> &'static str {
"simple-rule-based"
}
}
/// Unit tests for the individual sieves and for end-to-end resolution.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::EntityCategory;

    /// Resolver with every sieve enabled (the default configuration).
    fn resolver() -> SimpleCorefResolver {
        SimpleCorefResolver::new(CorefConfig::default())
    }

    // ----- precise constructs -----

    /// Same-type mentions two offsets apart are merged.
    #[test]
    fn precise_construct_appositive_adjacent() {
        let r = resolver();
        let entities = vec![
            Entity::new("Barack Obama", EntityType::Person, 0, 12, 0.9),
            Entity::new("the president", EntityType::Person, 14, 27, 0.85),
        ];
        let resolved = r.resolve(&entities);
        assert_eq!(
            resolved[0].canonical_id, resolved[1].canonical_id,
            "appositive entities should corefer"
        );
    }

    /// With every other sieve disabled, a large gap prevents the merge.
    #[test]
    fn precise_construct_rejects_distant_entities() {
        let entities = vec![
            Entity::new("Barack Obama", EntityType::Person, 0, 12, 0.9),
            Entity::new("the president", EntityType::Person, 50, 63, 0.85),
        ];
        let cfg = CorefConfig {
            fuzzy_matching: false,
            relaxed_head_match: false,
            proper_containment: false,
            strict_head_match: false,
            proper_head_word_match: false,
            acronym_matching: false,
            ..Default::default()
        };
        let r = SimpleCorefResolver::new(cfg);
        let resolved = r.resolve(&entities);
        assert_ne!(
            resolved[0].canonical_id, resolved[1].canonical_id,
            "distant entities should not match via precise constructs"
        );
    }

    /// Adjacent mentions of different types are never a precise construct.
    #[test]
    fn precise_construct_rejects_different_types() {
        let a = Entity::new("Acme Corp", EntityType::Organization, 0, 9, 0.9);
        let b = Entity::new("New York", EntityType::Location, 11, 19, 0.85);
        assert!(
            !resolver().is_precise_construct(&a, &b),
            "different entity types should not match"
        );
    }

    // ----- strict head match -----

    /// Shared surname with compatible inferred genders matches.
    #[test]
    fn strict_head_match_same_head_compatible_gender() {
        let r = resolver();
        let a = Entity::new("John Smith", EntityType::Person, 0, 10, 0.9);
        let b = Entity::new("Robert Smith", EntityType::Person, 50, 62, 0.9);
        assert!(
            r.is_strict_head_match(&a, &b),
            "same head word + compatible gender should match"
        );
    }

    /// A mention nested inside another is rejected even though both share the
    /// head word "company".
    #[test]
    fn strict_head_match_i_within_i_rejection() {
        let r = resolver();
        // Spans follow the actual text: the outer mention is 28 characters
        // long and "the company" occupies offsets 17..28 inside it.
        let outer = Entity::new(
            "the president of the company",
            EntityType::Person,
            0,
            28,
            0.9,
        );
        let inner = Entity::new("the company", EntityType::Person, 17, 28, 0.9);
        assert!(
            !r.is_strict_head_match(&outer, &inner),
            "nested spans (i-within-i) should be rejected"
        );
    }

    /// Shared surname but incompatible first-name genders does not match.
    #[test]
    fn strict_head_match_gender_incompatible() {
        let r = resolver();
        let a = Entity::new("John Smith", EntityType::Person, 0, 10, 0.9);
        let b = Entity::new("Mary Smith", EntityType::Person, 50, 60, 0.9);
        assert!(
            !r.is_strict_head_match(&a, &b),
            "gender-incompatible mentions should not match"
        );
    }

    /// Single-word mentions are their own head word.
    #[test]
    fn strict_head_match_single_word() {
        let r = resolver();
        let a = Entity::new("Obama", EntityType::Person, 0, 5, 0.9);
        let b = Entity::new("Obama", EntityType::Person, 50, 55, 0.9);
        assert!(
            r.is_strict_head_match(&a, &b),
            "identical single-word mentions should match via strict head"
        );
    }

    // ----- proper head word match -----

    /// Two multi-word mentions sharing the head "Obama" match.
    #[test]
    fn proper_head_word_match_cross_reference() {
        let r = resolver();
        let a = Entity::new("President Obama", EntityType::Person, 0, 15, 0.9);
        let b = Entity::new("Barack Obama", EntityType::Person, 50, 62, 0.9);
        assert!(
            r.is_proper_head_word_match(&a, &b),
            "head 'Obama' found in both mentions"
        );
    }

    /// The head of one mention may appear anywhere in a longer mention.
    #[test]
    fn proper_head_word_match_head_in_longer() {
        let r = resolver();
        let a = Entity::new("President Obama", EntityType::Person, 0, 15, 0.9);
        let b = Entity::new("Barack Hussein Obama", EntityType::Person, 50, 70, 0.9);
        assert!(
            r.is_proper_head_word_match(&a, &b),
            "head 'Obama' from A found in B"
        );
    }

    /// The sieve requires at least two words on both sides.
    #[test]
    fn proper_head_word_rejects_single_word() {
        let r = resolver();
        let a = Entity::new("Obama", EntityType::Person, 0, 5, 0.9);
        let b = Entity::new("Barack Obama", EntityType::Person, 50, 62, 0.9);
        assert!(
            !r.is_proper_head_word_match(&a, &b),
            "single-word mention should not match via proper head word sieve"
        );
    }

    /// Mentions of different entity types never match.
    #[test]
    fn proper_head_word_rejects_different_types() {
        let r = resolver();
        let a = Entity::new("New York Times", EntityType::Organization, 0, 14, 0.9);
        let b = Entity::new("New York", EntityType::Location, 50, 58, 0.9);
        assert!(
            !r.is_proper_head_word_match(&a, &b),
            "different entity types should not match"
        );
    }

    // ----- integration -----

    /// "Acme Corp" and "Acme Corporation" share no head word and are far
    /// apart, so only the fuzzy substring rule can merge them.
    #[test]
    fn integration_fuzzy_clusters_org_name_variants() {
        let r = resolver();
        let entities = vec![
            Entity::new("Acme Corp", EntityType::Organization, 0, 9, 0.9),
            Entity::new("Acme Corporation", EntityType::Organization, 50, 66, 0.9),
        ];
        let resolved = r.resolve(&entities);
        assert_eq!(
            resolved[0].canonical_id, resolved[1].canonical_id,
            "should corefer via fuzzy matching"
        );
    }

    /// Pronoun resolution and exact matching still work with all sieves on.
    #[test]
    fn integration_new_sieves_do_not_break_existing() {
        let r = resolver();
        let entities = vec![
            Entity::new("John Smith", EntityType::Person, 0, 10, 0.9),
            Entity::new("he", EntityType::Person, 15, 17, 0.8),
            Entity::new("John Smith", EntityType::Person, 30, 40, 0.9),
        ];
        let resolved = r.resolve(&entities);
        assert_eq!(resolved[0].canonical_id, resolved[1].canonical_id);
        assert_eq!(resolved[0].canonical_id, resolved[2].canonical_id);
    }

    // ----- head word extraction -----

    /// The head word is the last whitespace-separated token.
    #[test]
    fn head_word_extraction() {
        assert_eq!(SimpleCorefResolver::head_word("President Obama"), "Obama");
        assert_eq!(SimpleCorefResolver::head_word("Obama"), "Obama");
        assert_eq!(
            SimpleCorefResolver::head_word("the United States"),
            "States"
        );
        assert_eq!(SimpleCorefResolver::head_word(""), "");
    }

    // ----- fuzzy name matching -----

    /// Substrings shorter than five characters that are not whole words do
    /// not match.
    #[test]
    fn fuzzy_rejects_short_substrings() {
        let r = resolver();
        assert!(
            !r.names_match("ORG:ceo", "ORG:ceoville"),
            "3-char substring 'ceo' should not match 'ceoville'"
        );
        assert!(
            !r.names_match("PER:the", "PER:other"),
            "'the' should not match 'other' via substring"
        );
        assert!(
            !r.names_match("ORG:art", "ORG:article"),
            "'art' should not match 'article'"
        );
    }

    /// Long-enough substrings with a sufficient length ratio match.
    #[test]
    fn fuzzy_accepts_long_substrings() {
        let r = resolver();
        assert!(
            r.names_match("PER:obama", "PER:barack obama"),
            "'obama' should match 'barack obama'"
        );
        assert!(
            r.names_match("PER:smith", "PER:john smith"),
            "'smith' should match 'john smith'"
        );
    }

    /// A five-character substring that is neither a whole word nor a large
    /// enough share of the longer name is rejected.
    #[test]
    fn fuzzy_rejects_low_ratio_non_word_substrings() {
        let r = resolver();
        assert!(
            !r.names_match("ORG:angel", "ORG:los angeles international airport"),
            "Low-ratio non-word substring should not match"
        );
    }

    /// A whole-word hit matches regardless of the length ratio.
    #[test]
    fn fuzzy_accepts_word_boundary_even_low_ratio() {
        let r = resolver();
        assert!(
            r.names_match("ORG:march", "ORG:march of the penguins documentary film"),
            "Word-boundary match should still work for complete words"
        );
    }

    /// A short name matches when it is a complete word of the longer name.
    #[test]
    fn fuzzy_accepts_word_boundary_match() {
        let r = resolver();
        assert!(
            r.names_match("PER:ceo", "PER:ceo john"),
            "'ceo' should match 'ceo john' at word boundary"
        );
    }

    /// Last-word matching works in both argument orders.
    #[test]
    fn fuzzy_last_word_match() {
        let r = resolver();
        assert!(
            r.names_match("PER:obama", "PER:barack obama"),
            "Last word 'obama' should match multi-word"
        );
        assert!(
            r.names_match("PER:barack obama", "PER:obama"),
            "Last word match should be symmetric"
        );
    }

    /// Different type labels never match, even with identical text.
    #[test]
    fn fuzzy_rejects_different_types() {
        let r = resolver();
        assert!(
            !r.names_match("PER:obama", "ORG:obama"),
            "Different entity types should not match"
        );
    }

    /// Identical keys match.
    #[test]
    fn fuzzy_exact_match() {
        let r = resolver();
        assert!(r.names_match("PER:john", "PER:john"));
    }

    /// Unrelated names must not be linked pairwise, which would otherwise
    /// chain them into a single cluster.
    #[test]
    fn fuzzy_no_transitive_chain_via_short_substrings() {
        let r = resolver();
        assert!(
            !r.names_match("PER:ceo", "PER:thursday"),
            "CEO should not match Thursday"
        );
        assert!(
            !r.names_match("PER:thursday", "PER:shuntaro furukawa"),
            "Thursday should not match Shuntaro Furukawa"
        );
        assert!(
            !r.names_match("PER:ceo", "PER:shuntaro furukawa"),
            "CEO should not match Shuntaro Furukawa"
        );
    }

    // ----- spurious-merge regressions -----

    /// With only fuzzy matching enabled, unrelated short names stay apart.
    #[test]
    fn integration_no_spurious_merge_short_names() {
        let cfg = CorefConfig {
            relaxed_head_match: false,
            proper_containment: false,
            strict_head_match: false,
            proper_head_word_match: false,
            precise_constructs: false,
            acronym_matching: false,
            ..Default::default()
        };
        let r = SimpleCorefResolver::new(cfg);
        let entities = vec![
            Entity::new("CEO", EntityType::Person, 0, 3, 0.9),
            Entity::new("Furukawa", EntityType::Person, 20, 28, 0.9),
        ];
        let resolved = r.resolve(&entities);
        assert_ne!(
            resolved[0].canonical_id, resolved[1].canonical_id,
            "CEO and Furukawa should NOT be in the same cluster via fuzzy matching"
        );
    }

    /// Distinct custom-type mentions are not merged.
    #[test]
    fn proper_entities_not_spuriously_merged() {
        let r = resolver();
        let entities = vec![
            Entity::new(
                "Nobel",
                EntityType::custom("proper", EntityCategory::Misc),
                0,
                5,
                0.8,
            ),
            Entity::new(
                "Emmanuelle",
                EntityType::custom("proper", EntityCategory::Misc),
                20,
                30,
                0.8,
            ),
        ];
        let resolved = r.resolve(&entities);
        assert_ne!(
            resolved[0].canonical_id, resolved[1].canonical_id,
            "wildcard-type entities 'Nobel' and 'Emmanuelle' should NOT be merged"
        );
    }

    /// Two different people with no shared words stay in separate clusters.
    #[test]
    fn coref_does_not_merge_distinct_people() {
        let r = resolver();
        let entities = vec![
            Entity::new("Jennifer Doudna", EntityType::Person, 0, 15, 0.9),
            Entity::new("Emmanuelle Charpentier", EntityType::Person, 20, 42, 0.9),
        ];
        let resolved = r.resolve(&entities);
        assert_ne!(
            resolved[0].canonical_id, resolved[1].canonical_id,
            "Doudna and Charpentier should NOT be in the same cluster"
        );
    }

    /// Exact canonical matches still merge custom-type mentions.
    #[test]
    fn coref_exact_match_still_works_for_wildcard() {
        let r = resolver();
        let entities = vec![
            Entity::new(
                "Nobel",
                EntityType::custom("proper", EntityCategory::Misc),
                0,
                5,
                0.8,
            ),
            Entity::new(
                "Nobel",
                EntityType::custom("proper", EntityCategory::Misc),
                20,
                25,
                0.8,
            ),
        ];
        let resolved = r.resolve(&entities);
        assert_eq!(
            resolved[0].canonical_id, resolved[1].canonical_id,
            "identical wildcard-type entities should still be merged via exact match"
        );
    }
}