1use crate::{Model, Result};
188use anno_core::{Gender, MentionType};
189use std::collections::{HashMap, HashSet};
190
/// A scored (mention, antecedent) candidate pair.
///
/// NOTE(review): the code that builds and consumes these pairs is outside this part of the
/// file; the field descriptions below are inferred from the names and should be confirmed.
#[derive(Debug, Clone)]
struct ScoredPair {
    /// Index of the mention being resolved (presumably into the sorted mention list).
    mention_idx: usize,
    /// Index of the candidate antecedent (presumably into the same list).
    antecedent_idx: usize,
    /// Score assigned to this pair.
    score: f64,
}
201
/// How scored mention pairs are turned into clusters.
///
/// `LeftToRight` is the `Default` variant. The presets `MentionRankingConfig::book_scale`
/// and `MentionRankingConfig::clinical` select `EasyFirst`.
///
/// NOTE(review): the clustering code itself is outside this part of the file, so the exact
/// behavior of each variant is not documented here.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ClusteringStrategy {
    /// Default strategy (presumably processes mentions in document order — confirm).
    #[default]
    LeftToRight,
    /// Alternative strategy (presumably commits the most confident links first — confirm).
    EasyFirst,
}
221
/// Tunable settings for `MentionRankingCoref`.
///
/// `Default` gives a baseline with every optional feature switched off; `book_scale` and
/// `clinical` are presets that switch most of them on.
///
/// NOTE(review): most fields are consumed by scoring/linking code outside this part of the
/// file. Where a description below is hedged, it is inferred from the field name only.
#[derive(Debug, Clone)]
pub struct MentionRankingConfig {
    /// Score threshold used when linking mentions (presumably a minimum — confirm).
    pub link_threshold: f64,

    /// Antecedent-candidate limit returned by `max_antecedents_for_type` for
    /// `MentionType::Pronominal`.
    pub pronoun_max_antecedents: usize,

    /// Antecedent-candidate limit returned by `max_antecedents_for_type` for
    /// `MentionType::Proper`.
    pub proper_max_antecedents: usize,

    /// Antecedent-candidate limit returned by `max_antecedents_for_type` for
    /// `MentionType::Nominal`, and also for `Zero` and `Unknown`.
    pub nominal_max_antecedents: usize,

    /// Distance limit between a mention and its antecedent (unit not visible here — confirm).
    pub max_distance: usize,

    /// Switch for document-wide proper-noun coreference (behavior defined elsewhere).
    pub enable_global_proper_coref: bool,

    /// Threshold paired with `enable_global_proper_coref`.
    pub global_proper_threshold: f64,

    /// Clustering strategy; see `ClusteringStrategy`.
    pub clustering_strategy: ClusteringStrategy,

    /// Switch for non-coreference ("cannot link") constraints (behavior defined elsewhere).
    pub use_non_coref_constraints: bool,

    /// Threshold paired with `use_non_coref_constraints`.
    pub non_coref_threshold: f64,

    /// Weight of the string-match feature in the pair score.
    pub string_match_weight: f64,
    /// Weight of the type-compatibility feature in the pair score.
    pub type_compat_weight: f64,
    /// Weight of the distance feature in the pair score.
    pub distance_weight: f64,

    /// Weight of entity salience in the pair score. The `with_salience` builder clamps
    /// this to `[0.0, 1.0]`; writing the field directly is not clamped.
    pub salience_weight: f64,

    /// Switch for copula ("X is Y") link detection; presumably gates `is_be_phrase_link`.
    pub enable_be_phrase_detection: bool,

    /// Weight paired with `enable_be_phrase_detection`.
    pub be_phrase_weight: f64,

    /// Switch for acronym matching; presumably gates `is_acronym_match`.
    pub enable_acronym_matching: bool,

    /// Weight paired with `enable_acronym_matching`.
    pub acronym_weight: f64,

    /// Switch for context-based filtering; presumably gates `should_filter_by_context`.
    pub enable_context_filtering: bool,

    /// Switch for synonym matching; presumably gates `are_synonyms`.
    pub enable_synonym_matching: bool,

    /// Weight paired with `enable_synonym_matching`.
    pub synonym_weight: f64,

    /// When true, `detect_mentions` also emits determiner + nominalized-adjective
    /// phrases (e.g. "the poor") as `Nominal` mentions.
    pub enable_nominal_adjective_detection: bool,

    /// Language tag (e.g. "en"). `get_pronoun_patterns` uses the lowercased part before
    /// the first `-` to pick a pronoun table; unrecognized codes fall back to English.
    pub language: String,
}
399
400impl Default for MentionRankingConfig {
401 fn default() -> Self {
402 Self {
403 link_threshold: 0.3,
404
405 pronoun_max_antecedents: 30, proper_max_antecedents: 300, nominal_max_antecedents: 300, max_distance: 100,
412
413 enable_global_proper_coref: false, global_proper_threshold: 0.7,
416
417 clustering_strategy: ClusteringStrategy::LeftToRight,
419 use_non_coref_constraints: false,
420 non_coref_threshold: 0.2,
421
422 string_match_weight: 1.0,
424 type_compat_weight: 0.5,
425 distance_weight: 0.1,
426
427 salience_weight: 0.0,
429
430 enable_be_phrase_detection: false,
432 be_phrase_weight: 0.8,
433 enable_acronym_matching: false,
434 acronym_weight: 0.7,
435 enable_context_filtering: false,
436 enable_synonym_matching: false,
437 synonym_weight: 0.5,
438
439 enable_nominal_adjective_detection: false,
441
442 language: "en".to_string(),
444 }
445 }
446}
447
448impl MentionRankingConfig {
449 #[must_use]
456 pub fn book_scale() -> Self {
457 Self {
458 link_threshold: 0.3,
459
460 pronoun_max_antecedents: 30,
462 proper_max_antecedents: 300,
463 nominal_max_antecedents: 300,
464
465 max_distance: 500, enable_global_proper_coref: true,
469 global_proper_threshold: 0.7,
470
471 clustering_strategy: ClusteringStrategy::EasyFirst,
472 use_non_coref_constraints: true,
473 non_coref_threshold: 0.2,
474
475 string_match_weight: 1.0,
477 type_compat_weight: 0.5,
478 distance_weight: 0.05, salience_weight: 0.2,
482
483 enable_be_phrase_detection: true,
485 be_phrase_weight: 0.8,
486 enable_acronym_matching: true,
487 acronym_weight: 0.7,
488 enable_context_filtering: true,
489 enable_synonym_matching: false, synonym_weight: 0.5,
491 enable_nominal_adjective_detection: false,
492 language: "en".to_string(),
493 }
494 }
495
496 #[must_use]
515 pub fn clinical() -> Self {
516 Self {
517 link_threshold: 0.3,
518
519 pronoun_max_antecedents: 30,
521 proper_max_antecedents: 100,
522 nominal_max_antecedents: 100,
523
524 max_distance: 200,
525
526 enable_global_proper_coref: true,
528 global_proper_threshold: 0.6,
529
530 clustering_strategy: ClusteringStrategy::EasyFirst,
532 use_non_coref_constraints: true,
533 non_coref_threshold: 0.2,
534
535 string_match_weight: 1.2,
537 type_compat_weight: 0.5,
538 distance_weight: 0.08,
539
540 salience_weight: 0.15,
542
543 enable_be_phrase_detection: true,
545 be_phrase_weight: 0.9, enable_acronym_matching: true,
547 acronym_weight: 0.8, enable_context_filtering: true,
549 enable_synonym_matching: true, synonym_weight: 0.6,
551 enable_nominal_adjective_detection: false,
552 language: "en".to_string(),
553 }
554 }
555
556 #[must_use]
561 pub fn with_salience(mut self, weight: f64) -> Self {
562 self.salience_weight = weight.clamp(0.0, 1.0);
563 self
564 }
565
566 #[must_use]
568 pub fn max_antecedents_for_type(&self, mention_type: MentionType) -> usize {
569 match mention_type {
570 MentionType::Pronominal => self.pronoun_max_antecedents,
571 MentionType::Proper => self.proper_max_antecedents,
572 MentionType::Nominal => self.nominal_max_antecedents,
573 MentionType::Zero | MentionType::Unknown => self.nominal_max_antecedents,
575 }
576 }
577}
578
/// A candidate mention with the features the ranker works on.
#[derive(Debug, Clone)]
pub struct RankedMention {
    /// Start offset of the mention. Mentions produced by the heuristics in
    /// `detect_mentions` use character (not byte) offsets; offsets copied from an
    /// external entity are taken as-is (presumably also character offsets — confirm).
    pub start: usize,

    /// End offset (exclusive), in the same unit as `start`.
    pub end: usize,

    /// Surface text of the mention.
    pub text: String,

    /// Pronominal / proper / nominal (etc.) classification.
    pub mention_type: MentionType,

    /// Grammatical gender, when known. `None` until set by detection or feature
    /// extraction.
    pub gender: Option<Gender>,

    /// Grammatical number, when known.
    pub number: Option<Number>,

    /// Head word of the mention. When built from an `Entity` this is the last
    /// whitespace-separated token of the text; for detected pronouns it is the
    /// lowercase pronoun pattern.
    pub head: String,
}
656
657impl RankedMention {
658 #[must_use]
660 pub fn span(&self) -> (usize, usize) {
661 (self.start, self.end)
662 }
663}
664
665impl From<&RankedMention> for anno_core::Mention {
669 fn from(mention: &RankedMention) -> Self {
670 Self {
671 text: mention.text.clone(),
672 start: mention.start,
673 end: mention.end,
674 head_start: None,
675 head_end: None,
676 entity_type: None,
677 mention_type: Some(mention.mention_type),
678 }
679 }
680}
681
682impl From<RankedMention> for anno_core::Mention {
683 fn from(mention: RankedMention) -> Self {
684 Self::from(&mention)
685 }
686}
687
688impl From<&crate::Entity> for RankedMention {
692 fn from(entity: &crate::Entity) -> Self {
693 Self {
694 start: entity.start,
695 end: entity.end,
696 text: entity.text.clone(),
697 mention_type: MentionType::classify(&entity.text),
698 gender: None,
699 number: None,
700 head: extract_head(&entity.text),
701 }
702 }
703}
704
705impl From<crate::Entity> for RankedMention {
706 fn from(entity: crate::Entity) -> Self {
707 Self::from(&entity)
708 }
709}
710
/// Returns the last whitespace-separated token of `text` as the head word.
///
/// When `text` has no tokens (empty or whitespace only) it is returned unchanged.
fn extract_head(text: &str) -> String {
    let head = match text.split_whitespace().next_back() {
        Some(last_token) => last_token,
        None => text,
    };
    head.to_owned()
}
715
716pub use anno_core::Number;
719
/// A group of mentions judged to refer to the same entity.
#[derive(Debug, Clone)]
pub struct MentionCluster {
    /// Cluster identifier; `to_track` uses it as the track id.
    pub id: usize,
    /// Member mentions. `canonical_mention` prefers the first `Proper` one and otherwise
    /// the first element, so the order matters.
    pub mentions: Vec<RankedMention>,
}
728
729impl MentionCluster {
730 #[must_use]
738 pub fn to_signals(
739 &self,
740 signal_id_base: anno_core::SignalId,
741 ) -> Vec<anno_core::Signal<anno_core::Location>> {
742 self.mentions
743 .iter()
744 .enumerate()
745 .map(|(idx, mention)| anno_core::Signal {
746 id: signal_id_base + idx as u64,
747 location: anno_core::Location::Text {
748 start: mention.start,
749 end: mention.end,
750 },
751 surface: mention.text.clone(),
752 label: anno_core::TypeLabel::from(mention.mention_type.as_label()),
753 confidence: 1.0,
754 hierarchical: None,
755 provenance: None,
756 modality: anno_core::Modality::Symbolic,
757 normalized: None,
758 negated: false,
759 quantifier: None,
760 })
761 .collect()
762 }
763
764 #[must_use]
775 pub fn to_track(
776 &self,
777 signal_id_base: anno_core::SignalId,
778 ) -> (
779 anno_core::Track,
780 Vec<anno_core::Signal<anno_core::Location>>,
781 ) {
782 let signals = self.to_signals(signal_id_base);
783
784 let canonical_surface = self
786 .mentions
787 .iter()
788 .find(|m| m.mention_type == MentionType::Proper)
789 .or_else(|| self.mentions.first())
790 .map(|m| m.text.clone())
791 .unwrap_or_default();
792
793 let mut track =
795 anno_core::Track::new(anno_core::TrackId::new(self.id as u64), canonical_surface);
796 track.entity_type = None;
798
799 for (idx, _) in signals.iter().enumerate() {
800 track.add_signal(signal_id_base + idx as u64, idx as u32);
801 }
802
803 (track, signals)
804 }
805
806 #[must_use]
808 pub fn canonical_mention(&self) -> Option<&RankedMention> {
809 self.mentions
810 .iter()
811 .find(|m| m.mention_type == MentionType::Proper)
812 .or_else(|| self.mentions.first())
813 }
814}
815
816impl RankedMention {
817 #[must_use]
819 pub fn to_signal(
820 &self,
821 signal_id: anno_core::SignalId,
822 ) -> anno_core::Signal<anno_core::Location> {
823 anno_core::Signal {
824 id: signal_id,
825 location: anno_core::Location::Text {
826 start: self.start,
827 end: self.end,
828 },
829 surface: self.text.clone(),
830 label: anno_core::TypeLabel::from(self.mention_type.as_label()),
831 confidence: 1.0,
832 hierarchical: None,
833 provenance: None,
834 modality: anno_core::Modality::Symbolic,
835 normalized: None,
836 negated: false,
837 quantifier: None,
838 }
839 }
840}
841
/// Mention-ranking coreference resolver.
///
/// Built with `new` / `with_config`, optionally extended with `with_ner` and
/// `with_salience`, and run through `resolve`.
pub struct MentionRankingCoref {
    /// Scoring, search and feature settings.
    config: MentionRankingConfig,
    /// Optional entity extractor; when present, `detect_mentions` adds its entities as
    /// `Proper` mentions.
    ner: Option<Box<dyn Model>>,
    /// Optional salience table keyed by lowercased entity text (see `with_salience`).
    salience_scores: Option<HashMap<String, f64>>,
}
861
862impl std::fmt::Debug for MentionRankingCoref {
863 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
864 f.debug_struct("MentionRankingCoref")
865 .field("config", &self.config)
866 .field("ner", &self.ner.as_ref().map(|_| "Some(dyn Model)"))
867 .field(
868 "salience_scores",
869 &self
870 .salience_scores
871 .as_ref()
872 .map(|s| format!("{} entities", s.len())),
873 )
874 .finish()
875 }
876}
877
878impl MentionRankingCoref {
879 #[must_use]
881 pub fn new() -> Self {
882 Self::with_config(MentionRankingConfig::default())
883 }
884
885 #[must_use]
887 pub fn with_config(config: MentionRankingConfig) -> Self {
888 Self {
889 config,
890 ner: None,
891 salience_scores: None,
892 }
893 }
894
895 pub fn with_ner(mut self, ner: Box<dyn Model>) -> Self {
897 self.ner = Some(ner);
898 self
899 }
900
901 #[must_use]
926 pub fn with_salience(mut self, scores: HashMap<String, f64>) -> Self {
927 let normalized: HashMap<String, f64> = scores
929 .into_iter()
930 .map(|(k, v)| (k.to_lowercase(), v))
931 .collect();
932 self.salience_scores = Some(normalized);
933 self
934 }
935
936 fn get_salience(&self, text: &str) -> f64 {
938 self.salience_scores
939 .as_ref()
940 .and_then(|s| s.get(&text.to_lowercase()).copied())
941 .unwrap_or(0.0)
942 }
943
944 fn is_be_phrase_link(&self, text: &str, m1: &RankedMention, m2: &RankedMention) -> bool {
959 let (earlier, later) = if m1.end <= m2.start {
961 (m1, m2)
962 } else if m2.end <= m1.start {
963 (m2, m1)
964 } else {
965 return false; };
967
968 let text_chars: Vec<char> = text.chars().collect();
970 if later.start > text_chars.len() || earlier.end > text_chars.len() {
971 return false;
972 }
973
974 let between: String = text_chars
975 .get(earlier.end..later.start)
976 .unwrap_or(&[])
977 .iter()
978 .collect();
979 let between_lower = between.to_lowercase();
980
981 static BE_PATTERNS: &[&str] = &[
983 " is ",
984 " are ",
985 " was ",
986 " were ",
987 " be ",
988 " being ",
989 " been ",
990 " refers to ",
991 " means ",
992 " indicates ",
993 " represents ",
994 " also known as ",
995 " aka ",
996 " i.e. ",
997 " ie ",
998 " namely ",
999 " called ",
1000 " named ",
1001 " known as ",
1002 " defined as ",
1003 ];
1004
1005 BE_PATTERNS.iter().any(|p| between_lower.contains(p))
1006 }
1007
1008 fn is_acronym_match(&self, m1: &RankedMention, m2: &RankedMention) -> bool {
1022 anno_core::coalesce::similarity::is_acronym_match(&m1.text, &m2.text)
1023 }
1024
1025 fn is_pleonastic_it(&self, text_lower: &str, it_byte_pos: usize) -> bool {
1037 let after_it = &text_lower[it_byte_pos + 2..]; let after_it_trimmed = after_it.trim_start();
1040
1041 const WEATHER_VERBS: &[&str] = &[
1043 "rain",
1044 "rains",
1045 "rained",
1046 "raining",
1047 "snow",
1048 "snows",
1049 "snowed",
1050 "snowing",
1051 "hail",
1052 "hails",
1053 "hailed",
1054 "hailing",
1055 "thunder",
1056 "thunders",
1057 "thundered",
1058 "thundering",
1059 ];
1060
1061 const WEATHER_ADJS: &[&str] = &[
1063 "sunny", "cloudy", "foggy", "windy", "rainy", "snowy", "cold", "hot", "warm", "cool",
1064 "humid", "dry", "freezing", "chilly", "muggy", "overcast",
1065 ];
1066
1067 const MODAL_ADJS: &[&str] = &[
1069 "important",
1070 "necessary",
1071 "possible",
1072 "impossible",
1073 "likely",
1074 "unlikely",
1075 "clear",
1076 "obvious",
1077 "evident",
1078 "apparent",
1079 "true",
1080 "false",
1081 "certain",
1082 "uncertain",
1083 "doubtful",
1084 "essential",
1085 "vital",
1086 "crucial",
1087 "critical",
1088 "imperative",
1089 "fortunate",
1090 "unfortunate",
1091 "surprising",
1092 "unsurprising",
1093 "strange",
1094 "odd",
1095 "weird",
1096 "remarkable",
1097 "noteworthy",
1098 "known",
1099 "unknown",
1100 "believed",
1101 "thought",
1102 "said",
1103 "reported",
1104 "estimated",
1105 "assumed",
1106 "expected",
1107 "hoped",
1108 "feared",
1109 ];
1110
1111 const COGNITIVE_VERBS: &[&str] = &[
1113 "seems",
1114 "seem",
1115 "seemed",
1116 "appears",
1117 "appear",
1118 "appeared",
1119 "turns out",
1120 "turned out",
1121 "happens",
1122 "happen",
1123 "happened",
1124 "follows",
1125 "follow",
1126 "followed",
1127 "matters",
1128 "matter",
1129 "mattered",
1130 "helps",
1131 "help",
1132 "helped",
1133 "hurts",
1134 "hurt",
1135 ];
1136
1137 for verb in WEATHER_VERBS {
1139 if let Some(after_verb) = after_it_trimmed.strip_prefix(verb) {
1140 if after_verb.is_empty() || after_verb.starts_with(|c: char| !c.is_alphanumeric()) {
1141 return true;
1142 }
1143 }
1144 }
1145
1146 for verb in COGNITIVE_VERBS {
1148 if let Some(after_verb) = after_it_trimmed.strip_prefix(verb) {
1149 if after_verb.is_empty() || after_verb.starts_with(|c: char| !c.is_alphanumeric()) {
1150 return true;
1151 }
1152 }
1153 }
1154
1155 let copula_patterns = ["is ", "was ", "'s ", "has been ", "will be ", "would be "];
1158 for copula in copula_patterns {
1159 if let Some(after_copula) = after_it_trimmed.strip_prefix(copula) {
1160 let after_copula = after_copula.trim_start();
1161
1162 for verb in WEATHER_VERBS {
1164 if let Some(after_verb) = after_copula.strip_prefix(verb) {
1165 if after_verb.is_empty()
1166 || after_verb.starts_with(|c: char| !c.is_alphanumeric())
1167 {
1168 return true;
1169 }
1170 }
1171 }
1172
1173 for adj in WEATHER_ADJS {
1175 if let Some(after_adj) = after_copula.strip_prefix(adj) {
1176 if after_adj.is_empty()
1177 || after_adj.starts_with(|c: char| !c.is_alphanumeric())
1178 {
1179 return true;
1180 }
1181 }
1182 }
1183
1184 for adj in MODAL_ADJS {
1186 if let Some(after_adj) = after_copula.strip_prefix(adj) {
1187 if after_adj.is_empty()
1189 || after_adj.starts_with(" that")
1190 || after_adj.starts_with(" to")
1191 || after_adj.starts_with(|c: char| !c.is_alphanumeric())
1192 {
1193 return true;
1194 }
1195 }
1196 }
1197
1198 let time_words = ["noon", "midnight", "morning", "evening", "night", "time"];
1201 for tw in time_words {
1202 if after_copula.starts_with(tw) {
1203 return true;
1204 }
1205 }
1206
1207 if after_copula.starts_with(|c: char| c.is_ascii_digit()) {
1209 return true;
1210 }
1211 }
1212 }
1213
1214 false
1215 }
1216
1217 fn should_filter_by_context(&self, text: &str, m1: &RankedMention, m2: &RankedMention) -> bool {
1225 let text_chars: Vec<char> = text.chars().collect();
1226 let char_count = text_chars.len();
1227
1228 let context_window = 20;
1230
1231 let m1_context_start = m1.start.saturating_sub(context_window);
1232 let m1_context_end = (m1.end + context_window).min(char_count);
1233 let m1_context: String = text_chars
1234 .get(m1_context_start..m1_context_end)
1235 .unwrap_or(&[])
1236 .iter()
1237 .collect();
1238
1239 let m2_context_start = m2.start.saturating_sub(context_window);
1240 let m2_context_end = (m2.end + context_window).min(char_count);
1241 let m2_context: String = text_chars
1242 .get(m2_context_start..m2_context_end)
1243 .unwrap_or(&[])
1244 .iter()
1245 .collect();
1246
1247 let date1 = Self::extract_date(&m1_context);
1249 let date2 = Self::extract_date(&m2_context);
1250 if let (Some(d1), Some(d2)) = (&date1, &date2) {
1251 if d1 != d2 {
1252 return true; }
1254 }
1255
1256 let m1_negated = Self::has_negation_context(&m1_context);
1259 let m2_negated = Self::has_negation_context(&m2_context);
1260 if m1_negated != m2_negated {
1261 return true;
1262 }
1263
1264 false
1265 }
1266
1267 fn extract_date(context: &str) -> Option<String> {
1269 let date_patterns = [
1271 r"\d{4}-\d{2}-\d{2}", r"\d{2}/\d{2}/\d{4}", r"\d{1,2}/\d{1,2}/\d{2,4}", ];
1275
1276 for pattern in &date_patterns {
1277 if let Ok(re) = regex::Regex::new(pattern) {
1278 if let Some(m) = re.find(context) {
1279 return Some(m.as_str().to_string());
1280 }
1281 }
1282 }
1283 None
1284 }
1285
/// Reports whether `context` contains a negation cue ("not ", "no ", "denies ", ...).
///
/// Matching is case-insensitive. A cue only counts when it starts at a word boundary,
/// i.e. at the start of the context or after a non-alphanumeric char. Without that check,
/// "no " would fire inside "piano " or "amino ", and "not " inside "knot ".
/// "cannot " is listed explicitly because the boundary rule would otherwise stop it from
/// matching through "not ".
fn has_negation_context(context: &str) -> bool {
    static NEGATION_MARKERS: &[&str] = &[
        "not ",
        "no ",
        "never ",
        "without ",
        "denies ",
        "denied ",
        "negative for ",
        "neg for ",
        "ruled out ",
        "r/o ",
        "cannot ",
    ];

    let lower = context.to_lowercase();

    NEGATION_MARKERS.iter().any(|marker| {
        lower.match_indices(marker).any(|(byte_idx, _)| {
            // `byte_idx` is a char boundary, so the slice is valid.
            lower[..byte_idx]
                .chars()
                .next_back()
                .map_or(true, |prev| !prev.is_alphanumeric())
        })
    })
}
1303
1304 fn are_synonyms(&self, m1: &RankedMention, m2: &RankedMention) -> bool {
1332 let t1 = m1.text.to_lowercase();
1333 let t2 = m2.text.to_lowercase();
1334
1335 if t1 == t2 {
1336 return true;
1337 }
1338
1339 let similarity = anno_core::coalesce::similarity::multilingual_similarity(&t1, &t2);
1343 similarity > 0.8
1344 }
1345
1346 pub fn resolve(&self, text: &str) -> Result<Vec<MentionCluster>> {
1348 if text.trim().is_empty() {
1349 return Ok(vec![]);
1350 }
1351
1352 let mut mentions = self.detect_mentions(text)?;
1354
1355 if mentions.is_empty() {
1356 return Ok(vec![]);
1357 }
1358
1359 mentions.sort_by_key(|m| (m.start, m.end));
1361
1362 for mention in &mut mentions {
1364 self.extract_features(mention);
1365 }
1366
1367 let clusters = self.link_mentions(&mentions, text);
1369
1370 Ok(clusters)
1371 }
1372
/// Returns the pronoun table for the configured language.
///
/// Each entry is `(lowercase surface form, gender, number)`. The language is taken from
/// the part of `config.language` before the first `-`, lowercased, so "de-DE" and "DE"
/// both select German. Codes without a table of their own fall back to English.
///
/// Some surface forms appear more than once with different features (German "sie",
/// "ihr", "sein"). `detect_mentions` emits one candidate per entry and then drops
/// overlapping candidates with a stable sort, so for identical spans the entry listed
/// first is the one that survives.
fn get_pronoun_patterns(&self) -> Vec<(&'static str, Gender, Number)> {
    // `split` always yields at least one piece, so the fallback is never taken.
    let lang_code = self
        .config
        .language
        .split('-')
        .next()
        .unwrap_or(&self.config.language)
        .to_lowercase();

    match lang_code.as_str() {
        // Spanish: subject, object, possessive and reflexive forms, plus "elle(s)".
        "es" => vec![
            ("él", Gender::Masculine, Number::Singular),
            ("ella", Gender::Feminine, Number::Singular),
            ("ellos", Gender::Masculine, Number::Plural),
            ("ellas", Gender::Feminine, Number::Plural),
            ("lo", Gender::Masculine, Number::Singular),
            ("la", Gender::Feminine, Number::Singular),
            ("los", Gender::Masculine, Number::Plural),
            ("las", Gender::Feminine, Number::Plural),
            ("le", Gender::Unknown, Number::Singular),
            ("les", Gender::Unknown, Number::Plural),
            ("su", Gender::Unknown, Number::Unknown),
            ("sus", Gender::Unknown, Number::Plural),
            ("suyo", Gender::Masculine, Number::Singular),
            ("suya", Gender::Feminine, Number::Singular),
            ("suyos", Gender::Masculine, Number::Plural),
            ("suyas", Gender::Feminine, Number::Plural),
            ("se", Gender::Unknown, Number::Unknown),
            ("nosotros", Gender::Masculine, Number::Plural),
            ("nosotras", Gender::Feminine, Number::Plural),
            ("vosotros", Gender::Masculine, Number::Plural),
            ("vosotras", Gender::Feminine, Number::Plural),
            ("usted", Gender::Unknown, Number::Singular),
            ("ustedes", Gender::Unknown, Number::Plural),
            ("elle", Gender::Unknown, Number::Singular),
            ("elles", Gender::Unknown, Number::Plural),
        ],
        // French, including "iel(s)".
        "fr" => vec![
            ("il", Gender::Masculine, Number::Singular),
            ("elle", Gender::Feminine, Number::Singular),
            ("ils", Gender::Masculine, Number::Plural),
            ("elles", Gender::Feminine, Number::Plural),
            ("le", Gender::Masculine, Number::Singular),
            ("la", Gender::Feminine, Number::Singular),
            ("les", Gender::Unknown, Number::Plural),
            ("lui", Gender::Unknown, Number::Singular),
            ("leur", Gender::Unknown, Number::Plural),
            ("son", Gender::Masculine, Number::Singular),
            ("sa", Gender::Feminine, Number::Singular),
            ("ses", Gender::Unknown, Number::Plural),
            ("se", Gender::Unknown, Number::Unknown),
            ("nous", Gender::Unknown, Number::Plural),
            ("vous", Gender::Unknown, Number::Unknown),
            ("iel", Gender::Unknown, Number::Singular),
            ("iels", Gender::Unknown, Number::Plural),
        ],
        // German. "sie", "ihr" and "sein" are listed several times with different
        // features; see the note on duplicates in the function docs.
        "de" => vec![
            ("er", Gender::Masculine, Number::Singular),
            ("sie", Gender::Feminine, Number::Singular),
            ("es", Gender::Neutral, Number::Singular),
            ("sie", Gender::Unknown, Number::Plural),
            ("ihn", Gender::Masculine, Number::Singular),
            ("ihr", Gender::Feminine, Number::Singular),
            ("ihm", Gender::Masculine, Number::Singular),
            ("ihnen", Gender::Unknown, Number::Plural),
            ("sein", Gender::Masculine, Number::Singular),
            ("seine", Gender::Feminine, Number::Singular),
            ("sein", Gender::Neutral, Number::Singular),
            ("ihre", Gender::Feminine, Number::Singular),
            ("ihr", Gender::Unknown, Number::Plural),
            ("sich", Gender::Unknown, Number::Unknown),
            ("wir", Gender::Unknown, Number::Plural),
            ("ihr", Gender::Unknown, Number::Plural),
            ("sie", Gender::Unknown, Number::Plural),
            ("sier", Gender::Unknown, Number::Singular),
            ("xier", Gender::Unknown, Number::Singular),
            ("dier", Gender::Unknown, Number::Singular),
        ],
        // Arabic.
        "ar" => vec![
            ("هو", Gender::Masculine, Number::Singular),
            ("هي", Gender::Feminine, Number::Singular),
            ("هم", Gender::Masculine, Number::Plural),
            ("هن", Gender::Feminine, Number::Plural),
            ("هما", Gender::Unknown, Number::Plural),
        ],
        // Russian.
        "ru" => vec![
            ("он", Gender::Masculine, Number::Singular),
            ("она", Gender::Feminine, Number::Singular),
            ("оно", Gender::Neutral, Number::Singular),
            ("они", Gender::Unknown, Number::Plural),
            ("его", Gender::Masculine, Number::Singular),
            ("её", Gender::Feminine, Number::Singular),
            ("их", Gender::Unknown, Number::Plural),
            ("себя", Gender::Unknown, Number::Unknown),
            ("мы", Gender::Unknown, Number::Plural),
            ("вы", Gender::Unknown, Number::Unknown),
        ],
        // Chinese.
        // NOTE(review): `detect_mentions` only accepts a match whose neighbors are not
        // alphanumeric. CJK characters presumably count as alphanumeric, so pronouns
        // inside unsegmented zh/ja/ko text may rarely match — confirm.
        "zh" => vec![
            ("他", Gender::Masculine, Number::Singular),
            ("她", Gender::Feminine, Number::Singular),
            ("它", Gender::Neutral, Number::Singular),
            ("牠", Gender::Neutral, Number::Singular),
            ("祂", Gender::Neutral, Number::Singular),
            ("怹", Gender::Unknown, Number::Singular),
            ("其", Gender::Unknown, Number::Singular),
            ("他们", Gender::Masculine, Number::Plural),
            ("她们", Gender::Feminine, Number::Plural),
            ("它们", Gender::Neutral, Number::Plural),
        ],
        // Japanese.
        "ja" => vec![
            ("彼", Gender::Masculine, Number::Singular),
            ("彼女", Gender::Feminine, Number::Singular),
            ("彼ら", Gender::Unknown, Number::Plural),
            ("その人", Gender::Unknown, Number::Singular),
            ("あの人", Gender::Unknown, Number::Singular),
        ],
        // Korean.
        "ko" => vec![
            ("그", Gender::Masculine, Number::Singular),
            ("그녀", Gender::Feminine, Number::Singular),
            ("그들", Gender::Unknown, Number::Plural),
            ("그 사람", Gender::Unknown, Number::Singular),
            ("그분", Gender::Unknown, Number::Singular),
        ],
        // English, and the fallback for every other language code.
        _ => {
            vec![
                // Third person; "they"/"them"/"their(s)" leave number open.
                ("he", Gender::Masculine, Number::Singular),
                ("she", Gender::Feminine, Number::Singular),
                ("it", Gender::Neutral, Number::Singular),
                ("they", Gender::Unknown, Number::Unknown),
                ("him", Gender::Masculine, Number::Singular),
                ("her", Gender::Feminine, Number::Singular),
                ("them", Gender::Unknown, Number::Unknown),
                ("his", Gender::Masculine, Number::Singular),
                ("hers", Gender::Feminine, Number::Singular),
                ("its", Gender::Neutral, Number::Singular),
                ("their", Gender::Unknown, Number::Unknown),
                ("theirs", Gender::Unknown, Number::Unknown),
                ("themself", Gender::Unknown, Number::Singular),
                ("themselves", Gender::Unknown, Number::Plural),
                ("himself", Gender::Masculine, Number::Singular),
                ("herself", Gender::Feminine, Number::Singular),
                ("itself", Gender::Neutral, Number::Singular),
                // First person.
                ("i", Gender::Unknown, Number::Singular),
                ("me", Gender::Unknown, Number::Singular),
                ("my", Gender::Unknown, Number::Singular),
                ("mine", Gender::Unknown, Number::Singular),
                ("myself", Gender::Unknown, Number::Singular),
                ("we", Gender::Unknown, Number::Plural),
                ("us", Gender::Unknown, Number::Plural),
                ("our", Gender::Unknown, Number::Plural),
                ("ours", Gender::Unknown, Number::Plural),
                ("ourselves", Gender::Unknown, Number::Plural),
                // Second person.
                ("you", Gender::Unknown, Number::Unknown),
                ("your", Gender::Unknown, Number::Unknown),
                ("yours", Gender::Unknown, Number::Unknown),
                ("yourself", Gender::Unknown, Number::Singular),
                ("yourselves", Gender::Unknown, Number::Plural),
                // Neopronoun sets: ze/hir, xe/xem, ey/em, fae/faer.
                ("ze", Gender::Unknown, Number::Singular),
                ("hir", Gender::Unknown, Number::Singular),
                ("hirs", Gender::Unknown, Number::Singular),
                ("hirself", Gender::Unknown, Number::Singular),
                ("xe", Gender::Unknown, Number::Singular),
                ("xem", Gender::Unknown, Number::Singular),
                ("xyr", Gender::Unknown, Number::Singular),
                ("xyrs", Gender::Unknown, Number::Singular),
                ("xemself", Gender::Unknown, Number::Singular),
                ("ey", Gender::Unknown, Number::Singular),
                ("em", Gender::Unknown, Number::Singular),
                ("eir", Gender::Unknown, Number::Singular),
                ("eirs", Gender::Unknown, Number::Singular),
                ("emself", Gender::Unknown, Number::Singular),
                ("fae", Gender::Unknown, Number::Singular),
                ("faer", Gender::Unknown, Number::Singular),
                ("faers", Gender::Unknown, Number::Singular),
                ("faerself", Gender::Unknown, Number::Singular),
                // Demonstratives.
                ("this", Gender::Unknown, Number::Singular),
                ("that", Gender::Unknown, Number::Singular),
                ("these", Gender::Unknown, Number::Plural),
                ("those", Gender::Unknown, Number::Plural),
                // Indefinites.
                ("someone", Gender::Unknown, Number::Singular),
                ("somebody", Gender::Unknown, Number::Singular),
                ("anyone", Gender::Unknown, Number::Singular),
                ("anybody", Gender::Unknown, Number::Singular),
                ("everyone", Gender::Unknown, Number::Singular),
                ("everybody", Gender::Unknown, Number::Singular),
                ("no one", Gender::Unknown, Number::Singular),
                ("nobody", Gender::Unknown, Number::Singular),
                // Generic "one".
                ("one", Gender::Unknown, Number::Singular),
                ("oneself", Gender::Unknown, Number::Singular),
                // Relative / interrogative.
                ("who", Gender::Unknown, Number::Unknown),
                ("whom", Gender::Unknown, Number::Unknown),
                ("whose", Gender::Unknown, Number::Unknown),
                ("which", Gender::Unknown, Number::Unknown),
                // Reciprocals.
                ("each other", Gender::Unknown, Number::Plural),
                ("one another", Gender::Unknown, Number::Plural),
            ]
        }
    }
}
1619
1620 fn detect_mentions(&self, text: &str) -> Result<Vec<RankedMention>> {
1622 let mut mentions = Vec::new();
1623
1624 if let Some(ref ner) = self.ner {
1626 let entities = ner.extract_entities(text, None)?;
1627 for entity in entities {
1628 mentions.push(RankedMention {
1629 start: entity.start,
1630 end: entity.end,
1631 text: entity.text.clone(),
1632 mention_type: MentionType::Proper,
1633 gender: None,
1634 number: None,
1635 head: self.get_head(&entity.text),
1636 });
1637 }
1638 }
1639
1640 let pronoun_patterns = self.get_pronoun_patterns();
1657
1658 let text_lower = text.to_lowercase();
1767 let text_chars: Vec<char> = text.chars().collect();
1768 for (pronoun, gender, number) in pronoun_patterns {
1769 let mut search_start_byte = 0;
1770 while let Some(pos) = text_lower[search_start_byte..].find(pronoun) {
1771 let abs_byte_pos = search_start_byte + pos;
1772 let end_byte_pos = abs_byte_pos + pronoun.len();
1773
1774 let char_pos = text[..abs_byte_pos].chars().count();
1776 let end_char_pos = char_pos + pronoun.chars().count();
1777
1778 let is_word_start = char_pos == 0
1780 || match text_chars.get(char_pos.saturating_sub(1)) {
1781 None => true,
1782 Some(c) => !c.is_alphanumeric(),
1783 };
1784 let is_word_end = end_char_pos >= text_chars.len()
1785 || match text_chars.get(end_char_pos) {
1786 None => true,
1787 Some(c) => !c.is_alphanumeric(),
1788 };
1789
1790 if is_word_start && is_word_end {
1791 if pronoun == "it" && self.is_pleonastic_it(&text_lower, abs_byte_pos) {
1794 search_start_byte = end_byte_pos;
1795 continue;
1796 }
1797
1798 let char_start = char_pos;
1800 let char_end = end_char_pos;
1801
1802 mentions.push(RankedMention {
1803 start: char_start,
1804 end: char_end,
1805 text: text[abs_byte_pos..end_byte_pos].to_string(),
1806 mention_type: MentionType::Pronominal,
1807 gender: Some(gender),
1808 number: Some(number),
1809 head: pronoun.to_string(),
1810 });
1811 }
1812
1813 search_start_byte = end_byte_pos;
1814 }
1815 }
1816
1817 let words: Vec<_> = text.split_whitespace().collect();
1819 let mut search_byte_pos = 0; for (i, word) in words.iter().enumerate() {
1822 let at_sentence_start = i == 0
1824 || match text[..text.find(word).unwrap_or(0)].chars().last() {
1825 None => true,
1826 Some(c) => c == '.' || c == '!' || c == '?',
1827 };
1828
1829 if !at_sentence_start
1830 && word.chars().next().is_some_and(|c| c.is_uppercase())
1831 && word.chars().count() > 1
1832 {
1834 if let Some(rel_byte_pos) = text[search_byte_pos..].find(word) {
1836 let abs_byte_pos = search_byte_pos + rel_byte_pos;
1837 let char_start = text[..abs_byte_pos].chars().count();
1839 let char_end = char_start + word.chars().count();
1840
1841 mentions.push(RankedMention {
1842 start: char_start,
1843 end: char_end,
1844 text: word.to_string(),
1845 mention_type: MentionType::Proper,
1846 gender: None,
1847 number: Some(Number::Singular),
1848 head: word.to_string(),
1849 });
1850 }
1851 }
1852
1853 search_byte_pos += word.len() + 1; }
1855
1856 if self.config.enable_nominal_adjective_detection {
1865 const NOMINALIZED_ADJECTIVES: &[&str] = &[
1868 "poor",
1870 "rich",
1871 "wealthy",
1872 "homeless",
1873 "unemployed",
1874 "employed",
1875 "young",
1877 "old",
1878 "elderly",
1879 "aged",
1880 "sick",
1882 "ill",
1883 "healthy",
1884 "wounded",
1885 "injured",
1886 "disabled",
1887 "blind",
1888 "deaf",
1889 "dead",
1891 "living",
1892 "deceased",
1893 "accused",
1895 "condemned",
1896 "convicted",
1897 "guilty",
1898 "innocent",
1899 "insured",
1900 "uninsured",
1901 "gifted",
1903 "talented",
1904 "educated",
1905 "literate",
1906 "illiterate",
1907 "powerful",
1909 "powerless",
1910 "oppressed",
1911 "weak",
1912 "famous",
1913 "infamous",
1914 "righteous",
1916 "wicked",
1917 "blessed",
1918 "damned",
1919 "faithful",
1920 "hungry",
1922 "needy",
1923 "privileged",
1924 "underprivileged",
1925 "disadvantaged",
1926 "marginalized",
1927 ];
1928
1929 let (determiners, adjectives): (Vec<&str>, Vec<&str>) =
1935 match self.config.language.as_str() {
1936 "de" => {
1937 let dets = vec!["die ", "diese ", "jene "];
1940 let adjs = vec![
1941 "armen",
1942 "reichen",
1943 "alten",
1944 "jungen",
1945 "kranken",
1946 "gesunden",
1947 "toten",
1948 "lebenden",
1949 "blinden",
1950 "tauben",
1951 "arbeitslosen",
1952 "obdachlosen",
1953 "mächtigen",
1954 "schwachen",
1955 "unterdrückten",
1956 ];
1957 (dets, adjs)
1958 }
1959 "fr" => {
1960 let dets = vec!["les ", "ces "];
1962 let adjs = vec![
1963 "pauvres",
1964 "riches",
1965 "vieux",
1966 "jeunes",
1967 "malades",
1968 "morts",
1969 "vivants",
1970 "aveugles",
1971 "sourds",
1972 "faibles",
1973 "puissants",
1974 "opprimés",
1975 "affamés",
1976 "marginalisés",
1977 ];
1978 (dets, adjs)
1979 }
1980 "es" => {
1981 let dets = vec!["los ", "las ", "estos ", "estas "];
1984 let adjs = vec![
1985 "pobres",
1986 "ricos",
1987 "viejos",
1988 "jóvenes",
1989 "enfermos",
1990 "muertos",
1991 "vivos",
1992 "ciegos",
1993 "sordos",
1994 "débiles",
1995 "poderosos",
1996 "oprimidos",
1997 "hambrientos",
1998 "marginados",
1999 ];
2000 (dets, adjs)
2001 }
2002 _ => {
2003 let dets = vec!["the ", "these ", "those "];
2005 (dets, NOMINALIZED_ADJECTIVES.to_vec())
2006 }
2007 };
2008
2009 for det in &determiners {
2010 for adj in &adjectives {
2011 let pattern = format!("{}{}", det, adj);
2012 let pattern_lower = pattern.to_lowercase();
2013
2014 let mut search_start = 0;
2015 while let Some(rel_pos) = text_lower[search_start..].find(&pattern_lower) {
2016 let abs_byte_pos = search_start + rel_pos;
2017 let end_byte_pos = abs_byte_pos + pattern.len();
2018
2019 let following_text = &text_lower[end_byte_pos..];
2026 let next_word: String = following_text
2027 .chars()
2028 .skip_while(|c| c.is_whitespace())
2029 .take_while(|c| c.is_alphabetic())
2030 .collect();
2031
2032 let valid_followers: Vec<&str> = match self.config.language.as_str() {
2034 "de" => vec![
2035 "sind", "waren", "haben", "hatten", "werden", "wurden", "brauchen",
2037 "müssen", "können", "sollen", "wollen", "und", "oder", "aber", "die", "welche",
2039 ],
2040 "fr" => vec![
2041 "sont",
2043 "étaient",
2044 "ont",
2045 "avaient",
2046 "seront",
2047 "peuvent",
2048 "doivent",
2049 "veulent",
2050 "méritent",
2051 "et",
2053 "ou",
2054 "mais",
2055 "qui",
2056 "que",
2057 ],
2058 "es" => vec![
2059 "son",
2061 "eran",
2062 "tienen",
2063 "tenían",
2064 "serán",
2065 "pueden",
2066 "deben",
2067 "quieren",
2068 "merecen",
2069 "necesitan",
2070 "sufren",
2071 "luchan",
2072 "reciben",
2073 "buscan",
2074 "y",
2076 "o",
2077 "pero",
2078 "que",
2079 "quienes",
2080 ],
2081 _ => vec![
2082 "are", "were", "is", "was", "be", "been", "being", "have", "has",
2084 "had", "having", "do", "does", "did", "can", "could", "will",
2085 "would", "shall", "should", "may", "might", "must", "need", "want",
2086 "get", "got", "struggle", "suffer", "deserve", "receive", "face",
2087 "lack", "seek", "and", "or", "but", "who", "whom", "whose", "that",
2088 "which", "in", "of", "from", "with", "without", "among",
2089 ],
2090 };
2091
2092 let is_valid_nominal =
2094 next_word.is_empty() || valid_followers.contains(&next_word.as_str());
2095
2096 if is_valid_nominal {
2097 let char_start = text[..abs_byte_pos].chars().count();
2099 let char_end = char_start + pattern.chars().count();
2100
2101 mentions.push(RankedMention {
2102 start: char_start,
2103 end: char_end,
2104 text: text[abs_byte_pos..end_byte_pos].to_string(),
2105 mention_type: MentionType::Nominal,
2106 gender: Some(Gender::Unknown), number: Some(Number::Plural), head: adj.to_string(), });
2110 }
2111
2112 search_start = end_byte_pos;
2113 }
2114 }
2115 }
2116 }
2117
2118 mentions.sort_by_key(|m| (m.start, std::cmp::Reverse(m.end)));
2120 let mut deduped = Vec::new();
2121 let mut covered_end = 0;
2122
2123 for mention in mentions {
2124 if mention.start >= covered_end {
2125 covered_end = mention.end;
2126 deduped.push(mention);
2127 }
2128 }
2129
2130 Ok(deduped)
2131 }
2132
2133 fn extract_features(&self, mention: &mut RankedMention) {
2135 if mention.gender.is_none() && mention.mention_type == MentionType::Proper {
2137 mention.gender = self.guess_gender(&mention.text);
2138 }
2139
2140 if mention.number.is_none() {
2142 mention.number = Some(Number::Singular); }
2144 }
2145
2146 fn guess_gender(&self, text: &str) -> Option<Gender> {
2148 let masc_names = [
2149 "john", "james", "michael", "david", "robert", "william", "richard",
2150 ];
2151 let fem_names = [
2152 "mary",
2153 "jennifer",
2154 "lisa",
2155 "sarah",
2156 "jessica",
2157 "emily",
2158 "elizabeth",
2159 ];
2160
2161 let first_word = text.split_whitespace().next()?.to_lowercase();
2162
2163 if masc_names.contains(&first_word.as_str()) {
2164 Some(Gender::Masculine)
2165 } else if fem_names.contains(&first_word.as_str()) {
2166 Some(Gender::Feminine)
2167 } else {
2168 None
2169 }
2170 }
2171
2172 fn get_head(&self, text: &str) -> String {
2174 text.split_whitespace().last().unwrap_or(text).to_string()
2176 }
2177
2178 fn link_mentions(&self, mentions: &[RankedMention], text: &str) -> Vec<MentionCluster> {
2185 match self.config.clustering_strategy {
2186 ClusteringStrategy::LeftToRight => self.link_mentions_left_to_right(mentions, text),
2187 ClusteringStrategy::EasyFirst => self.link_mentions_easy_first(mentions, text),
2188 }
2189 }
2190
2191 fn link_mentions_left_to_right(
2193 &self,
2194 mentions: &[RankedMention],
2195 text: &str,
2196 ) -> Vec<MentionCluster> {
2197 let mut mention_to_cluster: HashMap<usize, usize> = HashMap::new();
2198 let mut clusters: Vec<Vec<usize>> = Vec::new();
2199
2200 for (i, mention) in mentions.iter().enumerate() {
2201 let mut best_antecedent: Option<usize> = None;
2202 let mut best_score = self.config.link_threshold;
2203
2204 let max_antecedents = self.config.max_antecedents_for_type(mention.mention_type);
2206
2207 for j in (0..i).rev().take(max_antecedents) {
2209 let antecedent = &mentions[j];
2210
2211 let distance = mention.start.saturating_sub(antecedent.end);
2213 if distance > self.config.max_distance {
2214 break;
2215 }
2216
2217 let score = self.score_pair(mention, antecedent, distance, Some(text));
2218 if score > best_score {
2219 best_score = score;
2220 best_antecedent = Some(j);
2221 }
2222 }
2223
2224 if let Some(ant_idx) = best_antecedent {
2225 if let Some(&cluster_id) = mention_to_cluster.get(&ant_idx) {
2227 clusters[cluster_id].push(i);
2228 mention_to_cluster.insert(i, cluster_id);
2229 } else {
2230 let cluster_id = clusters.len();
2232 clusters.push(vec![ant_idx, i]);
2233 mention_to_cluster.insert(ant_idx, cluster_id);
2234 mention_to_cluster.insert(i, cluster_id);
2235 }
2236 }
2237 }
2238
2239 let clusters = if self.config.enable_global_proper_coref {
2241 self.apply_global_proper_coref(mentions, clusters)
2242 } else {
2243 clusters
2244 };
2245
2246 clusters
2248 .into_iter()
2249 .enumerate()
2250 .map(|(id, indices)| MentionCluster {
2251 id,
2252 mentions: indices.into_iter().map(|i| mentions[i].clone()).collect(),
2253 })
2254 .collect()
2255 }
2256
2257 fn link_mentions_easy_first(
2262 &self,
2263 mentions: &[RankedMention],
2264 text: &str,
2265 ) -> Vec<MentionCluster> {
2266 let mut scored_pairs: Vec<ScoredPair> = Vec::new();
2268 let mut non_coref_pairs: HashSet<(usize, usize)> = HashSet::new();
2269
2270 for (i, mention) in mentions.iter().enumerate() {
2271 let max_antecedents = self.config.max_antecedents_for_type(mention.mention_type);
2272
2273 for j in (0..i).rev().take(max_antecedents) {
2274 let antecedent = &mentions[j];
2275 let distance = mention.start.saturating_sub(antecedent.end);
2276 if distance > self.config.max_distance {
2277 break;
2278 }
2279
2280 let score = self.score_pair(mention, antecedent, distance, Some(text));
2281
2282 if self.config.use_non_coref_constraints && score < self.config.non_coref_threshold
2284 {
2285 non_coref_pairs.insert((j.min(i), j.max(i)));
2288 }
2289
2290 if score > self.config.link_threshold {
2291 scored_pairs.push(ScoredPair {
2292 mention_idx: i,
2293 antecedent_idx: j,
2294 score,
2295 });
2296 }
2297 }
2298 }
2299
2300 scored_pairs.sort_by(|a, b| {
2302 b.score
2303 .partial_cmp(&a.score)
2304 .unwrap_or(std::cmp::Ordering::Equal)
2305 });
2306
2307 let mut mention_to_cluster: HashMap<usize, usize> = HashMap::new();
2309 let mut clusters: Vec<Vec<usize>> = Vec::new();
2310 let mut processed: HashSet<usize> = HashSet::new();
2311
2312 for pair in scored_pairs {
2313 if processed.contains(&pair.mention_idx) {
2315 continue;
2316 }
2317
2318 let key = (
2320 pair.antecedent_idx.min(pair.mention_idx),
2321 pair.antecedent_idx.max(pair.mention_idx),
2322 );
2323 if self.config.use_non_coref_constraints && non_coref_pairs.contains(&key) {
2324 continue;
2325 }
2326
2327 let would_violate = if self.config.use_non_coref_constraints {
2329 self.would_violate_constraint(
2330 pair.mention_idx,
2331 pair.antecedent_idx,
2332 &mention_to_cluster,
2333 &clusters,
2334 &non_coref_pairs,
2335 )
2336 } else {
2337 false
2338 };
2339
2340 if would_violate {
2341 continue;
2342 }
2343
2344 processed.insert(pair.mention_idx);
2346
2347 if let Some(&cluster_id) = mention_to_cluster.get(&pair.antecedent_idx) {
2348 clusters[cluster_id].push(pair.mention_idx);
2349 mention_to_cluster.insert(pair.mention_idx, cluster_id);
2350 } else {
2351 let cluster_id = clusters.len();
2352 clusters.push(vec![pair.antecedent_idx, pair.mention_idx]);
2353 mention_to_cluster.insert(pair.antecedent_idx, cluster_id);
2354 mention_to_cluster.insert(pair.mention_idx, cluster_id);
2355 }
2356 }
2357
2358 let clusters = if self.config.enable_global_proper_coref {
2360 self.apply_global_proper_coref(mentions, clusters)
2361 } else {
2362 clusters
2363 };
2364
2365 clusters
2367 .into_iter()
2368 .enumerate()
2369 .map(|(id, indices)| MentionCluster {
2370 id,
2371 mentions: indices.into_iter().map(|i| mentions[i].clone()).collect(),
2372 })
2373 .collect()
2374 }
2375
2376 fn would_violate_constraint(
2378 &self,
2379 mention_idx: usize,
2380 antecedent_idx: usize,
2381 mention_to_cluster: &HashMap<usize, usize>,
2382 clusters: &[Vec<usize>],
2383 non_coref_pairs: &HashSet<(usize, usize)>,
2384 ) -> bool {
2385 let mut members = vec![mention_idx];
2387 if let Some(&cluster_id) = mention_to_cluster.get(&antecedent_idx) {
2388 members.extend(clusters[cluster_id].iter().copied());
2389 } else {
2390 members.push(antecedent_idx);
2391 }
2392
2393 for i in 0..members.len() {
2395 for j in (i + 1)..members.len() {
2396 let key = (members[i].min(members[j]), members[i].max(members[j]));
2397 if non_coref_pairs.contains(&key) {
2398 return true;
2399 }
2400 }
2401 }
2402
2403 false
2404 }
2405
2406 fn apply_global_proper_coref(
2411 &self,
2412 mentions: &[RankedMention],
2413 mut clusters: Vec<Vec<usize>>,
2414 ) -> Vec<Vec<usize>> {
2415 let mut proper_to_cluster: HashMap<String, usize> = HashMap::new();
2417 let mut cluster_to_propers: HashMap<usize, Vec<String>> = HashMap::new();
2418
2419 for (cluster_idx, cluster) in clusters.iter().enumerate() {
2420 for &mention_idx in cluster {
2421 let mention = &mentions[mention_idx];
2422 if mention.mention_type == MentionType::Proper {
2423 let normalized = mention.text.to_lowercase();
2424 proper_to_cluster.insert(normalized.clone(), cluster_idx);
2425 cluster_to_propers
2426 .entry(cluster_idx)
2427 .or_default()
2428 .push(normalized);
2429 }
2430 }
2431 }
2432
2433 let mut unclustered_propers: Vec<(usize, String)> = Vec::new();
2435 let mut mention_to_cluster: HashMap<usize, usize> = HashMap::new();
2436
2437 for (cluster_idx, cluster) in clusters.iter().enumerate() {
2438 for &mention_idx in cluster {
2439 mention_to_cluster.insert(mention_idx, cluster_idx);
2440 }
2441 }
2442
2443 for (i, mention) in mentions.iter().enumerate() {
2444 if mention.mention_type == MentionType::Proper && !mention_to_cluster.contains_key(&i) {
2445 unclustered_propers.push((i, mention.text.to_lowercase()));
2446 }
2447 }
2448
2449 for (mention_idx, normalized) in unclustered_propers {
2451 if let Some(&cluster_idx) = proper_to_cluster.get(&normalized) {
2452 clusters[cluster_idx].push(mention_idx);
2453 }
2454 }
2455
2456 let mut merged = vec![false; clusters.len()];
2459 let mut merge_map: HashMap<usize, usize> = HashMap::new();
2460
2461 for (idx, cluster) in clusters.iter().enumerate() {
2462 if merged[idx] {
2463 continue;
2464 }
2465
2466 let propers: Vec<_> = cluster
2467 .iter()
2468 .filter_map(|&i| {
2469 let m = &mentions[i];
2470 if m.mention_type == MentionType::Proper {
2471 Some(m.text.to_lowercase())
2472 } else {
2473 None
2474 }
2475 })
2476 .collect();
2477
2478 for (other_idx, other_cluster) in clusters.iter().enumerate() {
2480 if other_idx <= idx || merged[other_idx] {
2481 continue;
2482 }
2483
2484 let other_propers: Vec<_> = other_cluster
2485 .iter()
2486 .filter_map(|&i| {
2487 let m = &mentions[i];
2488 if m.mention_type == MentionType::Proper {
2489 Some(m.text.to_lowercase())
2490 } else {
2491 None
2492 }
2493 })
2494 .collect();
2495
2496 if propers.iter().any(|p| other_propers.contains(p)) {
2498 merged[other_idx] = true;
2499 merge_map.insert(other_idx, idx);
2500 }
2501 }
2502 }
2503
2504 if !merge_map.is_empty() {
2506 let mut final_clusters: Vec<Vec<usize>> = Vec::new();
2507 let mut old_to_new: HashMap<usize, usize> = HashMap::new();
2508
2509 for (old_idx, cluster) in clusters.into_iter().enumerate() {
2510 if merged[old_idx] {
2511 let mut target = merge_map[&old_idx];
2513 while let Some(&next) = merge_map.get(&target) {
2514 target = next;
2515 }
2516 if let Some(&new_idx) = old_to_new.get(&target) {
2517 final_clusters[new_idx].extend(cluster);
2518 }
2519 } else {
2520 let new_idx = final_clusters.len();
2521 old_to_new.insert(old_idx, new_idx);
2522 final_clusters.push(cluster);
2523 }
2524 }
2525
2526 final_clusters
2527 } else {
2528 clusters
2529 }
2530 }
2531
2532 fn score_pair(
2541 &self,
2542 mention: &RankedMention,
2543 antecedent: &RankedMention,
2544 distance: usize,
2545 text: Option<&str>,
2546 ) -> f64 {
2547 let mut score = 0.0;
2548
2549 if self.config.enable_context_filtering {
2554 if let Some(txt) = text {
2555 if self.should_filter_by_context(txt, mention, antecedent) {
2556 return -1.0; }
2558 }
2559 }
2560
2561 let m_lower = mention.text.to_lowercase();
2565 let a_lower = antecedent.text.to_lowercase();
2566
2567 if m_lower == a_lower {
2569 score += self.config.string_match_weight * 1.0;
2570 }
2571 else if mention.head.to_lowercase() == antecedent.head.to_lowercase() {
2573 score += self.config.string_match_weight * 0.6;
2574 }
2575 else if m_lower.contains(&a_lower) || a_lower.contains(&m_lower) {
2577 score += self.config.string_match_weight * 0.3;
2578 }
2579
2580 if self.config.enable_be_phrase_detection {
2585 if let Some(txt) = text {
2586 if self.is_be_phrase_link(txt, mention, antecedent) {
2587 score += self.config.be_phrase_weight;
2588 }
2589 }
2590 }
2591
2592 if self.config.enable_acronym_matching && self.is_acronym_match(mention, antecedent) {
2597 score += self.config.acronym_weight;
2598 }
2599
2600 if self.config.enable_synonym_matching && self.are_synonyms(mention, antecedent) {
2605 score += self.config.synonym_weight;
2606 }
2607
2608 match (mention.mention_type, antecedent.mention_type) {
2612 (MentionType::Pronominal, MentionType::Proper) => {
2613 score += self.config.type_compat_weight * 0.5;
2614 }
2615 (MentionType::Pronominal, MentionType::Pronominal) => {
2616 if mention.text.to_lowercase() == antecedent.text.to_lowercase() {
2618 score += self.config.type_compat_weight * 0.3;
2619 }
2620 }
2621 (MentionType::Proper, MentionType::Proper) => {
2622 score += self.config.type_compat_weight * 0.4;
2623 }
2624 _ => {}
2625 }
2626
2627 if let (Some(m_gender), Some(a_gender)) = (mention.gender, antecedent.gender) {
2631 if m_gender == a_gender {
2632 score += self.config.type_compat_weight * 0.3;
2633 } else if m_gender != Gender::Unknown && a_gender != Gender::Unknown {
2634 score -= self.config.type_compat_weight * 0.5; }
2636 }
2637
2638 if let (Some(m_number), Some(a_number)) = (mention.number, antecedent.number) {
2647 if m_number == a_number {
2648 score += self.config.type_compat_weight * 0.2;
2650 } else if m_number.is_compatible(&a_number) {
2651 score += self.config.type_compat_weight * 0.05;
2654 } else {
2655 score -= self.config.type_compat_weight * 0.4;
2657 }
2658 }
2659
2660 score -= self.config.distance_weight * (distance as f64).ln().max(0.0);
2664
2665 if self.config.salience_weight > 0.0 {
2669 let salience = self.get_salience(&antecedent.text);
2670 score += self.config.salience_weight * salience;
2671 }
2672
2673 score
2674 }
2675}
2676
2677impl Default for MentionRankingCoref {
2678 fn default() -> Self {
2679 Self::new()
2680 }
2681}
2682
2683impl MentionRankingCoref {
2688 pub fn resolve_to_grounded(
2717 &self,
2718 text: &str,
2719 ) -> Result<(
2720 Vec<anno_core::Signal<anno_core::Location>>,
2721 Vec<anno_core::Track>,
2722 )> {
2723 let clusters = self.resolve(text)?;
2724
2725 let mut all_signals = Vec::new();
2726 let mut all_tracks = Vec::new();
2727 let mut signal_id_offset = anno_core::SignalId::ZERO;
2728
2729 for cluster in clusters {
2730 let (track, signals) = cluster.to_track(signal_id_offset);
2731 signal_id_offset += signals.len() as u64;
2732 all_signals.extend(signals);
2733 all_tracks.push(track);
2734 }
2735
2736 Ok((all_signals, all_tracks))
2737 }
2738
2739 pub fn resolve_into_document(
2748 &self,
2749 text: &str,
2750 doc: &mut anno_core::GroundedDocument,
2751 ) -> Result<Vec<anno_core::TrackId>> {
2752 let (signals, tracks) = self.resolve_to_grounded(text)?;
2753 let mut track_ids = Vec::new();
2754
2755 for signal in signals {
2757 doc.signals.push(signal);
2758 }
2759
2760 for track in tracks {
2762 track_ids.push(track.id);
2763 doc.tracks.insert(track.id, track);
2764 }
2765
2766 Ok(track_ids)
2767 }
2768}
2769
2770use crate::Entity;
2775use anno_core::CoreferenceResolver;
2776
2777impl CoreferenceResolver for MentionRankingCoref {
2778 fn resolve(&self, entities: &[Entity]) -> Vec<Entity> {
2779 if entities.is_empty() {
2780 return vec![];
2781 }
2782
2783 let mut mentions: Vec<RankedMention> = entities
2785 .iter()
2786 .map(|e| {
2787 let mention_type = if e.text.chars().all(|c| c.is_lowercase()) {
2788 MentionType::Pronominal
2789 } else if e.text.chars().next().is_some_and(|c| c.is_uppercase()) {
2790 MentionType::Proper
2791 } else {
2792 MentionType::Nominal
2793 };
2794
2795 let gender = self.guess_gender(&e.text);
2796 let lower = e.text.to_lowercase();
2799 let number = if ["we", "us"].iter().any(|p| lower == *p) {
2800 Some(Number::Plural)
2801 } else if ["they", "them", "their", "you"].iter().any(|p| lower == *p) {
2802 Some(Number::Unknown) } else {
2804 Some(Number::Singular)
2805 };
2806
2807 RankedMention {
2808 start: e.start,
2809 end: e.end,
2810 text: e.text.clone(),
2811 mention_type,
2812 gender,
2813 number,
2814 head: self.get_head(&e.text),
2815 }
2816 })
2817 .collect();
2818
2819 mentions.sort_by_key(|m| (m.start, m.end));
2821
2822 for mention in &mut mentions {
2824 self.extract_features(mention);
2825 }
2826
2827 let clusters = self.link_mentions(&mentions, "");
2831
2832 let mut canonical_map: HashMap<(usize, usize), usize> = HashMap::new();
2834 for cluster in &clusters {
2835 for mention in &cluster.mentions {
2836 canonical_map.insert((mention.start, mention.end), cluster.id);
2837 }
2838 }
2839
2840 let max_cluster_id = clusters.iter().map(|c| c.id).max().unwrap_or(0);
2842 let mut next_singleton_id = max_cluster_id + 1;
2843
2844 entities
2846 .iter()
2847 .map(|e| {
2848 let mut entity = e.clone();
2849 if let Some(&cluster_id) = canonical_map.get(&(e.start, e.end)) {
2850 entity.canonical_id = Some(anno_core::CanonicalId::new(cluster_id as u64));
2851 } else {
2852 entity.canonical_id =
2854 Some(anno_core::CanonicalId::new(next_singleton_id as u64));
2855 next_singleton_id += 1;
2856 }
2857 entity
2858 })
2859 .collect()
2860 }
2861
2862 fn name(&self) -> &'static str {
2863 "MentionRankingCoref"
2864 }
2865}
2866
2867#[cfg(test)]
2868mod tests {
2869 use super::*;
2870
/// Smoke test: clusters for a simple two-sentence text are non-empty and
/// every mention span is well-formed.
#[test]
fn test_basic_resolution() {
    let resolver = MentionRankingCoref::new();
    let result = resolver
        .resolve("John saw Mary. He waved to her.")
        .unwrap();

    for cluster in result.iter() {
        assert!(!cluster.mentions.is_empty());
        assert!(cluster.mentions.iter().all(|m| m.start <= m.end));
    }
}
2884
/// Resolving the empty string succeeds and yields no clusters.
#[test]
fn test_empty_input() {
    let result = MentionRankingCoref::new().resolve("").unwrap();
    assert!(result.is_empty());
}
2891
/// Mention detection finds at least the two pronouns in "He saw her.".
#[test]
fn test_pronoun_detection() {
    let resolver = MentionRankingCoref::new();
    let detected = resolver.detect_mentions("He saw her.").unwrap();

    let pronoun_count = detected
        .iter()
        .filter(|m| m.mention_type == MentionType::Pronominal)
        .count();

    assert!(
        pronoun_count >= 2,
        "Should detect 'He' and 'her' as pronouns"
    );
}
2907
/// `guess_gender` recognizes listed first names and returns `None` otherwise.
#[test]
fn test_gender_inference() {
    let resolver = MentionRankingCoref::new();

    let expectations = [
        ("John", Some(Gender::Masculine)),
        ("Mary Smith", Some(Gender::Feminine)),
        ("Google", None),
    ];

    for (name, expected) in expectations {
        assert_eq!(resolver.guess_gender(name), expected);
    }
}
2916
/// A masculine pronoun scored against a masculine proper name gets a
/// positive score.
#[test]
fn test_pair_scoring() {
    let resolver = MentionRankingCoref::new();

    // Both mentions are masculine and singular; only span, text and type vary.
    let masculine = |start: usize, end: usize, surface: &str, kind: MentionType| RankedMention {
        start,
        end,
        text: surface.to_string(),
        mention_type: kind,
        gender: Some(Gender::Masculine),
        number: Some(Number::Singular),
        head: surface.to_string(),
    };

    let name = masculine(0, 4, "John", MentionType::Proper);
    let pronoun = masculine(10, 12, "He", MentionType::Pronominal);

    let score = resolver.score_pair(&pronoun, &name, 6, None);
    assert!(score > 0.0, "Pronoun with matching gender should link");
}
2944
/// A masculine pronoun scored against a feminine proper name stays below 0.5.
#[test]
fn test_gender_mismatch_penalty() {
    let resolver = MentionRankingCoref::new();

    let singular = |start: usize, end: usize, surface: &str, kind: MentionType, gender: Gender| {
        RankedMention {
            start,
            end,
            text: surface.to_string(),
            mention_type: kind,
            gender: Some(gender),
            number: Some(Number::Singular),
            head: surface.to_string(),
        }
    };

    let name = singular(0, 4, "Mary", MentionType::Proper, Gender::Feminine);
    let pronoun = singular(10, 12, "He", MentionType::Pronominal, Gender::Masculine);

    let score = resolver.score_pair(&pronoun, &name, 6, None);
    assert!(
        score < 0.5,
        "Gender mismatch should have low/negative score"
    );
}
2975
/// A custom `link_threshold` is kept by `with_config`.
#[test]
fn test_config() {
    let resolver = MentionRankingCoref::with_config(MentionRankingConfig {
        link_threshold: 0.5,
        ..Default::default()
    });

    assert_eq!(resolver.config.link_threshold, 0.5);
}
2986
/// With multi-byte text, mention spans stay ordered and within the text's
/// character count.
#[test]
fn test_unicode_offsets() {
    let text = "北京很美. He likes it.";
    let total_chars = text.chars().count();

    let result = MentionRankingCoref::new().resolve(text).unwrap();

    for mention in result.iter().flat_map(|cluster| cluster.mentions.iter()) {
        assert!(mention.start <= mention.end);
        assert!(mention.end <= total_chars);
    }
}
3002
/// Default antecedent limits: 30 for pronouns, 300 for every other type.
#[test]
fn test_type_specific_antecedent_limits() {
    let defaults = MentionRankingConfig::default();

    // Raw configuration fields.
    assert_eq!(defaults.pronoun_max_antecedents, 30);
    assert_eq!(defaults.proper_max_antecedents, 300);
    assert_eq!(defaults.nominal_max_antecedents, 300);

    // Per-type lookup.
    let expected_limits = [
        (MentionType::Pronominal, 30),
        (MentionType::Proper, 300),
        (MentionType::Nominal, 300),
        (MentionType::Zero, 300),
        (MentionType::Unknown, 300),
    ];
    for (kind, limit) in expected_limits {
        assert_eq!(defaults.max_antecedents_for_type(kind), limit);
    }
}
3023
/// The book-scale preset enables global proper coreference, easy-first
/// clustering with non-coreference constraints, and a distance above 100.
#[test]
fn test_book_scale_config() {
    let preset = MentionRankingConfig::book_scale();

    assert!(preset.enable_global_proper_coref);
    assert_eq!(preset.clustering_strategy, ClusteringStrategy::EasyFirst);
    assert!(preset.use_non_coref_constraints);

    assert!(preset.max_distance > 100);
}
3036
/// A custom pronoun antecedent limit is kept in the resolver's configuration.
#[test]
fn test_pronoun_antecedent_limit_enforced() {
    let resolver = MentionRankingCoref::with_config(MentionRankingConfig {
        pronoun_max_antecedents: 2,
        ..Default::default()
    });

    assert_eq!(resolver.config.pronoun_max_antecedents, 2);
}
3050
/// The default clustering strategy is left-to-right.
#[test]
fn test_clustering_strategy_default() {
    let strategy = MentionRankingConfig::default().clustering_strategy;
    assert_eq!(strategy, ClusteringStrategy::LeftToRight);
}
3060
/// Easy-first clustering runs on a short text and produces no empty cluster.
#[test]
fn test_easy_first_clustering() {
    let resolver = MentionRankingCoref::with_config(MentionRankingConfig {
        clustering_strategy: ClusteringStrategy::EasyFirst,
        ..Default::default()
    });

    let result = resolver.resolve("John went home. He was tired.").unwrap();

    assert!(result.iter().all(|cluster| !cluster.mentions.is_empty()));
}
3075
/// At least one of the two clustering strategies yields clusters for a text
/// with several pronouns.
#[test]
fn test_left_to_right_vs_easy_first_produces_clusters() {
    let text = "John met Mary. He greeted her warmly. She smiled at him.";

    // Resolve `text` with the default configuration except for the strategy.
    let resolve_with = |strategy: ClusteringStrategy| {
        MentionRankingCoref::with_config(MentionRankingConfig {
            clustering_strategy: strategy,
            ..Default::default()
        })
        .resolve(text)
        .unwrap()
    };

    let l2r_clusters = resolve_with(ClusteringStrategy::LeftToRight);
    let ef_clusters = resolve_with(ClusteringStrategy::EasyFirst);

    assert!(
        !l2r_clusters.is_empty() || !ef_clusters.is_empty(),
        "At least one strategy should produce clusters"
    );
}
3102
/// Global proper coreference settings can be overridden in a config literal.
#[test]
fn test_global_proper_coref_config() {
    let custom = MentionRankingConfig {
        enable_global_proper_coref: true,
        global_proper_threshold: 0.8,
        ..Default::default()
    };

    let threshold_delta = (custom.global_proper_threshold - 0.8).abs();

    assert!(custom.enable_global_proper_coref);
    assert!(threshold_delta < 0.001);
}
3118
/// With global proper coreference enabled, a text that repeats a name
/// resolves and yields well-formed spans.
#[test]
fn test_global_proper_coref_same_name() {
    let resolver = MentionRankingCoref::with_config(MentionRankingConfig {
        enable_global_proper_coref: true,
        ..Default::default()
    });

    let text = "John arrived. He was happy. Later John left.";
    let result = resolver.resolve(text).unwrap();

    for mention in result.iter().flat_map(|cluster| cluster.mentions.iter()) {
        assert!(mention.start <= mention.end);
    }
}
3142
/// Non-coreference constraint settings can be overridden in a config literal.
#[test]
fn test_non_coref_constraints_config() {
    let custom = MentionRankingConfig {
        use_non_coref_constraints: true,
        non_coref_threshold: 0.1,
        ..Default::default()
    };

    let threshold_delta = (custom.non_coref_threshold - 0.1).abs();

    assert!(custom.use_non_coref_constraints);
    assert!(threshold_delta < 0.001);
}
3158
/// Easy-first clustering with non-coreference constraints resolves a short
/// text and yields well-formed spans.
#[test]
fn test_easy_first_with_non_coref_constraints() {
    let resolver = MentionRankingCoref::with_config(MentionRankingConfig {
        clustering_strategy: ClusteringStrategy::EasyFirst,
        use_non_coref_constraints: true,
        ..Default::default()
    });

    let result = resolver
        .resolve("John and Mary went to the store.")
        .unwrap();

    for mention in result.iter().flat_map(|cluster| cluster.mentions.iter()) {
        assert!(mention.start <= mention.end);
    }
}
3178
/// The book-scale preset resolves a multi-sentence passage into non-empty
/// clusters whose spans lie within the text.
#[test]
fn test_full_book_scale_pipeline() {
    let resolver = MentionRankingCoref::with_config(MentionRankingConfig::book_scale());

    let text = "Elizabeth Bennett was a spirited young woman. She lived at Longbourn \
                with her family. Her mother, Mrs. Bennett, was determined to see her \
                daughters married well. Elizabeth often walked in the countryside. \
                She enjoyed the solitude it offered.";

    let result = resolver.resolve(text).unwrap();

    for cluster in result.iter() {
        assert!(!cluster.mentions.is_empty());
        for mention in cluster.mentions.iter() {
            assert!(mention.start <= mention.end);
            assert!(mention.end <= text.chars().count());
        }
    }
}
3205
/// Detection on a text with names and pronouns yields at least one mention
/// of each of those two types.
#[test]
fn test_mention_type_distribution() {
    let resolver = MentionRankingCoref::new();
    let detected = resolver
        .detect_mentions("Dr. Smith saw John. He examined him carefully.")
        .unwrap();

    // Number of detected mentions of a given type.
    let count_of = |kind: MentionType| {
        detected
            .iter()
            .filter(|m| m.mention_type == kind)
            .count()
    };

    let pronoun_count = count_of(MentionType::Pronominal);
    let proper_count = count_of(MentionType::Proper);

    assert!(pronoun_count > 0, "Should detect pronouns");
    assert!(proper_count > 0, "Should detect proper nouns");
}
3225
/// The default salience weight is zero.
#[test]
fn test_salience_config_default() {
    let weight = MentionRankingConfig::default().salience_weight;
    assert!((weight - 0.0).abs() < 0.001);
}
3236
/// `with_salience` stores an in-range weight and clamps 1.5 down to 1.0.
#[test]
fn test_salience_config_builder() {
    let in_range = MentionRankingConfig::default().with_salience(0.25);
    let clamped = MentionRankingConfig::default().with_salience(1.5);

    assert!((in_range.salience_weight - 0.25).abs() < 0.001);
    assert!((clamped.salience_weight - 1.0).abs() < 0.001);
}
3246
/// The book-scale preset uses a positive salience weight.
#[test]
fn test_salience_book_scale_enabled() {
    let weight = MentionRankingConfig::book_scale().salience_weight;
    assert!(weight > 0.0, "Book-scale should enable salience");
}
3255
/// Salience lookups are case-insensitive and unknown names score zero.
#[test]
fn test_with_salience_scores() {
    let scores = HashMap::from([
        ("john".to_string(), 0.8),
        // Stored with an uppercase initial on purpose.
        ("Mary".to_string(), 0.6),
    ]);
    let resolver = MentionRankingCoref::new().with_salience(scores);

    let expectations = [
        ("john", 0.8),
        ("John", 0.8),
        ("JOHN", 0.8),
        ("mary", 0.6),
        ("unknown", 0.0),
    ];
    for (name, expected) in expectations {
        assert!((resolver.get_salience(name) - expected).abs() < 0.001);
    }
}
3273
/// With salience enabled, a farther antecedent with a salience score is not
/// scored more than 0.1 below a nearer antecedent without one.
#[test]
fn test_salience_boosts_antecedent_score() {
    let mut scores = HashMap::new();
    scores.insert("john".to_string(), 1.0);
    scores.insert("mary".to_string(), 0.0);

    let resolver = MentionRankingCoref::with_config(MentionRankingConfig {
        salience_weight: 0.3,
        ..Default::default()
    })
    .with_salience(scores);

    // All three mentions are masculine and singular.
    let masculine = |start: usize, end: usize, surface: &str, kind: MentionType| RankedMention {
        start,
        end,
        text: surface.to_string(),
        mention_type: kind,
        gender: Some(Gender::Masculine),
        number: Some(Number::Singular),
        head: surface.to_string(),
    };

    let pronoun = masculine(20, 22, "He", MentionType::Pronominal);
    let john = masculine(0, 4, "John", MentionType::Proper);
    let bob = masculine(10, 13, "Bob", MentionType::Proper);

    let score_john = resolver.score_pair(&pronoun, &john, 16, None);
    let score_bob = resolver.score_pair(&pronoun, &bob, 7, None);

    assert!(
        score_john > score_bob - 0.1,
        "Salient antecedent should score higher: john={}, bob={}",
        score_john,
        score_bob
    );
}
3332
/// With a salience weight of zero, supplying salience scores does not change
/// the pair score.
#[test]
fn test_salience_no_effect_when_disabled() {
    let config = MentionRankingConfig {
        salience_weight: 0.0,
        ..Default::default()
    };

    let mut scores = HashMap::new();
    scores.insert("john".to_string(), 1.0);

    let with_scores = MentionRankingCoref::with_config(config.clone()).with_salience(scores);
    let without_scores = MentionRankingCoref::with_config(config);

    let masculine = |start: usize, end: usize, surface: &str, kind: MentionType| RankedMention {
        start,
        end,
        text: surface.to_string(),
        mention_type: kind,
        gender: Some(Gender::Masculine),
        number: Some(Number::Singular),
        head: surface.to_string(),
    };

    let pronoun = masculine(10, 12, "He", MentionType::Pronominal);
    let name = masculine(0, 4, "John", MentionType::Proper);

    let score_without = without_scores.score_pair(&pronoun, &name, 6, None);
    let score_with = with_scores.score_pair(&pronoun, &name, 6, None);

    assert!(
        (score_without - score_with).abs() < 0.001,
        "Salience should have no effect when weight=0"
    );
}
3378
/// End-to-end resolution with salience scores yields non-empty clusters
/// whose spans lie within the text.
#[test]
fn test_salience_resolution_integration() {
    let scores = HashMap::from([
        ("president".to_string(), 0.9),
        ("john".to_string(), 0.7),
        ("meeting".to_string(), 0.3),
    ]);

    let resolver = MentionRankingCoref::with_config(MentionRankingConfig {
        salience_weight: 0.2,
        ..Default::default()
    })
    .with_salience(scores);

    let text = "John met the President. He was nervous.";
    let result = resolver.resolve(text).unwrap();

    for cluster in result.iter() {
        assert!(!cluster.mentions.is_empty());
        for mention in cluster.mentions.iter() {
            assert!(mention.start <= mention.end);
            assert!(mention.end <= text.chars().count());
        }
    }
}
3406
3407 #[test]
3408 fn test_salience_with_multilingual_text() {
3409 let config = MentionRankingConfig {
3410 salience_weight: 0.2,
3411 ..Default::default()
3412 };
3413
3414 let mut scores = HashMap::new();
3415 scores.insert("北京".to_string(), 0.8);
3416 scores.insert("習近平".to_string(), 0.9);
3417
3418 let coref = MentionRankingCoref::with_config(config).with_salience(scores);
3419
3420 assert!((coref.get_salience("北京") - 0.8).abs() < 0.001);
3422 assert!((coref.get_salience("習近平") - 0.9).abs() < 0.001);
3423 }
3424
3425 #[test]
3430 fn test_mention_cluster_to_signals() {
3431 let cluster = MentionCluster {
3432 id: 0,
3433 mentions: vec![
3434 RankedMention {
3435 start: 0,
3436 end: 4,
3437 text: "John".to_string(),
3438 mention_type: MentionType::Proper,
3439 gender: Some(Gender::Masculine),
3440 number: Some(Number::Singular),
3441 head: "John".to_string(),
3442 },
3443 RankedMention {
3444 start: 15,
3445 end: 17,
3446 text: "He".to_string(),
3447 mention_type: MentionType::Pronominal,
3448 gender: Some(Gender::Masculine),
3449 number: Some(Number::Singular),
3450 head: "He".to_string(),
3451 },
3452 ],
3453 };
3454
3455 let signals = cluster.to_signals(anno_core::SignalId::new(100));
3456
3457 assert_eq!(signals.len(), 2);
3458 assert_eq!(signals[0].id, anno_core::SignalId::new(100));
3459 assert_eq!(signals[1].id, anno_core::SignalId::new(101));
3460 assert_eq!(signals[0].surface, "John");
3461 assert_eq!(signals[1].surface, "He");
3462
3463 if let anno_core::Location::Text { start, end } = &signals[0].location {
3465 assert_eq!(*start, 0);
3466 assert_eq!(*end, 4);
3467 } else {
3468 panic!("Expected Text location");
3469 }
3470 }
3471
3472 #[test]
3473 fn test_mention_cluster_to_track() {
3474 let cluster = MentionCluster {
3475 id: 42,
3476 mentions: vec![
3477 RankedMention {
3478 start: 0,
3479 end: 4,
3480 text: "John".to_string(),
3481 mention_type: MentionType::Proper,
3482 gender: Some(Gender::Masculine),
3483 number: Some(Number::Singular),
3484 head: "John".to_string(),
3485 },
3486 RankedMention {
3487 start: 15,
3488 end: 17,
3489 text: "He".to_string(),
3490 mention_type: MentionType::Pronominal,
3491 gender: Some(Gender::Masculine),
3492 number: Some(Number::Singular),
3493 head: "He".to_string(),
3494 },
3495 ],
3496 };
3497
3498 let (track, signals) = cluster.to_track(anno_core::SignalId::new(0));
3499
3500 assert_eq!(track.id, anno_core::TrackId::new(42));
3502 assert_eq!(track.canonical_surface, "John"); assert_eq!(track.signals.len(), 2);
3504
3505 assert_eq!(signals.len(), 2);
3507 assert_eq!(signals[0].surface, "John");
3508 assert_eq!(signals[1].surface, "He");
3509 }
3510
3511 #[test]
3512 fn test_canonical_mention_prefers_proper() {
3513 let cluster = MentionCluster {
3515 id: 0,
3516 mentions: vec![
3517 RankedMention {
3518 start: 0,
3519 end: 2,
3520 text: "He".to_string(),
3521 mention_type: MentionType::Pronominal,
3522 gender: Some(Gender::Masculine),
3523 number: Some(Number::Singular),
3524 head: "He".to_string(),
3525 },
3526 RankedMention {
3527 start: 10,
3528 end: 14,
3529 text: "John".to_string(),
3530 mention_type: MentionType::Proper,
3531 gender: Some(Gender::Masculine),
3532 number: Some(Number::Singular),
3533 head: "John".to_string(),
3534 },
3535 ],
3536 };
3537
3538 let canonical = cluster.canonical_mention().unwrap();
3540 assert_eq!(canonical.text, "John");
3541 }
3542
3543 #[test]
3544 fn test_resolve_to_grounded() {
3545 let coref = MentionRankingCoref::new();
3546 let (signals, tracks) = coref
3547 .resolve_to_grounded("John saw Mary. He waved.")
3548 .unwrap();
3549
3550 assert!(!signals.is_empty());
3552
3553 for signal in &signals {
3555 if let anno_core::Location::Text { start, end } = &signal.location {
3556 assert!(start <= end);
3557 } else {
3558 panic!("Expected Text location");
3559 }
3560 }
3561
3562 for track in &tracks {
3564 assert!(!track.signals.is_empty());
3565 assert!(!track.canonical_surface.is_empty());
3566 }
3567 }
3568
3569 #[test]
3570 fn test_resolve_into_document() {
3571 let coref = MentionRankingCoref::new();
3572 let text = "John saw Mary. He waved to her.";
3573 let mut doc = anno_core::GroundedDocument::new("test_doc", text);
3574
3575 let track_ids = coref.resolve_into_document(text, &mut doc).unwrap();
3576
3577 assert!(!doc.signals.is_empty());
3579 assert!(!doc.tracks.is_empty());
3580
3581 for track_id in &track_ids {
3583 assert!(doc.tracks.contains_key(track_id));
3584 }
3585 }
3586
3587 #[test]
3588 fn test_ranked_mention_to_signal() {
3589 let mention = RankedMention {
3590 start: 10,
3591 end: 20,
3592 text: "the company".to_string(),
3593 mention_type: MentionType::Nominal,
3594 gender: None,
3595 number: Some(Number::Singular),
3596 head: "company".to_string(),
3597 };
3598
3599 let signal = mention.to_signal(anno_core::SignalId::new(999));
3600
3601 assert_eq!(signal.id, anno_core::SignalId::new(999));
3602 assert_eq!(signal.surface, "the company");
3603 assert_eq!(signal.label, "nominal".into());
3604 assert_eq!(signal.modality, anno_core::Modality::Symbolic);
3605
3606 if let anno_core::Location::Text { start, end } = signal.location {
3607 assert_eq!(start, 10);
3608 assert_eq!(end, 20);
3609 } else {
3610 panic!("Expected Text location");
3611 }
3612 }
3613
3614 #[test]
3615 fn test_grounded_integration_unicode() {
3616 let coref = MentionRankingCoref::new();
3617 let text = "習近平在北京。他很忙。"; let (signals, _tracks) = coref.resolve_to_grounded(text).unwrap();
3620 let char_count = text.chars().count();
3621
3622 for signal in &signals {
3624 if let anno_core::Location::Text { start, end } = &signal.location {
3625 assert!(*start <= *end);
3626 assert!(
3627 *end <= char_count,
3628 "Signal end {} exceeds char count {}",
3629 end,
3630 char_count
3631 );
3632 }
3633 }
3634 }
3635
3636 #[test]
3641 fn test_be_phrase_detection() {
3642 let config = MentionRankingConfig::clinical();
3643 let coref = MentionRankingCoref::with_config(config);
3644
3645 let text = "The patient is John Smith. He was seen by Dr. Jones.";
3646
3647 let m1 = RankedMention {
3649 start: 4,
3650 end: 11,
3651 text: "patient".to_string(),
3652 mention_type: MentionType::Nominal,
3653 gender: None,
3654 number: Some(Number::Singular),
3655 head: "patient".to_string(),
3656 };
3657
3658 let m2 = RankedMention {
3659 start: 15,
3660 end: 25,
3661 text: "John Smith".to_string(),
3662 mention_type: MentionType::Proper,
3663 gender: Some(Gender::Masculine),
3664 number: Some(Number::Singular),
3665 head: "Smith".to_string(),
3666 };
3667
3668 assert!(
3670 coref.is_be_phrase_link(text, &m1, &m2),
3671 "Should detect 'is' between patient and John Smith"
3672 );
3673
3674 let score = coref.score_pair(&m1, &m2, 4, Some(text));
3676 assert!(score > 0.5, "Be-phrase should boost score: got {}", score);
3677 }
3678
3679 #[test]
3680 fn test_be_phrase_detection_negative() {
3681 let coref = MentionRankingCoref::new();
3682
3683 let text = "John saw Mary at the store.";
3684
3685 let m1 = RankedMention {
3686 start: 0,
3687 end: 4,
3688 text: "John".to_string(),
3689 mention_type: MentionType::Proper,
3690 gender: Some(Gender::Masculine),
3691 number: Some(Number::Singular),
3692 head: "John".to_string(),
3693 };
3694
3695 let m2 = RankedMention {
3696 start: 9,
3697 end: 13,
3698 text: "Mary".to_string(),
3699 mention_type: MentionType::Proper,
3700 gender: Some(Gender::Feminine),
3701 number: Some(Number::Singular),
3702 head: "Mary".to_string(),
3703 };
3704
3705 assert!(
3707 !coref.is_be_phrase_link(text, &m1, &m2),
3708 "Should not detect be-phrase between John and Mary"
3709 );
3710 }
3711
3712 #[test]
3713 fn test_acronym_matching() {
3714 let coref = MentionRankingCoref::new();
3715
3716 let mrsa = RankedMention {
3717 start: 0,
3718 end: 4,
3719 text: "MRSA".to_string(),
3720 mention_type: MentionType::Proper,
3721 gender: None,
3722 number: Some(Number::Singular),
3723 head: "MRSA".to_string(),
3724 };
3725
3726 let full = RankedMention {
3727 start: 20,
3728 end: 65,
3729 text: "Methicillin-resistant Staphylococcus aureus".to_string(),
3730 mention_type: MentionType::Proper,
3731 gender: None,
3732 number: Some(Number::Singular),
3733 head: "aureus".to_string(),
3734 };
3735
3736 assert!(
3737 coref.is_acronym_match(&mrsa, &full),
3738 "MRSA should match Methicillin-resistant Staphylococcus aureus"
3739 );
3740 }
3741
3742 #[test]
3743 fn test_acronym_matching_who() {
3744 let coref = MentionRankingCoref::new();
3745
3746 let who = RankedMention {
3747 start: 0,
3748 end: 3,
3749 text: "WHO".to_string(),
3750 mention_type: MentionType::Proper,
3751 gender: None,
3752 number: Some(Number::Singular),
3753 head: "WHO".to_string(),
3754 };
3755
3756 let full = RankedMention {
3757 start: 10,
3758 end: 35,
3759 text: "World Health Organization".to_string(),
3760 mention_type: MentionType::Proper,
3761 gender: None,
3762 number: Some(Number::Singular),
3763 head: "Organization".to_string(),
3764 };
3765
3766 assert!(
3767 coref.is_acronym_match(&who, &full),
3768 "WHO should match World Health Organization"
3769 );
3770 }
3771
3772 #[test]
3773 fn test_acronym_matching_negative() {
3774 let coref = MentionRankingCoref::new();
3775
3776 let ibm = RankedMention {
3777 start: 0,
3778 end: 3,
3779 text: "IBM".to_string(),
3780 mention_type: MentionType::Proper,
3781 gender: None,
3782 number: Some(Number::Singular),
3783 head: "IBM".to_string(),
3784 };
3785
3786 let apple = RankedMention {
3787 start: 10,
3788 end: 25,
3789 text: "Apple Inc".to_string(),
3790 mention_type: MentionType::Proper,
3791 gender: None,
3792 number: Some(Number::Singular),
3793 head: "Apple".to_string(),
3794 };
3795
3796 assert!(
3797 !coref.is_acronym_match(&ibm, &apple),
3798 "IBM should not match Apple Inc"
3799 );
3800 }
3801
3802 #[test]
3803 fn test_context_filtering_different_dates() {
3804 let config = MentionRankingConfig::clinical();
3805 let coref = MentionRankingCoref::with_config(config);
3806
3807 let text = "On 2024-01-15 the patient presented. On 2024-02-20 the patient returned.";
3809
3810 let m1 = RankedMention {
3811 start: 17,
3812 end: 24,
3813 text: "patient".to_string(),
3814 mention_type: MentionType::Nominal,
3815 gender: None,
3816 number: Some(Number::Singular),
3817 head: "patient".to_string(),
3818 };
3819
3820 let m2 = RankedMention {
3821 start: 50,
3822 end: 57,
3823 text: "patient".to_string(),
3824 mention_type: MentionType::Nominal,
3825 gender: None,
3826 number: Some(Number::Singular),
3827 head: "patient".to_string(),
3828 };
3829
3830 assert!(
3832 coref.should_filter_by_context(text, &m1, &m2),
3833 "Should filter link between patients with different dates"
3834 );
3835 }
3836
3837 #[test]
3838 fn test_context_filtering_negation() {
3839 let config = MentionRankingConfig::clinical();
3840 let coref = MentionRankingCoref::with_config(config);
3841
3842 let text = "Patient is not a diabetic. This is important. The diabetic protocol was used.";
3845 let m1 = RankedMention {
3850 start: 17,
3851 end: 25,
3852 text: "diabetic".to_string(),
3853 mention_type: MentionType::Nominal,
3854 gender: None,
3855 number: Some(Number::Singular),
3856 head: "diabetic".to_string(),
3857 };
3858
3859 let m2 = RankedMention {
3861 start: 50,
3862 end: 58,
3863 text: "diabetic".to_string(),
3864 mention_type: MentionType::Nominal,
3865 gender: None,
3866 number: Some(Number::Singular),
3867 head: "diabetic".to_string(),
3868 };
3869
3870 let text_chars: Vec<char> = text.chars().collect();
3872 let m1_context: String = text_chars
3873 [m1.start.saturating_sub(20)..m1.end.min(text_chars.len())]
3874 .iter()
3875 .collect();
3876 let m2_context: String = text_chars
3877 [m2.start.saturating_sub(20)..m2.end.min(text_chars.len())]
3878 .iter()
3879 .collect();
3880 eprintln!("m1 context: '{}'", m1_context);
3881 eprintln!("m2 context: '{}'", m2_context);
3882
3883 assert!(
3885 m1_context.contains("not"),
3886 "m1 context should contain 'not'"
3887 );
3888 assert!(
3889 !m2_context.contains("not"),
3890 "m2 context should not contain 'not'"
3891 );
3892
3893 assert!(
3895 coref.should_filter_by_context(text, &m1, &m2),
3896 "Should filter link between negated ('{}') and non-negated ('{}') mentions",
3897 m1_context,
3898 m2_context
3899 );
3900 }
3901
3902 #[test]
3903 fn test_synonym_matching_high_similarity() {
3904 let coref = MentionRankingCoref::new();
3907
3908 let obama = RankedMention {
3909 start: 0,
3910 end: 5,
3911 text: "Obama".to_string(),
3912 mention_type: MentionType::Proper,
3913 gender: None,
3914 number: Some(Number::Singular),
3915 head: "Obama".to_string(),
3916 };
3917
3918 let obama_lower = RankedMention {
3919 start: 10,
3920 end: 15,
3921 text: "obama".to_string(),
3922 mention_type: MentionType::Proper,
3923 gender: None,
3924 number: Some(Number::Singular),
3925 head: "obama".to_string(),
3926 };
3927
3928 assert!(
3930 coref.are_synonyms(&obama, &obama_lower),
3931 "Obama and obama should match (case-insensitive)"
3932 );
3933 }
3934
3935 #[test]
3936 fn test_synonym_matching_low_similarity_no_match() {
3937 let coref = MentionRankingCoref::new();
3941
3942 let heart = RankedMention {
3943 start: 0,
3944 end: 5,
3945 text: "heart".to_string(),
3946 mention_type: MentionType::Nominal,
3947 gender: None,
3948 number: Some(Number::Singular),
3949 head: "heart".to_string(),
3950 };
3951
3952 let cardiac = RankedMention {
3953 start: 10,
3954 end: 17,
3955 text: "cardiac".to_string(),
3956 mention_type: MentionType::Nominal,
3957 gender: None,
3958 number: Some(Number::Singular),
3959 head: "cardiac".to_string(),
3960 };
3961
3962 assert!(
3967 !coref.are_synonyms(&heart, &cardiac),
3968 "heart/cardiac require domain-specific SynonymSource"
3969 );
3970 }
3971
3972 #[test]
3973 fn test_clinical_config() {
3974 let config = MentionRankingConfig::clinical();
3975
3976 assert!(config.enable_be_phrase_detection);
3978 assert!(config.enable_acronym_matching);
3979 assert!(config.enable_context_filtering);
3980 assert!(config.enable_synonym_matching);
3981
3982 assert!(config.be_phrase_weight > 0.5);
3984 assert!(config.acronym_weight > 0.5);
3985 assert!(config.synonym_weight > 0.3);
3986 }
3987
3988 #[test]
3989 fn test_clinical_resolution_integration() {
3990 let config = MentionRankingConfig::clinical();
3991 let coref = MentionRankingCoref::with_config(config);
3992
3993 let text = "The patient is John Smith. Pt was admitted with MRSA. \
3995 Methicillin-resistant Staphylococcus aureus was treated.";
3996
3997 let clusters = coref.resolve(text).unwrap();
3998
3999 assert!(
4001 !clusters.is_empty(),
4002 "Should find clusters in clinical text"
4003 );
4004
4005 for cluster in &clusters {
4007 let texts: Vec<_> = cluster.mentions.iter().map(|m| &m.text).collect();
4008 eprintln!("Cluster {}: {:?}", cluster.id, texts);
4009 }
4010 }
4011
4012 #[test]
4013 fn test_i2b2_scoring_with_all_features() {
4014 let config = MentionRankingConfig::clinical();
4015 let coref = MentionRankingCoref::with_config(config);
4016
4017 let text = "Resolution of organism is MRSA.";
4019
4020 let m1 = RankedMention {
4021 start: 14,
4022 end: 22,
4023 text: "organism".to_string(),
4024 mention_type: MentionType::Nominal,
4025 gender: None,
4026 number: Some(Number::Singular),
4027 head: "organism".to_string(),
4028 };
4029
4030 let m2 = RankedMention {
4031 start: 26,
4032 end: 30,
4033 text: "MRSA".to_string(),
4034 mention_type: MentionType::Proper,
4035 gender: None,
4036 number: Some(Number::Singular),
4037 head: "MRSA".to_string(),
4038 };
4039
4040 let score = coref.score_pair(&m1, &m2, 4, Some(text));
4042 assert!(
4043 score > 0.7,
4044 "Be-phrase pattern should yield high score, got {}",
4045 score
4046 );
4047 }
4048
4049 #[test]
4054 fn test_nominal_adjective_detection_basic() {
4055 let config = MentionRankingConfig {
4056 enable_nominal_adjective_detection: true,
4057 ..Default::default()
4058 };
4059 let coref = MentionRankingCoref::with_config(config);
4060
4061 let text = "The poor are struggling while the rich get richer.";
4062 let mentions = coref.detect_mentions(text).unwrap();
4063
4064 let texts: Vec<_> = mentions.iter().map(|m| m.text.as_str()).collect();
4065 assert!(
4066 texts.contains(&"The poor"),
4067 "Should detect 'The poor': {:?}",
4068 texts
4069 );
4070 assert!(
4071 texts.contains(&"the rich"),
4072 "Should detect 'the rich': {:?}",
4073 texts
4074 );
4075
4076 let poor_mention = mentions
4078 .iter()
4079 .find(|m| m.text.to_lowercase() == "the poor");
4080 assert!(poor_mention.is_some());
4081 assert_eq!(poor_mention.unwrap().number, Some(Number::Plural));
4082 assert_eq!(poor_mention.unwrap().mention_type, MentionType::Nominal);
4083 }
4084
4085 #[test]
4086 fn test_nominal_adjective_not_before_noun() {
4087 let config = MentionRankingConfig {
4090 enable_nominal_adjective_detection: true,
4091 ..Default::default()
4092 };
4093 let coref = MentionRankingCoref::with_config(config);
4094
4095 let text = "The poor performance was criticized.";
4096 let mentions = coref.detect_mentions(text).unwrap();
4097
4098 let texts: Vec<_> = mentions.iter().map(|m| m.text.as_str()).collect();
4099 assert!(
4100 !texts.contains(&"The poor"),
4101 "Should NOT detect 'The poor' when followed by noun: {:?}",
4102 texts
4103 );
4104 }
4105
4106 #[test]
4107 fn test_nominal_adjective_at_sentence_end() {
4108 let config = MentionRankingConfig {
4109 enable_nominal_adjective_detection: true,
4110 ..Default::default()
4111 };
4112 let coref = MentionRankingCoref::with_config(config);
4113
4114 let text = "We must help the elderly.";
4115 let mentions = coref.detect_mentions(text).unwrap();
4116
4117 let texts: Vec<_> = mentions.iter().map(|m| m.text.as_str()).collect();
4118 assert!(
4119 texts.contains(&"the elderly"),
4120 "Should detect 'the elderly' at end: {:?}",
4121 texts
4122 );
4123 }
4124
4125 #[test]
4126 fn test_nominal_adjective_with_punctuation() {
4127 let config = MentionRankingConfig {
4128 enable_nominal_adjective_detection: true,
4129 ..Default::default()
4130 };
4131 let coref = MentionRankingCoref::with_config(config);
4132
4133 let text = "The accused, the condemned, and the guilty were present.";
4134 let mentions = coref.detect_mentions(text).unwrap();
4135
4136 let texts: Vec<_> = mentions.iter().map(|m| m.text.as_str()).collect();
4137 assert!(
4138 texts.contains(&"The accused"),
4139 "Should detect 'The accused': {:?}",
4140 texts
4141 );
4142 assert!(
4143 texts.contains(&"the condemned"),
4144 "Should detect 'the condemned': {:?}",
4145 texts
4146 );
4147 assert!(
4148 texts.contains(&"the guilty"),
4149 "Should detect 'the guilty': {:?}",
4150 texts
4151 );
4152 }
4153
4154 #[test]
4155 fn test_nominal_adjective_these_those() {
4156 let config = MentionRankingConfig {
4157 enable_nominal_adjective_detection: true,
4158 ..Default::default()
4159 };
4160 let coref = MentionRankingCoref::with_config(config);
4161
4162 let text = "These homeless need shelter. Those unemployed seek work.";
4163 let mentions = coref.detect_mentions(text).unwrap();
4164
4165 let texts: Vec<_> = mentions.iter().map(|m| m.text.as_str()).collect();
4166 assert!(
4167 texts.contains(&"These homeless"),
4168 "Should detect 'These homeless': {:?}",
4169 texts
4170 );
4171 assert!(
4172 texts.contains(&"Those unemployed"),
4173 "Should detect 'Those unemployed': {:?}",
4174 texts
4175 );
4176 }
4177
4178 #[test]
4179 fn test_nominal_adjective_disabled_by_default() {
4180 let coref = MentionRankingCoref::new();
4181
4182 let text = "The poor are struggling.";
4183 let mentions = coref.detect_mentions(text).unwrap();
4184
4185 let has_the_poor = mentions.iter().any(|m| m.text.to_lowercase() == "the poor");
4187 assert!(
4188 !has_the_poor,
4189 "Nominal adjective detection should be disabled by default"
4190 );
4191 }
4192
4193 #[test]
4198 fn test_singular_they_number_unknown() {
4199 let coref = MentionRankingCoref::new();
4200
4201 let text = "Alex said they would come. They brought their friends.";
4203 let mentions = coref.detect_mentions(text).unwrap();
4204
4205 let they_mentions: Vec<_> = mentions
4207 .iter()
4208 .filter(|m| m.text.to_lowercase() == "they")
4209 .collect();
4210
4211 for they in &they_mentions {
4212 assert_eq!(
4213 they.number,
4214 Some(Number::Unknown),
4215 "'they' should have Number::Unknown for singular/plural ambiguity"
4216 );
4217 }
4218 }
4219
4220 #[test]
4221 fn test_their_number_unknown() {
4222 let coref = MentionRankingCoref::new();
4223
4224 let text = "Someone left their umbrella.";
4225 let mentions = coref.detect_mentions(text).unwrap();
4226
4227 let their = mentions.iter().find(|m| m.text.to_lowercase() == "their");
4228 assert!(their.is_some(), "Should detect 'their'");
4229 assert_eq!(
4230 their.unwrap().number,
4231 Some(Number::Unknown),
4232 "'their' should have Number::Unknown"
4233 );
4234 }
4235
4236 #[test]
4237 fn test_themself_vs_themselves() {
4238 let coref = MentionRankingCoref::new();
4241
4242 let text = "The student prepared themself. The students prepared themselves.";
4243 let mentions = coref.detect_mentions(text).unwrap();
4244
4245 let themself = mentions
4246 .iter()
4247 .find(|m| m.text.to_lowercase() == "themself");
4248 let themselves = mentions
4249 .iter()
4250 .find(|m| m.text.to_lowercase() == "themselves");
4251
4252 assert!(themself.is_some(), "Should detect 'themself'");
4253 assert!(themselves.is_some(), "Should detect 'themselves'");
4254
4255 assert_eq!(
4256 themself.unwrap().number,
4257 Some(Number::Singular),
4258 "'themself' is explicitly singular"
4259 );
4260 assert_eq!(
4261 themselves.unwrap().number,
4262 Some(Number::Plural),
4263 "'themselves' is explicitly plural"
4264 );
4265 }
4266
4267 #[test]
4272 fn test_neopronoun_ze_hir() {
4273 let coref = MentionRankingCoref::new();
4274
4275 let text = "Ze told me to text hir, but I don't have hirs number.";
4276 let mentions = coref.detect_mentions(text).unwrap();
4277
4278 let ze = mentions.iter().find(|m| m.text.to_lowercase() == "ze");
4279 let hir = mentions.iter().find(|m| m.text.to_lowercase() == "hir");
4280 let hirs = mentions.iter().find(|m| m.text.to_lowercase() == "hirs");
4281
4282 assert!(ze.is_some(), "Should detect 'ze'");
4283 assert!(hir.is_some(), "Should detect 'hir'");
4284 assert!(hirs.is_some(), "Should detect 'hirs'");
4285
4286 assert_eq!(ze.unwrap().number, Some(Number::Singular));
4288 assert_eq!(hir.unwrap().number, Some(Number::Singular));
4289 assert_eq!(hirs.unwrap().number, Some(Number::Singular));
4290
4291 assert_eq!(ze.unwrap().gender, Some(Gender::Unknown));
4293 }
4294
4295 #[test]
4296 fn test_neopronoun_xe_xem() {
4297 let coref = MentionRankingCoref::new();
4298
4299 let text = "Xe said xem would bring xyr notes.";
4300 let mentions = coref.detect_mentions(text).unwrap();
4301
4302 let xe = mentions.iter().find(|m| m.text.to_lowercase() == "xe");
4303 let xem = mentions.iter().find(|m| m.text.to_lowercase() == "xem");
4304 let xyr = mentions.iter().find(|m| m.text.to_lowercase() == "xyr");
4305
4306 assert!(xe.is_some(), "Should detect 'xe'");
4307 assert!(xem.is_some(), "Should detect 'xem'");
4308 assert!(xyr.is_some(), "Should detect 'xyr'");
4309
4310 assert_eq!(xe.unwrap().number, Some(Number::Singular));
4311 assert_eq!(xe.unwrap().gender, Some(Gender::Unknown));
4312 }
4313
4314 #[test]
4315 fn test_neopronoun_spivak_ey_em() {
4316 let coref = MentionRankingCoref::new();
4317
4318 let text = "Ey told me to call em later.";
4319 let mentions = coref.detect_mentions(text).unwrap();
4320
4321 let ey = mentions.iter().find(|m| m.text.to_lowercase() == "ey");
4322 let em = mentions.iter().find(|m| m.text.to_lowercase() == "em");
4323
4324 assert!(ey.is_some(), "Should detect 'ey' (Spivak pronoun)");
4325 assert!(em.is_some(), "Should detect 'em' (Spivak pronoun)");
4326
4327 assert_eq!(ey.unwrap().number, Some(Number::Singular));
4328 }
4329
4330 #[test]
4331 fn test_neopronoun_fae_faer() {
4332 let coref = MentionRankingCoref::new();
4333
4334 let text = "Fae said faer class was cancelled.";
4335 let mentions = coref.detect_mentions(text).unwrap();
4336
4337 let fae = mentions.iter().find(|m| m.text.to_lowercase() == "fae");
4338 let faer = mentions.iter().find(|m| m.text.to_lowercase() == "faer");
4339
4340 assert!(fae.is_some(), "Should detect 'fae'");
4341 assert!(faer.is_some(), "Should detect 'faer'");
4342
4343 assert_eq!(fae.unwrap().number, Some(Number::Singular));
4344 }
4345
4346 #[test]
4351 fn test_ranked_mention_from_entity() {
4352 let entity = crate::Entity::new("Barack Obama", crate::EntityType::Person, 0, 12, 0.95);
4353 let mention = RankedMention::from(&entity);
4354
4355 assert_eq!(mention.start, 0);
4356 assert_eq!(mention.end, 12);
4357 assert_eq!(mention.text, "Barack Obama");
4358 assert_eq!(mention.head, "Obama"); assert_eq!(mention.mention_type, MentionType::Proper);
4360 }
4361
4362 #[test]
4363 fn test_ranked_mention_to_coref_mention() {
4364 let mention = RankedMention {
4365 start: 10,
4366 end: 20,
4367 text: "the patient".to_string(),
4368 mention_type: MentionType::Nominal,
4369 gender: Some(Gender::Unknown),
4370 number: Some(Number::Singular),
4371 head: "patient".to_string(),
4372 };
4373
4374 let coref_mention: anno_core::Mention = (&mention).into();
4375
4376 assert_eq!(coref_mention.start, 10);
4377 assert_eq!(coref_mention.end, 20);
4378 assert_eq!(coref_mention.text, "the patient");
4379 assert_eq!(coref_mention.mention_type, Some(MentionType::Nominal));
4380 }
4381
4382 #[test]
4383 fn test_ranked_mention_span() {
4384 let mention = RankedMention {
4385 start: 5,
4386 end: 15,
4387 text: "test".to_string(),
4388 mention_type: MentionType::Nominal,
4389 gender: None,
4390 number: None,
4391 head: "test".to_string(),
4392 };
4393
4394 assert_eq!(mention.span(), (5, 15));
4395 }
4396
4397 #[test]
4402 fn test_nominal_adjective_pronoun_resolution() {
4403 let config = MentionRankingConfig {
4406 enable_nominal_adjective_detection: true,
4407 link_threshold: 0.1, ..Default::default()
4409 };
4410 let coref = MentionRankingCoref::with_config(config);
4411
4412 let text = "We must help the poor. They deserve better.";
4414
4415 let detected = coref.detect_mentions(text).unwrap();
4417 let detected_texts: Vec<_> = detected.iter().map(|m| m.text.as_str()).collect();
4418
4419 assert!(
4420 detected.iter().any(|m| m.text.to_lowercase() == "the poor"),
4421 "Should detect 'the poor' in detect_mentions: {:?}",
4422 detected_texts
4423 );
4424 assert!(
4425 detected.iter().any(|m| m.text.to_lowercase() == "they"),
4426 "Should detect 'They' in detect_mentions: {:?}",
4427 detected_texts
4428 );
4429
4430 let the_poor = detected
4432 .iter()
4433 .find(|m| m.text.to_lowercase() == "the poor")
4434 .unwrap();
4435 let they = detected
4436 .iter()
4437 .find(|m| m.text.to_lowercase() == "they")
4438 .unwrap();
4439
4440 let distance = they.start.saturating_sub(the_poor.end);
4441 let score = coref.score_pair(they, the_poor, distance, Some(text));
4442
4443 assert!(
4446 score > -0.5,
4447 "Score between 'They' and 'the poor' should not be strongly negative, got {}",
4448 score
4449 );
4450
4451 }
4456
4457 #[test]
4462 fn test_neopronoun_xe_detection() {
4463 let coref = MentionRankingCoref::new();
4464 let text = "Alex introduced xemself. Xe said xe was happy to be here.";
4465 let mentions = coref.detect_mentions(text).unwrap();
4466
4467 let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
4468 assert!(
4469 texts.contains(&"xemself".to_string()),
4470 "Should detect 'xemself': {:?}",
4471 texts
4472 );
4473 assert!(
4474 texts.contains(&"xe".to_string()),
4475 "Should detect 'xe': {:?}",
4476 texts
4477 );
4478 }
4479
4480 #[test]
4481 fn test_neopronoun_ze_detection() {
4482 let coref = MentionRankingCoref::new();
4483 let text = "Jordan uses ze/hir pronouns. Hir presentation was excellent.";
4484 let mentions = coref.detect_mentions(text).unwrap();
4485
4486 let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
4487 assert!(
4488 texts.contains(&"ze".to_string()),
4489 "Should detect 'ze': {:?}",
4490 texts
4491 );
4492 assert!(
4493 texts.contains(&"hir".to_string()),
4494 "Should detect 'hir': {:?}",
4495 texts
4496 );
4497 }
4498
4499 #[test]
4500 fn test_neopronoun_ey_detection() {
4501 let coref = MentionRankingCoref::new();
4502 let text = "Sam asked em to pass eir notebook.";
4503 let mentions = coref.detect_mentions(text).unwrap();
4504
4505 let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
4506 assert!(
4507 texts.contains(&"em".to_string()),
4508 "Should detect 'em': {:?}",
4509 texts
4510 );
4511 assert!(
4512 texts.contains(&"eir".to_string()),
4513 "Should detect 'eir': {:?}",
4514 texts
4515 );
4516 }
4517
4518 #[test]
4519 fn test_neopronoun_fae_detection() {
4520 let coref = MentionRankingCoref::new();
4521 let text = "River explained faer perspective. Fae was very articulate.";
4522 let mentions = coref.detect_mentions(text).unwrap();
4523
4524 let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
4525 assert!(
4526 texts.contains(&"faer".to_string()),
4527 "Should detect 'faer': {:?}",
4528 texts
4529 );
4530 assert!(
4531 texts.contains(&"fae".to_string()),
4532 "Should detect 'fae': {:?}",
4533 texts
4534 );
4535 }
4536
4537 #[test]
4538 fn test_neopronoun_gender_and_number() {
4539 let coref = MentionRankingCoref::new();
4540 let text = "Xe arrived early.";
4541 let mentions = coref.detect_mentions(text).unwrap();
4542
4543 let xe_mention = mentions.iter().find(|m| m.text.to_lowercase() == "xe");
4544 assert!(xe_mention.is_some(), "Should detect 'xe'");
4545
4546 let xe = xe_mention.unwrap();
4547 assert_eq!(
4549 xe.number,
4550 Some(Number::Singular),
4551 "Neopronouns are singular"
4552 );
4553 assert_eq!(
4554 xe.gender,
4555 Some(Gender::Unknown),
4556 "Neopronouns use Unknown gender"
4557 );
4558 }
4559
4560 #[test]
4561 fn test_neopronoun_coreference_linking() {
4562 let coref = MentionRankingCoref::new();
4566 let text = "Xe said xe would be late. Xem was right.";
4567 let mentions = coref.detect_mentions(text).unwrap();
4568
4569 let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
4571 assert!(
4572 texts.iter().filter(|t| *t == "xe").count() >= 2,
4573 "Should detect multiple 'xe': {:?}",
4574 texts
4575 );
4576 assert!(
4577 texts.contains(&"xem".to_string()),
4578 "Should detect 'xem': {:?}",
4579 texts
4580 );
4581
4582 for m in &mentions {
4584 if ["xe", "xem"].contains(&m.text.to_lowercase().as_str()) {
4585 assert_eq!(
4586 m.mention_type,
4587 MentionType::Pronominal,
4588 "Neopronouns should be Pronominal type"
4589 );
4590 }
4591 }
4592 }
4593
/// Dual number must agree with plural but not with singular, and `score_pair`
/// must score the plural/dual pairing above the singular/dual pairing.
#[test]
fn test_dual_number_compatibility_scoring() {
    let coref = MentionRankingCoref::new();

    // Fixture builder: in this test the surface text doubles as the head, and
    // the spans are placeholder offsets rather than positions in a real text.
    let mention = |start, end, surface: &str, mention_type, gender, number| RankedMention {
        start,
        end,
        text: surface.to_string(),
        mention_type,
        gender: Some(gender),
        number: Some(number),
        head: surface.to_string(),
    };

    // Arabic dual noun ("two books").
    let dual = mention(
        0,
        5,
        "كتابان",
        MentionType::Nominal,
        Gender::Neutral,
        Number::Dual,
    );
    // Arabic plural pronoun ("they").
    let plural = mention(
        10,
        15,
        "هم",
        MentionType::Pronominal,
        Gender::Unknown,
        Number::Plural,
    );
    // Arabic singular pronoun ("he").
    let singular = mention(
        20,
        22,
        "هو",
        MentionType::Pronominal,
        Gender::Masculine,
        Number::Singular,
    );

    // First check the compatibility relation itself.
    assert!(
        Number::Dual.is_compatible(&Number::Plural),
        "Dual should be compatible with Plural"
    );
    assert!(
        !Number::Dual.is_compatible(&Number::Singular),
        "Dual should NOT be compatible with Singular"
    );

    // Then check that the relation is reflected in pair scores (same distance,
    // no source text supplied).
    let with_plural = coref.score_pair(&plural, &dual, 5, None);
    let with_singular = coref.score_pair(&singular, &dual, 5, None);

    assert!(
        with_plural > with_singular,
        "Dual-Plural score ({}) should be higher than Dual-Singular ({})",
        with_plural,
        with_singular
    );
}
4657
/// `Number::Unknown` must be compatible with every number value, and a
/// pronoun of unknown number must not score at or below -1.0 against either a
/// singular or a plural candidate.
#[test]
fn test_number_compatibility_unknown() {
    assert!(Number::Unknown.is_compatible(&Number::Singular));
    assert!(Number::Unknown.is_compatible(&Number::Plural));
    assert!(Number::Unknown.is_compatible(&Number::Dual));
    assert!(Number::Unknown.is_compatible(&Number::Unknown));

    let coref = MentionRankingCoref::new();

    // "They" carries unknown number so it can pair with either candidate below.
    let they = RankedMention {
        text: String::from("They"),
        head: String::from("They"),
        start: 0,
        end: 4,
        mention_type: MentionType::Pronominal,
        gender: Some(Gender::Unknown),
        number: Some(Number::Unknown),
    };

    let alex = RankedMention {
        text: String::from("Alex"),
        head: String::from("Alex"),
        start: 10,
        end: 14,
        mention_type: MentionType::Proper,
        gender: Some(Gender::Unknown),
        number: Some(Number::Singular),
    };

    let students = RankedMention {
        text: String::from("the students"),
        head: String::from("students"),
        start: 20,
        end: 30,
        mention_type: MentionType::Nominal,
        gender: Some(Gender::Unknown),
        number: Some(Number::Plural),
    };

    // Both pairings use the same distance and no source text.
    let score_with_they = |other: &RankedMention| coref.score_pair(&they, other, 5, None);
    let singular_score = score_with_they(&alex);
    let plural_score = score_with_they(&students);

    assert!(
        singular_score > -1.0,
        "'They' ↔ singular should not be heavily penalized: {}",
        singular_score
    );
    assert!(
        plural_score > -1.0,
        "'They' ↔ plural should not be heavily penalized: {}",
        plural_score
    );
}
4716
/// For each weather-verb sentence, `detect_mentions` must return no mention
/// whose lowercased text is "it".
#[test]
fn test_pleonastic_it_weather() {
    let coref = MentionRankingCoref::new();

    for sentence in [
        "It rains every day in Seattle.",
        "It is raining outside.",
        "It snows heavily in winter.",
        "It was snowing when we arrived.",
        "It thundered all night.",
    ] {
        let detected = coref.detect_mentions(sentence).unwrap();
        assert!(
            detected.iter().all(|m| m.text.to_lowercase() != "it"),
            "Weather 'it' should be filtered as pleonastic in: '{}'\nDetected: {:?}",
            sentence,
            detected.iter().map(|m| &m.text).collect::<Vec<_>>()
        );
    }
}
4745
/// For each "it + be + weather adjective" sentence, `detect_mentions` must
/// return no mention whose lowercased text is "it".
#[test]
fn test_pleonastic_it_weather_adjectives() {
    let coref = MentionRankingCoref::new();

    for sentence in [
        "It is sunny today.",
        "It was cold last night.",
        "It's foggy this morning.",
        "It will be warm tomorrow.",
    ] {
        let detected = coref.detect_mentions(sentence).unwrap();
        assert!(
            detected.iter().all(|m| m.text.to_lowercase() != "it"),
            "Weather adjective 'it' should be filtered: '{}'\nDetected: {:?}",
            sentence,
            detected.iter().map(|m| &m.text).collect::<Vec<_>>()
        );
    }
}
4768
/// For each "it is <adjective> that/to ..." sentence, `detect_mentions` must
/// return no mention whose lowercased text is "it".
#[test]
fn test_pleonastic_it_modal() {
    let coref = MentionRankingCoref::new();

    for sentence in [
        "It is important that we finish on time.",
        "It is likely that he will arrive late.",
        "It was clear that something was wrong.",
        "It is necessary to complete the form.",
        "It's obvious that she was upset.",
    ] {
        let detected = coref.detect_mentions(sentence).unwrap();
        assert!(
            detected.iter().all(|m| m.text.to_lowercase() != "it"),
            "Modal 'it' should be filtered: '{}'\nDetected: {:?}",
            sentence,
            detected.iter().map(|m| &m.text).collect::<Vec<_>>()
        );
    }
}
4792
/// For each "it seems/appears/turns out/happened ..." sentence,
/// `detect_mentions` must return no mention whose lowercased text is "it".
#[test]
fn test_pleonastic_it_cognitive_verbs() {
    let coref = MentionRankingCoref::new();

    for sentence in [
        "It seems that the project is delayed.",
        "It appears he was mistaken.",
        "It turns out she was right.",
        "It happened that we met by chance.",
    ] {
        let detected = coref.detect_mentions(sentence).unwrap();
        assert!(
            detected.iter().all(|m| m.text.to_lowercase() != "it"),
            "Cognitive verb 'it' should be filtered: '{}'\nDetected: {:?}",
            sentence,
            detected.iter().map(|m| &m.text).collect::<Vec<_>>()
        );
    }
}
4815
/// Counterpart to the pleonastic tests: when "it" refers back to a noun in the
/// text, `detect_mentions` must still return a mention whose lowercased text
/// is "it".
#[test]
fn test_referential_it_not_filtered() {
    let coref = MentionRankingCoref::new();

    for passage in [
        "I read the book. It was fascinating.",
        "The car broke down. We had to push it.",
        "She gave him a gift. He loved it.",
    ] {
        let detected = coref.detect_mentions(passage).unwrap();
        assert!(
            detected.iter().any(|m| m.text.to_lowercase() == "it"),
            "Referential 'it' should be detected: '{}'\nDetected: {:?}",
            passage,
            detected.iter().map(|m| &m.text).collect::<Vec<_>>()
        );
    }
}
4838
/// For each time-of-day sentence, `detect_mentions` must return no mention
/// whose lowercased text is "it".
#[test]
fn test_pleonastic_it_time_expressions() {
    let coref = MentionRankingCoref::new();

    for sentence in [
        "It is midnight.",
        "It was noon when we left.",
        "It is 5 o'clock.",
    ] {
        let detected = coref.detect_mentions(sentence).unwrap();
        assert!(
            detected.iter().all(|m| m.text.to_lowercase() != "it"),
            "Time expression 'it' should be filtered: '{}'\nDetected: {:?}",
            sentence,
            detected.iter().map(|m| &m.text).collect::<Vec<_>>()
        );
    }
}
4860
/// Stand-alone demonstratives ("This", "Those") must appear among the detected
/// mentions, compared case-insensitively via lowercasing.
#[test]
fn test_demonstrative_pronoun_detection() {
    let coref = MentionRankingCoref::new();

    let detected = coref
        .detect_mentions("I saw the problem. This was unexpected. Those are the facts.")
        .unwrap();
    let lowered: Vec<String> = detected.iter().map(|m| m.text.to_lowercase()).collect();

    assert!(
        lowered.iter().any(|t| t == "this"),
        "Should detect 'This': {:?}",
        lowered
    );
    assert!(
        lowered.iter().any(|t| t == "those"),
        "Should detect 'Those': {:?}",
        lowered
    );
}
4884
/// "this"/"that" must be detected with singular number and "these"/"those"
/// with plural number.
#[test]
fn test_demonstrative_pronoun_number() {
    let coref = MentionRankingCoref::new();

    let detected = coref
        .detect_mentions("This is important. These are facts. That was clear. Those were obvious.")
        .unwrap();

    // Number of the first mention matching `word`; the outer `None` means the
    // word was not detected at all.
    let number_of = |word: &str| {
        detected
            .iter()
            .find(|m| m.text.to_lowercase() == word)
            .map(|m| m.number)
    };

    assert_eq!(number_of("this"), Some(Some(Number::Singular)));
    assert_eq!(number_of("these"), Some(Some(Number::Plural)));
    assert_eq!(number_of("that"), Some(Some(Number::Singular)));
    assert_eq!(number_of("those"), Some(Some(Number::Plural)));
}
4903
/// Indefinite pronouns ("Someone", "Everyone") must appear among the detected
/// mentions, compared case-insensitively via lowercasing.
#[test]
fn test_indefinite_pronoun_detection() {
    let coref = MentionRankingCoref::new();

    let detected = coref
        .detect_mentions("Someone called yesterday. Everyone was surprised.")
        .unwrap();
    let lowered: Vec<String> = detected.iter().map(|m| m.text.to_lowercase()).collect();

    assert!(
        lowered.iter().any(|t| t == "someone"),
        "Should detect 'Someone': {:?}",
        lowered
    );
    assert!(
        lowered.iter().any(|t| t == "everyone"),
        "Should detect 'Everyone': {:?}",
        lowered
    );
}
4927
/// "everyone" and "nobody" must both be detected and both carry singular
/// number.
#[test]
fn test_indefinite_pronouns_are_singular() {
    let coref = MentionRankingCoref::new();

    let detected = coref
        .detect_mentions("Everyone was there. Nobody left early.")
        .unwrap();

    // Presence of both pronouns is checked before either number is inspected.
    let everyone = detected
        .iter()
        .find(|m| m.text.to_lowercase() == "everyone")
        .expect("Should detect 'Everyone'");
    let nobody = detected
        .iter()
        .find(|m| m.text.to_lowercase() == "nobody")
        .expect("Should detect 'Nobody'");

    assert_eq!(
        everyone.number,
        Some(Number::Singular),
        "'everyone' is grammatically singular"
    );
    assert_eq!(
        nobody.number,
        Some(Number::Singular),
        "'nobody' is grammatically singular"
    );
}
4956
/// Both sentence-initial occurrences of impersonal "One" must be detected as
/// mentions.
#[test]
fn test_impersonal_one_detection() {
    let coref = MentionRankingCoref::new();

    let detected = coref
        .detect_mentions("One should always be prepared. One never knows what might happen.")
        .unwrap();
    let occurrences = detected
        .iter()
        .filter(|m| m.text.to_lowercase() == "one")
        .count();

    assert!(
        occurrences >= 2,
        "Should detect impersonal 'one': {:?}",
        detected.iter().map(|m| &m.text).collect::<Vec<_>>()
    );
}
4975
/// Reflexive pronouns ("himself", "herself") must appear among the detected
/// mentions.
#[test]
fn test_reflexive_pronoun_detection() {
    let coref = MentionRankingCoref::new();

    let detected = coref
        .detect_mentions("John saw himself in the mirror. Mary hurt herself.")
        .unwrap();
    let lowered: Vec<String> = detected.iter().map(|m| m.text.to_lowercase()).collect();

    assert!(
        lowered.iter().any(|t| t == "himself"),
        "Should detect 'himself': {:?}",
        lowered
    );
    assert!(
        lowered.iter().any(|t| t == "herself"),
        "Should detect 'herself': {:?}",
        lowered
    );
}
4999
/// Reflexive pronouns must be detected with the gender of their base pronoun:
/// himself is masculine, herself is feminine, itself is neutral.
#[test]
fn test_reflexive_pronoun_gender() {
    let coref = MentionRankingCoref::new();

    let detected = coref
        .detect_mentions("He saw himself. She saw herself. It fixed itself.")
        .unwrap();

    // All three reflexives must be present before any gender is compared.
    let himself = detected
        .iter()
        .find(|m| m.text.to_lowercase() == "himself")
        .expect("Should detect 'himself'");
    let herself = detected
        .iter()
        .find(|m| m.text.to_lowercase() == "herself")
        .expect("Should detect 'herself'");
    let itself = detected
        .iter()
        .find(|m| m.text.to_lowercase() == "itself")
        .expect("Should detect 'itself'");

    assert_eq!(himself.gender, Some(Gender::Masculine));
    assert_eq!(herself.gender, Some(Gender::Feminine));
    assert_eq!(itself.gender, Some(Gender::Neutral));
}
5019
/// The two-word reciprocals "each other" and "one another" must each be
/// detected as a single mention.
#[test]
fn test_reciprocal_pronoun_detection() {
    let coref = MentionRankingCoref::new();

    let detected = coref
        .detect_mentions(
            "John and Mary looked at each other. The teams competed against one another.",
        )
        .unwrap();
    let lowered: Vec<String> = detected.iter().map(|m| m.text.to_lowercase()).collect();

    assert!(
        lowered.iter().any(|t| t == "each other"),
        "Should detect 'each other': {:?}",
        lowered
    );
    assert!(
        lowered.iter().any(|t| t == "one another"),
        "Should detect 'one another': {:?}",
        lowered
    );
}
5043
/// The reciprocal "each other" must be detected and carry plural number.
#[test]
fn test_reciprocal_pronouns_are_plural() {
    let coref = MentionRankingCoref::new();

    let detected = coref.detect_mentions("They helped each other.").unwrap();

    let reciprocal = detected
        .iter()
        .find(|m| m.text.to_lowercase() == "each other")
        .expect("Should detect 'each other'");

    assert_eq!(
        reciprocal.number,
        Some(Number::Plural),
        "Reciprocals are grammatically plural"
    );
}
5062
5063 use proptest::prelude::*;
5073
5074 fn text_with_pronouns() -> impl Strategy<Value = String> {
5076 prop::collection::vec(
5077 prop_oneof![
5078 Just("he".to_string()),
5079 Just("she".to_string()),
5080 Just("they".to_string()),
5081 Just("it".to_string()),
5082 Just("the dog".to_string()),
5083 Just("John".to_string()),
5084 "[a-z]{3,10}".prop_map(|s| s),
5085 ],
5086 3..15,
5087 )
5088 .prop_map(|words| words.join(" ") + ".")
5089 }
5090
/// With nominal-adjective detection enabled and the language set to "de",
/// some detected mention must contain "armen" (from "Die Armen").
#[test]
fn test_multilingual_nominal_adjective_german() {
    let coref = MentionRankingCoref::with_config(MentionRankingConfig {
        language: String::from("de"),
        enable_nominal_adjective_detection: true,
        ..Default::default()
    });

    let detected = coref
        .detect_mentions("Die Armen leiden unter der Krise.")
        .unwrap();

    assert!(
        detected
            .iter()
            .any(|m| m.text.to_lowercase().contains("armen")),
        "Should detect 'die Armen' as a nominal adjective in German"
    );
}
5115
/// With nominal-adjective detection enabled and the language set to "fr",
/// some detected mention must contain "pauvres" (from "Les pauvres").
#[test]
fn test_multilingual_nominal_adjective_french() {
    let coref = MentionRankingCoref::with_config(MentionRankingConfig {
        language: String::from("fr"),
        enable_nominal_adjective_detection: true,
        ..Default::default()
    });

    let detected = coref
        .detect_mentions("Les pauvres ont besoin d'aide.")
        .unwrap();

    assert!(
        detected
            .iter()
            .any(|m| m.text.to_lowercase().contains("pauvres")),
        "Should detect 'les pauvres' as a nominal adjective in French"
    );
}
5136
/// With nominal-adjective detection enabled and the language set to "es",
/// some detected mention must contain "pobres" (from "Los pobres").
#[test]
fn test_multilingual_nominal_adjective_spanish() {
    let coref = MentionRankingCoref::with_config(MentionRankingConfig {
        language: String::from("es"),
        enable_nominal_adjective_detection: true,
        ..Default::default()
    });

    let detected = coref
        .detect_mentions("Los pobres necesitan ayuda.")
        .unwrap();

    assert!(
        detected
            .iter()
            .any(|m| m.text.to_lowercase().contains("pobres")),
        "Should detect 'los pobres' as a nominal adjective in Spanish"
    );
}
5157
/// The default configuration and the `book_scale` and `clinical` presets must
/// all report "en" as their language.
#[test]
fn test_config_language_field() {
    assert_eq!(MentionRankingConfig::default().language, "en");
    assert_eq!(MentionRankingConfig::book_scale().language, "en");
    assert_eq!(MentionRankingConfig::clinical().language, "en");
}
5172
5173 proptest! {
5174 #![proptest_config(ProptestConfig::with_cases(50))]
5175
5176 #[test]
5180 fn mention_spans_within_bounds(text in text_with_pronouns()) {
5181 let coref = MentionRankingCoref::new();
5182 if let Ok(mentions) = coref.detect_mentions(&text) {
5183 let char_count = text.chars().count();
5184 for mention in &mentions {
5185 prop_assert!(
5186 mention.start <= mention.end,
5187 "Start {} > end {} for '{}'",
5188 mention.start, mention.end, mention.text
5189 );
5190 prop_assert!(
5191 mention.end <= char_count,
5192 "End {} > text length {} for '{}'",
5193 mention.end, char_count, mention.text
5194 );
5195 }
5196 }
5197 }
5198
5199 #[test]
5203 fn mention_text_matches_span(text in text_with_pronouns()) {
5204 let coref = MentionRankingCoref::new();
5205 if let Ok(mentions) = coref.detect_mentions(&text) {
5206 for mention in &mentions {
5207 let extracted: String = text.chars()
5208 .skip(mention.start)
5209 .take(mention.end - mention.start)
5210 .collect();
5211 prop_assert_eq!(
5213 extracted.to_lowercase(),
5214 mention.text.to_lowercase(),
5215 "Extracted text doesn't match stored text"
5216 );
5217 }
5218 }
5219 }
5220
5221 #[test]
5223 fn pronouns_are_pronominal(text in text_with_pronouns()) {
5224 let coref = MentionRankingCoref::new();
5225 if let Ok(mentions) = coref.detect_mentions(&text) {
5226 let pronouns = ["he", "she", "it", "they", "him", "her", "them"];
5227 for mention in &mentions {
5228 if pronouns.contains(&mention.text.to_lowercase().as_str()) {
5229 prop_assert_eq!(
5230 mention.mention_type,
5231 MentionType::Pronominal,
5232 "'{}' should be Pronominal",
5233 mention.text
5234 );
5235 }
5236 }
5237 }
5238 }
5239
5240 #[test]
5242 fn pronouns_have_gender(text in text_with_pronouns()) {
5243 let coref = MentionRankingCoref::new();
5244 if let Ok(mentions) = coref.detect_mentions(&text) {
5245 for mention in &mentions {
5246 if mention.mention_type == MentionType::Pronominal {
5247 prop_assert!(
5248 mention.gender.is_some(),
5249 "Pronoun '{}' should have gender",
5250 mention.text
5251 );
5252 }
5253 }
5254 }
5255 }
5256
5257 #[test]
5259 fn pronouns_have_number(text in text_with_pronouns()) {
5260 let coref = MentionRankingCoref::new();
5261 if let Ok(mentions) = coref.detect_mentions(&text) {
5262 for mention in &mentions {
5263 if mention.mention_type == MentionType::Pronominal {
5264 prop_assert!(
5265 mention.number.is_some(),
5266 "Pronoun '{}' should have number",
5267 mention.text
5268 );
5269 }
5270 }
5271 }
5272 }
5273
5274 #[test]
5276 fn clusters_partition_mentions(text in text_with_pronouns()) {
5277 let coref = MentionRankingCoref::new();
5278 if let Ok(clusters) = coref.resolve(&text) {
5279 let mut all_mentions: Vec<_> = clusters.iter()
5281 .flat_map(|c| &c.mentions)
5282 .collect();
5283
5284 let original_len = all_mentions.len();
5286 all_mentions.sort_by_key(|m| (m.start, m.end));
5287 all_mentions.dedup_by_key(|m| (m.start, m.end));
5288 prop_assert_eq!(
5289 all_mentions.len(),
5290 original_len,
5291 "Duplicate mentions across clusters"
5292 );
5293 }
5294 }
5295
5296 #[test]
5300 fn score_pair_deterministic(text in text_with_pronouns()) {
5301 let coref = MentionRankingCoref::new();
5302 if let Ok(mentions) = coref.detect_mentions(&text) {
5303 if mentions.len() >= 2 {
5304 let distance = mentions[1].start.saturating_sub(mentions[0].end);
5305 let score1 = coref.score_pair(&mentions[0], &mentions[1], distance, Some(&text));
5306 let score2 = coref.score_pair(&mentions[0], &mentions[1], distance, Some(&text));
5307 prop_assert!(
5308 (score1 - score2).abs() < 0.0001,
5309 "Scoring should be deterministic"
5310 );
5311 }
5312 }
5313 }
5314 }
5315}