Skip to main content

anno/backends/mention_ranking/
algorithm.rs

1//! MentionRankingCoref model: scoring, clustering, and coreference resolution.
2
3#[allow(unused_imports)]
4use super::types::*;
5#[allow(unused_imports)]
6use super::*;
7
8/// Mention-ranking coreference resolver.
9pub struct MentionRankingCoref {
10    /// Configuration.
11    config: MentionRankingConfig,
12    /// Optional NER model for mention detection.
13    ner: Option<Box<dyn Model>>,
14    /// Optional pre-computed salience scores (entity text -> salience).
15    /// Keys should be lowercase for case-insensitive lookup.
16    salience_scores: Option<HashMap<String, f64>>,
17}
18
19impl std::fmt::Debug for MentionRankingCoref {
20    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
21        f.debug_struct("MentionRankingCoref")
22            .field("config", &self.config)
23            .field("ner", &self.ner.as_ref().map(|_| "Some(dyn Model)"))
24            .field(
25                "salience_scores",
26                &self
27                    .salience_scores
28                    .as_ref()
29                    .map(|s| format!("{} entities", s.len())),
30            )
31            .finish()
32    }
33}
34
35impl MentionRankingCoref {
36    /// Create a new mention-ranking coref resolver.
37    #[must_use]
38    pub fn new() -> Self {
39        Self::with_config(MentionRankingConfig::default())
40    }
41
42    /// Create with custom configuration.
43    #[must_use]
44    pub fn with_config(config: MentionRankingConfig) -> Self {
45        Self {
46            config,
47            ner: None,
48            salience_scores: None,
49        }
50    }
51
52    /// Set the NER model for mention detection.
53    pub fn with_ner(mut self, ner: Box<dyn Model>) -> Self {
54        self.ner = Some(ner);
55        self
56    }
57
58    /// Set pre-computed salience scores for entities.
59    ///
60    /// Salience scores should be in range [0, 1] where higher means more
61    /// important/salient. Keys are entity text (will be lowercased for lookup).
62    ///
63    /// Use with `config.salience_weight > 0` to enable salience-weighted scoring.
64    ///
65    /// # Example
66    ///
67    /// ```rust,ignore
68    /// use anno::salience::{EntityRanker, TextRankSalience};
69    ///
70    /// let ranker = TextRankSalience::default();
71    /// let ranked = ranker.rank(text, &entities);
72    ///
73    /// // Normalize scores to [0, 1]
74    /// let max_score = ranked.iter().map(|(_, s)| *s).fold(0.0_f64, f64::max);
75    /// let salience_scores: HashMap<String, f64> = ranked.into_iter()
76    ///     .map(|(e, score)| (e.text.to_lowercase(), score / max_score.max(1e-10)))
77    ///     .collect();
78    ///
79    /// let coref = MentionRankingCoref::new()
80    ///     .with_salience(salience_scores);
81    /// ```
82    #[must_use]
83    pub fn with_salience(mut self, scores: HashMap<String, f64>) -> Self {
84        // Normalize keys to lowercase
85        let normalized: HashMap<String, f64> = scores
86            .into_iter()
87            .map(|(k, v)| (k.to_lowercase(), v))
88            .collect();
89        self.salience_scores = Some(normalized);
90        self
91    }
92
93    /// Get salience score for an entity (returns 0.0 if not found).
94    fn get_salience(&self, text: &str) -> f64 {
95        self.salience_scores
96            .as_ref()
97            .and_then(|s| s.get(&text.to_lowercase()).copied())
98            .unwrap_or(0.0)
99    }
100
101    // =========================================================================
102    // i2b2-inspired rule-based features (Chen et al. 2011)
103    // =========================================================================
104
105    /// Check if two mentions are connected by a "be phrase" (X is Y pattern).
106    ///
107    /// From Chen et al. (2011): "if there is a 'be phrase' between two concepts
108    /// of the same type, they are probably saying 'something is something'."
109    ///
110    /// # Examples
111    ///
112    /// - "Resolution of organism is Methicillin-resistant Staphylococcus" → true
113    /// - "The patient is John Smith" → true
114    /// - "John saw Mary" → false
115    fn is_be_phrase_link(&self, text: &str, m1: &RankedMention, m2: &RankedMention) -> bool {
116        // Ensure mentions don't overlap and are ordered
117        let (earlier, later) = if m1.end <= m2.start {
118            (m1, m2)
119        } else if m2.end <= m1.start {
120            (m2, m1)
121        } else {
122            return false; // Overlapping mentions
123        };
124
125        // Get text between mentions (convert char offsets to get the substring)
126        let text_chars: Vec<char> = text.chars().collect();
127        if later.start > text_chars.len() || earlier.end > text_chars.len() {
128            return false;
129        }
130
131        let between: String = text_chars
132            .get(earlier.end..later.start)
133            .unwrap_or(&[])
134            .iter()
135            .collect();
136        let between_lower = between.to_lowercase();
137
138        // Be-phrase patterns from i2b2 paper
139        static BE_PATTERNS: &[&str] = &[
140            " is ",
141            " are ",
142            " was ",
143            " were ",
144            " be ",
145            " being ",
146            " been ",
147            " refers to ",
148            " means ",
149            " indicates ",
150            " represents ",
151            " also known as ",
152            " aka ",
153            " i.e. ",
154            " ie ",
155            " namely ",
156            " called ",
157            " named ",
158            " known as ",
159            " defined as ",
160        ];
161
162        BE_PATTERNS.iter().any(|p| between_lower.contains(p))
163    }
164
165    /// Check if one mention is an acronym of the other.
166    ///
167    /// Delegates to the language-agnostic `anno::coalesce::is_acronym_match` function.
168    ///
169    /// From Chen et al. (2011): "The first letters of each word in concepts
170    /// that have two or more words are taken and compared to whole words
171    /// in other concepts."
172    ///
173    /// # Examples
174    ///
175    /// - "MRSA" ↔ "Methicillin-resistant Staphylococcus aureus" → true
176    /// - "WHO" ↔ "World Health Organization" → true
177    /// - "IBM" ↔ "Apple" → false
178    fn is_acronym_match(&self, m1: &RankedMention, m2: &RankedMention) -> bool {
179        anno_core::coalesce::similarity::is_acronym_match(&m1.text, &m2.text)
180    }
181
182    /// Check if "it" at the given position is pleonastic (non-referential).
183    ///
184    /// Pleonastic "it" is a grammatical placeholder that doesn't refer to any
185    /// entity. Common patterns include:
186    /// - Weather: "it rains", "it is sunny", "it's cold"
187    /// - Modal: "it is important that...", "it is likely..."
188    /// - Cognitive: "it seems", "it appears", "it turns out"
189    /// - Cleft: "it was John who..."
190    ///
191    /// Based on: Boyd et al. "Identification of Pleonastic It Using the Web"
192    /// and Stanford CoreNLP's PleonasticFilter patterns.
193    fn is_pleonastic_it(&self, text_lower: &str, it_byte_pos: usize) -> bool {
194        // Get the text after "it"
195        let after_it = &text_lower[it_byte_pos + 2..]; // Skip "it"
196        let after_it_trimmed = after_it.trim_start();
197
198        // Weather verbs: "it rains", "it snows", "it hails"
199        const WEATHER_VERBS: &[&str] = &[
200            "rain",
201            "rains",
202            "rained",
203            "raining",
204            "snow",
205            "snows",
206            "snowed",
207            "snowing",
208            "hail",
209            "hails",
210            "hailed",
211            "hailing",
212            "thunder",
213            "thunders",
214            "thundered",
215            "thundering",
216        ];
217
218        // Weather adjectives: "it is sunny", "it's cold"
219        const WEATHER_ADJS: &[&str] = &[
220            "sunny", "cloudy", "foggy", "windy", "rainy", "snowy", "cold", "hot", "warm", "cool",
221            "humid", "dry", "freezing", "chilly", "muggy", "overcast",
222        ];
223
224        // Modal/cognitive adjectives: "it is important", "it seems likely"
225        const MODAL_ADJS: &[&str] = &[
226            "important",
227            "necessary",
228            "possible",
229            "impossible",
230            "likely",
231            "unlikely",
232            "clear",
233            "obvious",
234            "evident",
235            "apparent",
236            "true",
237            "false",
238            "certain",
239            "uncertain",
240            "doubtful",
241            "essential",
242            "vital",
243            "crucial",
244            "critical",
245            "imperative",
246            "fortunate",
247            "unfortunate",
248            "surprising",
249            "unsurprising",
250            "strange",
251            "odd",
252            "weird",
253            "remarkable",
254            "noteworthy",
255            "known",
256            "unknown",
257            "believed",
258            "thought",
259            "said",
260            "reported",
261            "estimated",
262            "assumed",
263            "expected",
264            "hoped",
265            "feared",
266        ];
267
268        // Cognitive verbs: "it seems", "it appears"
269        const COGNITIVE_VERBS: &[&str] = &[
270            "seems",
271            "seem",
272            "seemed",
273            "appears",
274            "appear",
275            "appeared",
276            "turns out",
277            "turned out",
278            "happens",
279            "happen",
280            "happened",
281            "follows",
282            "follow",
283            "followed",
284            "matters",
285            "matter",
286            "mattered",
287            "helps",
288            "help",
289            "helped",
290            "hurts",
291            "hurt",
292        ];
293
294        // Check for weather verbs directly
295        for verb in WEATHER_VERBS {
296            if let Some(after_verb) = after_it_trimmed.strip_prefix(verb) {
297                if after_verb.is_empty() || after_verb.starts_with(|c: char| !c.is_alphanumeric()) {
298                    return true;
299                }
300            }
301        }
302
303        // Check for cognitive verbs
304        for verb in COGNITIVE_VERBS {
305            if let Some(after_verb) = after_it_trimmed.strip_prefix(verb) {
306                if after_verb.is_empty() || after_verb.starts_with(|c: char| !c.is_alphanumeric()) {
307                    return true;
308                }
309            }
310        }
311
312        // Check for "it is/was/has been/will be + MODAL_ADJ"
313        // Also handles contractions: "it's"
314        let copula_patterns = ["is ", "was ", "'s ", "has been ", "will be ", "would be "];
315        for copula in copula_patterns {
316            if let Some(after_copula) = after_it_trimmed.strip_prefix(copula) {
317                let after_copula = after_copula.trim_start();
318
319                // Check weather verbs after copula: "it is raining"
320                for verb in WEATHER_VERBS {
321                    if let Some(after_verb) = after_copula.strip_prefix(verb) {
322                        if after_verb.is_empty()
323                            || after_verb.starts_with(|c: char| !c.is_alphanumeric())
324                        {
325                            return true;
326                        }
327                    }
328                }
329
330                // Check weather adjectives
331                for adj in WEATHER_ADJS {
332                    if let Some(after_adj) = after_copula.strip_prefix(adj) {
333                        if after_adj.is_empty()
334                            || after_adj.starts_with(|c: char| !c.is_alphanumeric())
335                        {
336                            return true;
337                        }
338                    }
339                }
340
341                // Check modal adjectives
342                for adj in MODAL_ADJS {
343                    if let Some(after_adj) = after_copula.strip_prefix(adj) {
344                        // Modal adjectives often followed by "that", "to", or end of clause
345                        if after_adj.is_empty()
346                            || after_adj.starts_with(" that")
347                            || after_adj.starts_with(" to")
348                            || after_adj.starts_with(|c: char| !c.is_alphanumeric())
349                        {
350                            return true;
351                        }
352                    }
353                }
354
355                // Check for "it is/was + time expression"
356                // "it is 5 o'clock", "it was midnight"
357                let time_words = ["noon", "midnight", "morning", "evening", "night", "time"];
358                for tw in time_words {
359                    if after_copula.starts_with(tw) {
360                        return true;
361                    }
362                }
363
364                // Check for numeric time: "it is 5", "it's 3:00"
365                if after_copula.starts_with(|c: char| c.is_ascii_digit()) {
366                    return true;
367                }
368            }
369        }
370
371        false
372    }
373
374    /// Check if two mentions should NOT be linked based on context clues.
375    ///
376    /// From Chen et al. (2011): "eliminate links that actually refer to two
377    /// different entities based on clues found in the sentences surrounding
378    /// the mentions... including dates, locations, or descriptive modifiers."
379    ///
380    /// Returns true if the link should be filtered out.
381    fn should_filter_by_context(&self, text: &str, m1: &RankedMention, m2: &RankedMention) -> bool {
382        let text_chars: Vec<char> = text.chars().collect();
383        let char_count = text_chars.len();
384
385        // Get context windows around each mention (20 chars before and after)
386        let context_window = 20;
387
388        let m1_context_start = m1.start.saturating_sub(context_window);
389        let m1_context_end = (m1.end + context_window).min(char_count);
390        let m1_context: String = text_chars
391            .get(m1_context_start..m1_context_end)
392            .unwrap_or(&[])
393            .iter()
394            .collect();
395
396        let m2_context_start = m2.start.saturating_sub(context_window);
397        let m2_context_end = (m2.end + context_window).min(char_count);
398        let m2_context: String = text_chars
399            .get(m2_context_start..m2_context_end)
400            .unwrap_or(&[])
401            .iter()
402            .collect();
403
404        // Check for different dates (YYYY-MM-DD or MM/DD/YYYY patterns)
405        let date1 = Self::extract_date(&m1_context);
406        let date2 = Self::extract_date(&m2_context);
407        if let (Some(d1), Some(d2)) = (&date1, &date2) {
408            if d1 != d2 {
409                return true; // Different dates → different entities
410            }
411        }
412
413        // Check for negation context mismatches
414        // "not a smoker" vs "smoker" should not link
415        let m1_negated = Self::has_negation_context(&m1_context);
416        let m2_negated = Self::has_negation_context(&m2_context);
417        if m1_negated != m2_negated {
418            return true;
419        }
420
421        false
422    }
423
424    /// Extract a date from context text if present.
425    fn extract_date(context: &str) -> Option<String> {
426        // Simple date patterns: YYYY-MM-DD or MM/DD/YYYY
427        let date_patterns = [
428            r"\d{4}-\d{2}-\d{2}",       // ISO format
429            r"\d{2}/\d{2}/\d{4}",       // US format
430            r"\d{1,2}/\d{1,2}/\d{2,4}", // Flexible US
431        ];
432
433        for pattern in &date_patterns {
434            if let Ok(re) = regex::Regex::new(pattern) {
435                if let Some(m) = re.find(context) {
436                    return Some(m.as_str().to_string());
437                }
438            }
439        }
440        None
441    }
442
/// Check if context contains negation markers.
///
/// Matching is case-insensitive. A marker only counts when it starts at a
/// word boundary: at the beginning of `context`, or directly after a
/// non-alphanumeric character. This keeps words that merely end in a marker
/// ("piano ", "knot ", "casino ") from being read as negation.
///
/// Every marker ends with a space, so a marker that is the very last token of
/// `context` (with nothing after it) is not detected.
fn has_negation_context(context: &str) -> bool {
    const NEGATION_MARKERS: &[&str] = &[
        "not ",
        // Listed explicitly: the "not " inside "cannot " is not at a word boundary.
        "cannot ",
        "no ",
        "never ",
        "without ",
        "denies ",
        "denied ",
        "negative for ",
        "neg for ",
        "ruled out ",
        "r/o ",
    ];

    let lower = context.to_lowercase();
    NEGATION_MARKERS.iter().any(|marker| {
        lower.match_indices(*marker).any(|(idx, _)| {
            // `idx` comes from `match_indices`, so it is a valid char boundary.
            lower[..idx]
                .chars()
                .next_back()
                .map_or(true, |prev| !prev.is_alphanumeric())
        })
    })
}
460
461    /// Check if two mentions are synonyms.
462    ///
463    /// This method checks for synonym relationships between mentions.
464    /// By default, it uses high string similarity as a proxy for synonymy.
465    ///
466    /// For domain-specific synonym matching (medical, legal, etc.), integrate
467    /// a custom `anno::coalesce::SynonymSource` implementation. Available sources:
468    /// - UMLS MRCONSO for medical terminology
469    /// - WordNet for general English
470    /// - Wikidata aliases for multilingual entities
471    ///
472    /// The pluggable synonym infrastructure is defined in `anno::coalesce::similarity`:
473    /// - `SynonymSource` trait: implement to provide custom lookups
474    /// - `ChainedSynonyms`: combine multiple sources
475    /// - `SynonymMatch`: result type with canonical ID and confidence
476    ///
477    /// # Design Decision
478    ///
479    /// We deliberately removed the hardcoded English medical synonym table
480    /// (kidney→renal, heart→cardiac, etc.) that was here previously.
481    /// Hardcoded tables:
482    /// - Only work for one language (English)
483    /// - Only work for one domain (medical)
484    /// - Create maintenance burden
485    /// - Don't scale to new domains
486    ///
487    /// Instead, use high string similarity or integrate a proper knowledge base.
488    fn are_synonyms(&self, m1: &RankedMention, m2: &RankedMention) -> bool {
489        let t1 = m1.text.to_lowercase();
490        let t2 = m2.text.to_lowercase();
491
492        if t1 == t2 {
493            return true;
494        }
495
496        // Use multilingual string similarity from coalesce as a proxy.
497        // High similarity (>0.8) suggests related terms.
498        // This works across languages without hardcoded tables.
499        let similarity = anno_core::coalesce::similarity::multilingual_similarity(&t1, &t2);
500        similarity > 0.8
501    }
502
503    /// Resolve coreferences in text.
504    pub fn resolve(&self, text: &str) -> Result<Vec<MentionCluster>> {
505        if text.trim().is_empty() {
506            return Ok(vec![]);
507        }
508
509        // Step 1: Detect mentions
510        let mut mentions = self.detect_mentions(text)?;
511
512        if mentions.is_empty() {
513            return Ok(vec![]);
514        }
515
516        // Sort by position
517        mentions.sort_by_key(|m| (m.start, m.end));
518
519        // Step 2: Extract features for each mention
520        for mention in &mut mentions {
521            self.extract_features(mention);
522        }
523
524        // Step 3: Rank antecedents and link (pass text for context-aware features)
525        let clusters = self.link_mentions(&mentions, text);
526
527        Ok(clusters)
528    }
529
530    /// Get language-specific pronoun patterns.
531    ///
532    /// Returns (pronoun_text, gender, number) tuples for the specified language.
533    /// Falls back to English if language is not supported.
534    fn get_pronoun_patterns(&self) -> Vec<(&'static str, Gender, Number)> {
535        let lang_code = self
536            .config
537            .language
538            .split('-')
539            .next()
540            .unwrap_or(&self.config.language)
541            .to_lowercase();
542
543        match lang_code.as_str() {
544            "es" => vec![
545                // Spanish pronouns
546                ("él", Gender::Masculine, Number::Singular),
547                ("ella", Gender::Feminine, Number::Singular),
548                ("ellos", Gender::Masculine, Number::Plural),
549                ("ellas", Gender::Feminine, Number::Plural),
550                ("lo", Gender::Masculine, Number::Singular),
551                ("la", Gender::Feminine, Number::Singular),
552                ("los", Gender::Masculine, Number::Plural),
553                ("las", Gender::Feminine, Number::Plural),
554                ("le", Gender::Unknown, Number::Singular), // Leísmo - can be gender-neutral
555                ("les", Gender::Unknown, Number::Plural),
556                ("su", Gender::Unknown, Number::Unknown),
557                ("sus", Gender::Unknown, Number::Plural),
558                ("suyo", Gender::Masculine, Number::Singular),
559                ("suya", Gender::Feminine, Number::Singular),
560                ("suyos", Gender::Masculine, Number::Plural),
561                ("suyas", Gender::Feminine, Number::Plural),
562                ("se", Gender::Unknown, Number::Unknown), // Reflexive
563                ("nosotros", Gender::Masculine, Number::Plural),
564                ("nosotras", Gender::Feminine, Number::Plural),
565                ("vosotros", Gender::Masculine, Number::Plural),
566                ("vosotras", Gender::Feminine, Number::Plural),
567                ("usted", Gender::Unknown, Number::Singular),
568                ("ustedes", Gender::Unknown, Number::Plural),
569                // Non-binary options (emerging usage)
570                // Note: "elle" (singular) and "elles" (plural) are being used by some non-binary Spanish speakers
571                // though not yet standardized. Some also use "le" (leísmo) as gender-neutral.
572                ("elle", Gender::Unknown, Number::Singular), // Non-binary third-person (emerging)
573                ("elles", Gender::Unknown, Number::Plural), // Non-binary third-person plural (emerging)
574            ],
575            "fr" => vec![
576                // French pronouns
577                ("il", Gender::Masculine, Number::Singular),
578                ("elle", Gender::Feminine, Number::Singular),
579                ("ils", Gender::Masculine, Number::Plural),
580                ("elles", Gender::Feminine, Number::Plural),
581                ("le", Gender::Masculine, Number::Singular),
582                ("la", Gender::Feminine, Number::Singular),
583                ("les", Gender::Unknown, Number::Plural),
584                ("lui", Gender::Unknown, Number::Singular),
585                ("leur", Gender::Unknown, Number::Plural),
586                ("son", Gender::Masculine, Number::Singular),
587                ("sa", Gender::Feminine, Number::Singular),
588                ("ses", Gender::Unknown, Number::Plural),
589                ("se", Gender::Unknown, Number::Unknown), // Reflexive
590                ("nous", Gender::Unknown, Number::Plural),
591                ("vous", Gender::Unknown, Number::Unknown),
592                // Non-binary options (emerging usage)
593                // Note: "iel" (singular) and "iels" (plural) are being used by some non-binary French speakers
594                // though not yet standardized in formal French
595                ("iel", Gender::Unknown, Number::Singular), // Non-binary third-person (emerging)
596                ("iels", Gender::Unknown, Number::Plural), // Non-binary third-person plural (emerging)
597            ],
598            "de" => vec![
599                // German pronouns
600                ("er", Gender::Masculine, Number::Singular),
601                ("sie", Gender::Feminine, Number::Singular),
602                ("es", Gender::Neutral, Number::Singular),
603                ("sie", Gender::Unknown, Number::Plural), // Same form as feminine singular
604                ("ihn", Gender::Masculine, Number::Singular),
605                ("ihr", Gender::Feminine, Number::Singular),
606                ("ihm", Gender::Masculine, Number::Singular),
607                ("ihnen", Gender::Unknown, Number::Plural),
608                ("sein", Gender::Masculine, Number::Singular),
609                ("seine", Gender::Feminine, Number::Singular),
610                ("sein", Gender::Neutral, Number::Singular),
611                ("ihre", Gender::Feminine, Number::Singular),
612                ("ihr", Gender::Unknown, Number::Plural),
613                ("sich", Gender::Unknown, Number::Unknown), // Reflexive
614                ("wir", Gender::Unknown, Number::Plural),
615                ("ihr", Gender::Unknown, Number::Plural), // 2nd person plural
616                ("sie", Gender::Unknown, Number::Plural), // 3rd person plural (formal)
617                // Non-binary options (emerging usage)
618                // Note: "sier" and "xier" are being used by some non-binary German speakers
619                // though not yet standardized. "es" (it) is grammatically neutral but dehumanizing.
620                ("sier", Gender::Unknown, Number::Singular), // Non-binary third-person (emerging)
621                ("xier", Gender::Unknown, Number::Singular), // Non-binary third-person (emerging, alternative)
622                ("dier", Gender::Unknown, Number::Singular), // Non-binary third-person (emerging, alternative)
623            ],
624            "ar" => vec![
625                // Arabic pronouns (RTL)
626                ("هو", Gender::Masculine, Number::Singular), // huwa
627                ("هي", Gender::Feminine, Number::Singular),  // hiya
628                ("هم", Gender::Masculine, Number::Plural),   // hum
629                ("هن", Gender::Feminine, Number::Plural),    // hunna
630                ("هما", Gender::Unknown, Number::Plural),    // huma (dual)
631            ],
632            "ru" => vec![
633                // Russian pronouns
634                ("он", Gender::Masculine, Number::Singular),
635                ("она", Gender::Feminine, Number::Singular),
636                ("оно", Gender::Neutral, Number::Singular),
637                ("они", Gender::Unknown, Number::Plural),
638                ("его", Gender::Masculine, Number::Singular),
639                ("её", Gender::Feminine, Number::Singular),
640                ("их", Gender::Unknown, Number::Plural),
641                ("себя", Gender::Unknown, Number::Unknown), // Reflexive
642                ("мы", Gender::Unknown, Number::Plural),
643                ("вы", Gender::Unknown, Number::Unknown),
644            ],
645            "zh" => vec![
646                // Chinese pronouns
647                // Traditional gendered forms (introduced in 20th century)
648                ("他", Gender::Masculine, Number::Singular), // tā - he (also used as gender-neutral historically)
649                ("她", Gender::Feminine, Number::Singular),  // tā - she
650                ("它", Gender::Neutral, Number::Singular),   // tā - it (objects)
651                ("牠", Gender::Neutral, Number::Singular),   // tā - it (animals, traditional)
652                ("祂", Gender::Neutral, Number::Singular),   // tā - it (deities)
653                // Gender-neutral options for non-binary individuals
654                ("怹", Gender::Unknown, Number::Singular), // tān - honorific gender-neutral "they" (archaic but exists)
655                ("其", Gender::Unknown, Number::Singular), // qí - formal gender-neutral pronoun (very formal)
656                // Modern non-binary options (pinyin, used in informal/online contexts)
657                // Note: "TA" and "X也" are typically written in pinyin/latin, but we include them
658                // for completeness. In practice, these may appear as "TA" or "X也" in text.
659                ("他们", Gender::Masculine, Number::Plural), // tāmen - they (masculine/mixed)
660                ("她们", Gender::Feminine, Number::Plural),  // tāmen - they (feminine)
661                ("它们", Gender::Neutral, Number::Plural),   // tāmen - they (objects)
662                                                             // Note: In spoken Chinese, all third-person pronouns are pronounced "tā" (gender-neutral)
663                                                             // The gender distinction exists only in written form
664            ],
665            "ja" => vec![
666                // Japanese pronouns
667                // Third-person (historically gender-neutral, now gendered in modern usage)
668                ("彼", Gender::Masculine, Number::Singular), // kare - he (originally gender-neutral)
669                ("彼女", Gender::Feminine, Number::Singular), // kanojo - she
670                ("彼ら", Gender::Unknown, Number::Plural),   // karera - they
671                // Gender-neutral alternatives (modern usage)
672                // Note: Japanese often avoids pronouns entirely, using names/titles instead
673                // For non-binary individuals, その人 (sono hito - that person) or name/title is common
674                ("その人", Gender::Unknown, Number::Singular), // sono hito - that person (gender-neutral)
675                ("あの人", Gender::Unknown, Number::Singular), // ano hito - that person (gender-neutral)
676            ],
677            "ko" => vec![
678                // Korean pronouns
679                // Korean often avoids third-person pronouns, using names/titles
680                ("그", Gender::Masculine, Number::Singular), // geu - he (also means "that")
681                ("그녀", Gender::Feminine, Number::Singular), // geunyeo - she (literally "that woman")
682                ("그들", Gender::Unknown, Number::Plural),    // geudeul - they
683                // Gender-neutral alternatives
684                ("그 사람", Gender::Unknown, Number::Singular), // geu saram - that person (gender-neutral)
685                ("그분", Gender::Unknown, Number::Singular), // geubun - that person (honorific, gender-neutral)
686            ],
687            _ => {
688                // English (default) - comprehensive pronoun patterns including neopronouns
689                vec![
690                    // Traditional pronouns
691                    ("he", Gender::Masculine, Number::Singular),
692                    ("she", Gender::Feminine, Number::Singular),
693                    ("it", Gender::Neutral, Number::Singular),
694                    ("they", Gender::Unknown, Number::Unknown), // Singular or plural
695                    ("him", Gender::Masculine, Number::Singular),
696                    ("her", Gender::Feminine, Number::Singular),
697                    ("them", Gender::Unknown, Number::Unknown), // Singular or plural
698                    ("his", Gender::Masculine, Number::Singular),
699                    ("hers", Gender::Feminine, Number::Singular),
700                    ("its", Gender::Neutral, Number::Singular),
701                    ("their", Gender::Unknown, Number::Unknown), // Singular or plural
702                    ("theirs", Gender::Unknown, Number::Unknown),
703                    ("themself", Gender::Unknown, Number::Singular), // Explicitly singular
704                    ("themselves", Gender::Unknown, Number::Plural), // Explicitly plural
705                    // Third-person reflexives
706                    ("himself", Gender::Masculine, Number::Singular),
707                    ("herself", Gender::Feminine, Number::Singular),
708                    ("itself", Gender::Neutral, Number::Singular),
709                    // First-person pronouns
710                    ("i", Gender::Unknown, Number::Singular),
711                    ("me", Gender::Unknown, Number::Singular),
712                    ("my", Gender::Unknown, Number::Singular),
713                    ("mine", Gender::Unknown, Number::Singular),
714                    ("myself", Gender::Unknown, Number::Singular),
715                    ("we", Gender::Unknown, Number::Plural),
716                    ("us", Gender::Unknown, Number::Plural),
717                    ("our", Gender::Unknown, Number::Plural),
718                    ("ours", Gender::Unknown, Number::Plural),
719                    ("ourselves", Gender::Unknown, Number::Plural),
720                    ("you", Gender::Unknown, Number::Unknown), // Singular or plural
721                    ("your", Gender::Unknown, Number::Unknown),
722                    ("yours", Gender::Unknown, Number::Unknown),
723                    ("yourself", Gender::Unknown, Number::Singular),
724                    ("yourselves", Gender::Unknown, Number::Plural),
725                    // Neopronouns: ze/hir set
726                    ("ze", Gender::Unknown, Number::Singular),
727                    ("hir", Gender::Unknown, Number::Singular),
728                    ("hirs", Gender::Unknown, Number::Singular),
729                    ("hirself", Gender::Unknown, Number::Singular),
730                    // Neopronouns: xe/xem set
731                    ("xe", Gender::Unknown, Number::Singular),
732                    ("xem", Gender::Unknown, Number::Singular),
733                    ("xyr", Gender::Unknown, Number::Singular),
734                    ("xyrs", Gender::Unknown, Number::Singular),
735                    ("xemself", Gender::Unknown, Number::Singular),
736                    // Neopronouns: e/em (Spivak) set
737                    ("ey", Gender::Unknown, Number::Singular), // Also spelled "e"
738                    ("em", Gender::Unknown, Number::Singular),
739                    ("eir", Gender::Unknown, Number::Singular),
740                    ("eirs", Gender::Unknown, Number::Singular),
741                    ("emself", Gender::Unknown, Number::Singular),
742                    // Neopronouns: fae/faer set
743                    ("fae", Gender::Unknown, Number::Singular),
744                    ("faer", Gender::Unknown, Number::Singular),
745                    ("faers", Gender::Unknown, Number::Singular),
746                    ("faerself", Gender::Unknown, Number::Singular),
747                    // Demonstrative pronouns
748                    ("this", Gender::Unknown, Number::Singular),
749                    ("that", Gender::Unknown, Number::Singular),
750                    ("these", Gender::Unknown, Number::Plural),
751                    ("those", Gender::Unknown, Number::Plural),
752                    // Indefinite pronouns
753                    ("someone", Gender::Unknown, Number::Singular),
754                    ("somebody", Gender::Unknown, Number::Singular),
755                    ("anyone", Gender::Unknown, Number::Singular),
756                    ("anybody", Gender::Unknown, Number::Singular),
757                    ("everyone", Gender::Unknown, Number::Singular), // Grammatically singular
758                    ("everybody", Gender::Unknown, Number::Singular),
759                    ("no one", Gender::Unknown, Number::Singular),
760                    ("nobody", Gender::Unknown, Number::Singular),
761                    // Impersonal "one"
762                    ("one", Gender::Unknown, Number::Singular),
763                    ("oneself", Gender::Unknown, Number::Singular),
764                    // Interrogative/relative pronouns
765                    ("who", Gender::Unknown, Number::Unknown),
766                    ("whom", Gender::Unknown, Number::Unknown),
767                    ("whose", Gender::Unknown, Number::Unknown),
768                    ("which", Gender::Unknown, Number::Unknown),
769                    // Reciprocal pronouns
770                    ("each other", Gender::Unknown, Number::Plural),
771                    ("one another", Gender::Unknown, Number::Plural),
772                ]
773            }
774        }
775    }
776
777    /// Detect mentions using NER or heuristics.
778    fn detect_mentions(&self, text: &str) -> Result<Vec<RankedMention>> {
779        let mut mentions = Vec::new();
780
781        // Use NER if available
782        if let Some(ref ner) = self.ner {
783            let entities = ner.extract_entities(text, None)?;
784            for entity in entities {
785                mentions.push(RankedMention {
786                    start: entity.start,
787                    end: entity.end,
788                    text: entity.text.clone(),
789                    mention_type: MentionType::Proper,
790                    gender: None,
791                    number: None,
792                    head: self.get_head(&entity.text),
793                });
794            }
795        }
796
797        // Also detect pronouns via pattern matching
798        //
799        // Note on singular "they": English has used singular they since the 14th century
800        // (Chaucer, Shakespeare, Jane Austen). It's standard for:
801        // 1. Non-binary individuals ("Alex said they would come")
802        // 2. Unknown/generic referents ("Someone left their umbrella")
803        // 3. Formal contexts avoiding gendered assumptions
804        //
805        // Therefore, they/them/their use Number::Unknown, not Plural.
806        // The coreference scorer handles this by not penalizing Unknown mismatches.
807        //
808        // Neopronouns (ze/hir, xe/xem, e/em Spivak, etc.) are third-person singular
809        // pronouns used for gender-neutral or nonbinary reference. They behave
810        // grammatically as singular and use Gender::Unknown since they explicitly
811        // Get language-specific pronoun patterns
812        // Use language from config, fallback to English
813        let pronoun_patterns = self.get_pronoun_patterns();
814
815        // =========================================================================
816        // KNOWN GAPS / FUTURE WORK (documented for linguistic completeness):
817        // =========================================================================
818        //
819        // 1. CATAPHORA (forward reference):
820        //    "Before she arrived, Mary called ahead."
821        //    Current: Only backward (anaphoric) reference is modeled.
822        //    Fix: Would require looking ahead in discourse.
823        //
824        // 2. SPLIT ANTECEDENTS:
825        //    "John went to the store. Mary went to the bank. They met for lunch."
826        //    Current: "They" would need to link to BOTH John and Mary.
827        //    Fix: Cluster merging based on plural pronoun + multiple candidates.
828        //
829        // 3. BRIDGING ANAPHORA:
830        //    "I bought a car. The engine was faulty."
831        //    Current: "The engine" has no explicit antecedent.
832        //    Fix: Requires world knowledge (car has engine).
833        //
834        // 4. APPOSITIVE CONSTRUCTIONS:
835        //    "John, the baker, opened his shop."
836        //    Current: Would detect "John" and "the baker" as separate mentions.
837        //    Fix: Need to recognize appositive structure and link them.
838        //
839        // 5. COPULA CONSTRUCTIONS:
840        //    "The CEO is John Smith."
841        //    Current: Separate mentions, may not link.
842        //    Fix: Special handling for "X is Y" patterns (see is_be_phrase_link).
843        //
844        // 6. PRO-DROP LANGUAGES (Spanish, Italian, Japanese):
845        //    Subject pronouns can be omitted: "∅ llegué tarde" = "I arrived late"
846        //    Current: Only works with overt pronouns.
847        //    Fix: Verb morphology analysis, zero pronoun detection.
848        //
849        // 7. BINDING THEORY CONSTRAINTS:
850        //    Reflexives must be locally bound: "John saw himself" (same clause)
851        //    Pronouns must NOT be locally bound: "John saw him" (different entity)
852        //    Current: Not enforced - all candidates scored equally.
853        //    Fix: Syntactic parsing to identify clause boundaries.
854        //
855        // 8. ANIMACY CONSTRAINTS:
856        //    "The rock fell. *It/*He was heavy."
857        //    Current: Basic gender/number matching only.
858        //    Fix: Animacy feature extraction from entity type or lexicon.
859        //
860        // =========================================================================
861        // EXOTIC LINGUISTIC PHENOMENA (beyond standard English):
862        // =========================================================================
863        //
864        // 9. CLUSIVITY (inclusive vs exclusive "we"):
865        //    Many languages (Austronesian, Dravidian, Algonquian) distinguish:
866        //    - Inclusive: speaker + addressee ("you and I")
867        //    - Exclusive: speaker + others, NOT addressee ("me and them, not you")
868        //    Current: Not modeled. English "we" is ambiguous.
869        //
870        // 10. OBVIATION (Algonquian "fourth person"):
871        //     Distinguishes proximate (topical) vs obviative (less topical) 3rd person.
872        //     "He_PROX saw him_OBV" = unambiguous reference to two different entities.
873        //     Current: No support for discourse-level topicality tracking.
874        //
875        // 11. SWITCH-REFERENCE:
876        //     Clausal markers indicating whether subject is same/different from prior clause.
877        //     "He went home and-SAME_SUBJ ate" vs "He went home and-DIFF_SUBJ she cooked"
878        //     Current: No syntactic clause analysis.
879        //
880        // 12. LOGOPHORIC PRONOUNS (West African languages like Ewe, Yoruba):
881        //     Special pronoun for "the person whose speech/thought is being reported"
882        //     "Kofi said that LOG will win" (LOG = Kofi, unambiguously)
883        //     Current: No perspective/attitude holder tracking.
884        //
885        // 13. CORRELATIVE-RELATIVE (Sanskrit, Hindi):
886        //     "ya- ... sa-" pattern: relative clause first, then demonstrative resumes.
887        //     "Who(ever) came, that-one ate" = explicit cross-clause coreference.
888        //     Current: Only backward anaphora modeled.
889        //
890        // 14. NOUN CLASS SYSTEMS (Bantu, Dyirbal):
891        //     10-20+ "genders" based on semantics (human, animal, plant, tool, etc.)
892        //     Pronouns agree with noun class, not biological sex.
893        //     Current: Only masc/fem/neut gender, not full noun class agreement.
894        //
895        // 15. SHAPE-BASED CLASSIFIERS (Navajo, Chinese classifiers):
896        //     Verbs/pronouns encode physical properties (long, flat, round, granular).
897        //     Current: No shape/classifier feature tracking.
898        //
899        // 16. TRIAL/PAUCAL NUMBER (Austronesian):
900        //     Some languages distinguish: singular, dual, trial (exactly 3), paucal (few).
901        //     Current: Only sg/du/pl/unknown in Number enum.
902        //
903        // 17. HONORIFIC/POLITENESS LEVELS (Japanese, Korean, Thai):
904        //     Pronoun choice encodes social relationship, not just person/number.
905        //     "Watashi" vs "boku" vs "ore" (Japanese 1st person, different registers).
906        //     Current: No formality/register tracking.
907        //
908        // =========================================================================
909        // INFORMATION-THEORETIC VIEW:
910        // =========================================================================
911        //
912        // Coreference resolution can be framed as entropy reduction:
913        // - H(Antecedent | Context) = uncertainty over which entity a pronoun refers to
914        // - Good discourse makes pronouns low-entropy (context narrows candidates)
915        // - Surprisal of choosing antecedent a = -log p(a | Context)
916        // - Each resolved anaphor yields information gain: H(A) - H(A | Context)
917        //
918        // Features like recency, grammatical role, semantic compatibility all
919        // increase mutual information I(Antecedent; Context).
920        //
921
922        // Find pronouns in text
923        let text_lower = text.to_lowercase();
924        let text_chars: Vec<char> = text.chars().collect();
925        for (pronoun, gender, number) in pronoun_patterns {
926            let mut search_start_byte = 0;
927            while let Some(pos) = text_lower[search_start_byte..].find(pronoun) {
928                let abs_byte_pos = search_start_byte + pos;
929                let end_byte_pos = abs_byte_pos + pronoun.len();
930
931                // Convert byte positions to character positions for boundary checks
932                let char_pos = text[..abs_byte_pos].chars().count();
933                let end_char_pos = char_pos + pronoun.chars().count();
934
935                // Check word boundaries using character positions
936                let is_word_start = char_pos == 0
937                    || match text_chars.get(char_pos.saturating_sub(1)) {
938                        None => true,
939                        Some(c) => !c.is_alphanumeric(),
940                    };
941                let is_word_end = end_char_pos >= text_chars.len()
942                    || match text_chars.get(end_char_pos) {
943                        None => true,
944                        Some(c) => !c.is_alphanumeric(),
945                    };
946
947                if is_word_start && is_word_end {
948                    // Skip pleonastic "it" (non-referential uses)
949                    // See: Boyd et al. "Identification of Pleonastic It Using the Web"
950                    if pronoun == "it" && self.is_pleonastic_it(&text_lower, abs_byte_pos) {
951                        search_start_byte = end_byte_pos;
952                        continue;
953                    }
954
955                    // Use character offsets for the mention
956                    let char_start = char_pos;
957                    let char_end = end_char_pos;
958
959                    mentions.push(RankedMention {
960                        start: char_start,
961                        end: char_end,
962                        text: text[abs_byte_pos..end_byte_pos].to_string(),
963                        mention_type: MentionType::Pronominal,
964                        gender: Some(gender),
965                        number: Some(number),
966                        head: pronoun.to_string(),
967                    });
968                }
969
970                search_start_byte = end_byte_pos;
971            }
972        }
973
974        // Detect proper nouns (capitalized words not at sentence start)
975        let words: Vec<_> = text.split_whitespace().collect();
976        let mut search_byte_pos = 0; // Byte position for searching
977
978        for (i, word) in words.iter().enumerate() {
979            // Skip if at sentence start
980            let at_sentence_start = i == 0
981                || match text[..text.find(word).unwrap_or(0)].chars().last() {
982                    None => true,
983                    Some(c) => c == '.' || c == '!' || c == '?',
984                };
985
986            if !at_sentence_start
987                && word.chars().next().is_some_and(|c| c.is_uppercase())
988                && word.chars().count() > 1
989            // Use chars().count() for Unicode
990            {
991                // Find byte position of word
992                if let Some(rel_byte_pos) = text[search_byte_pos..].find(word) {
993                    let abs_byte_pos = search_byte_pos + rel_byte_pos;
994                    // Convert byte offset to character offset for Entity
995                    let char_start = text[..abs_byte_pos].chars().count();
996                    let char_end = char_start + word.chars().count();
997
998                    mentions.push(RankedMention {
999                        start: char_start,
1000                        end: char_end,
1001                        text: word.to_string(),
1002                        mention_type: MentionType::Proper,
1003                        gender: None,
1004                        number: Some(Number::Singular),
1005                        head: word.to_string(),
1006                    });
1007                }
1008            }
1009
1010            search_byte_pos += word.len() + 1; // +1 for space (byte-based)
1011        }
1012
1013        // Detect nominal adjectives (J2N: arXiv:2409.14374)
1014        // Phrases like "the poor", "the elderly" function as plural noun phrases.
1015        //
1016        // MULTILINGUAL: Supports English, German, French, Spanish patterns.
1017        // - German: "die Armen" (the poor), "die Reichen" (the rich)
1018        // - French: "les pauvres", "les riches"
1019        // - Spanish: "los pobres", "los ricos"
1020        // - Arabic and Japanese use different patterns not yet supported.
1021        if self.config.enable_nominal_adjective_detection {
1022            // Adjectives that commonly function as nouns when preceded by determiners.
1023            // These refer to groups of people and are grammatically plural.
1024            const NOMINALIZED_ADJECTIVES: &[&str] = &[
1025                // Socioeconomic status
1026                "poor",
1027                "rich",
1028                "wealthy",
1029                "homeless",
1030                "unemployed",
1031                "employed",
1032                // Age
1033                "young",
1034                "old",
1035                "elderly",
1036                "aged",
1037                // Health and physical state
1038                "sick",
1039                "ill",
1040                "healthy",
1041                "wounded",
1042                "injured",
1043                "disabled",
1044                "blind",
1045                "deaf",
1046                // Life state
1047                "dead",
1048                "living",
1049                "deceased",
1050                // Legal/social status
1051                "accused",
1052                "condemned",
1053                "convicted",
1054                "guilty",
1055                "innocent",
1056                "insured",
1057                "uninsured",
1058                // Education/ability
1059                "gifted",
1060                "talented",
1061                "educated",
1062                "literate",
1063                "illiterate",
1064                // Power dynamics
1065                "powerful",
1066                "powerless",
1067                "oppressed",
1068                "weak",
1069                "famous",
1070                "infamous",
1071                // Moral/religious (common in literary texts)
1072                "righteous",
1073                "wicked",
1074                "blessed",
1075                "damned",
1076                "faithful",
1077                // Other common cases
1078                "hungry",
1079                "needy",
1080                "privileged",
1081                "underprivileged",
1082                "disadvantaged",
1083                "marginalized",
1084            ];
1085
1086            // =========================================================================
1087            // Language-specific nominal adjective patterns
1088            // =========================================================================
1089
1090            // Get determiners and adjectives for the configured language
1091            let (determiners, adjectives): (Vec<&str>, Vec<&str>) =
1092                match self.config.language.as_str() {
1093                    "de" => {
1094                        // German: "die Armen", "die Reichen", etc.
1095                        // Note: German uses "die" (the) for plural nominalized adjectives
1096                        let dets = vec!["die ", "diese ", "jene "];
1097                        let adjs = vec![
1098                            "armen",
1099                            "reichen",
1100                            "alten",
1101                            "jungen",
1102                            "kranken",
1103                            "gesunden",
1104                            "toten",
1105                            "lebenden",
1106                            "blinden",
1107                            "tauben",
1108                            "arbeitslosen",
1109                            "obdachlosen",
1110                            "mächtigen",
1111                            "schwachen",
1112                            "unterdrückten",
1113                        ];
1114                        (dets, adjs)
1115                    }
1116                    "fr" => {
1117                        // French: "les pauvres", "les riches", etc.
1118                        let dets = vec!["les ", "ces "];
1119                        let adjs = vec![
1120                            "pauvres",
1121                            "riches",
1122                            "vieux",
1123                            "jeunes",
1124                            "malades",
1125                            "morts",
1126                            "vivants",
1127                            "aveugles",
1128                            "sourds",
1129                            "faibles",
1130                            "puissants",
1131                            "opprimés",
1132                            "affamés",
1133                            "marginalisés",
1134                        ];
1135                        (dets, adjs)
1136                    }
1137                    "es" => {
1138                        // Spanish: "los pobres", "los ricos", etc.
1139                        // Note: Spanish uses gender-marked articles (los/las)
1140                        let dets = vec!["los ", "las ", "estos ", "estas "];
1141                        let adjs = vec![
1142                            "pobres",
1143                            "ricos",
1144                            "viejos",
1145                            "jóvenes",
1146                            "enfermos",
1147                            "muertos",
1148                            "vivos",
1149                            "ciegos",
1150                            "sordos",
1151                            "débiles",
1152                            "poderosos",
1153                            "oprimidos",
1154                            "hambrientos",
1155                            "marginados",
1156                        ];
1157                        (dets, adjs)
1158                    }
1159                    _ => {
1160                        // English (default): "the poor", "the rich", etc.
1161                        let dets = vec!["the ", "these ", "those "];
1162                        (dets, NOMINALIZED_ADJECTIVES.to_vec())
1163                    }
1164                };
1165
1166            for det in &determiners {
1167                for adj in &adjectives {
1168                    let pattern = format!("{}{}", det, adj);
1169                    let pattern_lower = pattern.to_lowercase();
1170
1171                    let mut search_start = 0;
1172                    while let Some(rel_pos) = text_lower[search_start..].find(&pattern_lower) {
1173                        let abs_byte_pos = search_start + rel_pos;
1174                        let end_byte_pos = abs_byte_pos + pattern.len();
1175
1176                        // Check that the adjective isn't modifying a following noun.
1177                        // "the poor performance" should NOT match because "poor" modifies "performance".
1178                        // But "the poor are struggling" SHOULD match because "poor" is nominalized.
1179                        //
1180                        // Heuristic: If followed by a verb, conjunction, or sentence boundary,
1181                        // it's likely a nominal adjective. If followed by a noun/adjective, it's not.
1182                        let following_text = &text_lower[end_byte_pos..];
1183                        let next_word: String = following_text
1184                            .chars()
1185                            .skip_while(|c| c.is_whitespace())
1186                            .take_while(|c| c.is_alphabetic())
1187                            .collect();
1188
1189                        // Words that can follow a nominal adjective (language-specific)
1190                        let valid_followers: Vec<&str> = match self.config.language.as_str() {
1191                            "de" => vec![
1192                                // German verbs
1193                                "sind", "waren", "haben", "hatten", "werden", "wurden", "brauchen",
1194                                "müssen", "können", "sollen", "wollen", // Conjunctions
1195                                "und", "oder", "aber", "die", "welche",
1196                            ],
1197                            "fr" => vec![
1198                                // French verbs
1199                                "sont",
1200                                "étaient",
1201                                "ont",
1202                                "avaient",
1203                                "seront",
1204                                "peuvent",
1205                                "doivent",
1206                                "veulent",
1207                                "méritent",
1208                                // Conjunctions
1209                                "et",
1210                                "ou",
1211                                "mais",
1212                                "qui",
1213                                "que",
1214                            ],
1215                            "es" => vec![
1216                                // Spanish verbs
1217                                "son",
1218                                "eran",
1219                                "tienen",
1220                                "tenían",
1221                                "serán",
1222                                "pueden",
1223                                "deben",
1224                                "quieren",
1225                                "merecen",
1226                                "necesitan",
1227                                "sufren",
1228                                "luchan",
1229                                "reciben",
1230                                "buscan",
1231                                // Conjunctions
1232                                "y",
1233                                "o",
1234                                "pero",
1235                                "que",
1236                                "quienes",
1237                            ],
1238                            _ => vec![
1239                                // English (default)
1240                                "are", "were", "is", "was", "be", "been", "being", "have", "has",
1241                                "had", "having", "do", "does", "did", "can", "could", "will",
1242                                "would", "shall", "should", "may", "might", "must", "need", "want",
1243                                "get", "got", "struggle", "suffer", "deserve", "receive", "face",
1244                                "lack", "seek", "and", "or", "but", "who", "whom", "whose", "that",
1245                                "which", "in", "of", "from", "with", "without", "among",
1246                            ],
1247                        };
1248
1249                        // Valid if: no next word, starts with punct, or next word is allowed
1250                        let is_valid_nominal =
1251                            next_word.is_empty() || valid_followers.contains(&next_word.as_str());
1252
1253                        if is_valid_nominal {
1254                            // Convert byte positions to character positions
1255                            let char_start = text[..abs_byte_pos].chars().count();
1256                            let char_end = char_start + pattern.chars().count();
1257
1258                            mentions.push(RankedMention {
1259                                start: char_start,
1260                                end: char_end,
1261                                text: text[abs_byte_pos..end_byte_pos].to_string(),
1262                                mention_type: MentionType::Nominal,
1263                                gender: Some(Gender::Unknown), // Groups are gender-neutral
1264                                number: Some(Number::Plural),  // Grammatically plural
1265                                head: adj.to_string(),         // Head is the adjective
1266                            });
1267                        }
1268
1269                        search_start = end_byte_pos;
1270                    }
1271                }
1272            }
1273        }
1274
1275        // Deduplicate overlapping mentions (prefer longer/earlier)
1276        mentions.sort_by_key(|m| (m.start, std::cmp::Reverse(m.end)));
1277        let mut deduped = Vec::new();
1278        let mut covered_end = 0;
1279
1280        for mention in mentions {
1281            if mention.start >= covered_end {
1282                covered_end = mention.end;
1283                deduped.push(mention);
1284            }
1285        }
1286
1287        Ok(deduped)
1288    }
1289
1290    /// Extract additional features for a mention.
1291    fn extract_features(&self, mention: &mut RankedMention) {
1292        // Infer gender from proper nouns
1293        if mention.gender.is_none() && mention.mention_type == MentionType::Proper {
1294            mention.gender = self.guess_gender(&mention.text);
1295        }
1296
1297        // Infer number
1298        if mention.number.is_none() {
1299            mention.number = Some(Number::Singular); // Default
1300        }
1301    }
1302
1303    /// Guess gender from a proper noun.
1304    fn guess_gender(&self, text: &str) -> Option<Gender> {
1305        let masc_names = [
1306            "john", "james", "michael", "david", "robert", "william", "richard",
1307        ];
1308        let fem_names = [
1309            "mary",
1310            "jennifer",
1311            "lisa",
1312            "sarah",
1313            "jessica",
1314            "emily",
1315            "elizabeth",
1316        ];
1317
1318        let first_word = text.split_whitespace().next()?.to_lowercase();
1319
1320        if masc_names.contains(&first_word.as_str()) {
1321            Some(Gender::Masculine)
1322        } else if fem_names.contains(&first_word.as_str()) {
1323            Some(Gender::Feminine)
1324        } else {
1325            None
1326        }
1327    }
1328
1329    /// Get head word of a mention.
1330    fn get_head(&self, text: &str) -> String {
1331        // Simple heuristic: last word is head
1332        text.split_whitespace().last().unwrap_or(text).to_string()
1333    }
1334
1335    /// Link mentions to antecedents and form clusters.
1336    ///
1337    /// # Arguments
1338    ///
1339    /// * `mentions` - Detected mentions sorted by position
1340    /// * `text` - Source text for context-aware features (i2b2-inspired)
1341    fn link_mentions(&self, mentions: &[RankedMention], text: &str) -> Vec<MentionCluster> {
1342        match self.config.clustering_strategy {
1343            ClusteringStrategy::LeftToRight => self.link_mentions_left_to_right(mentions, text),
1344            ClusteringStrategy::EasyFirst => self.link_mentions_easy_first(mentions, text),
1345        }
1346    }
1347
1348    /// Traditional left-to-right clustering.
1349    fn link_mentions_left_to_right(
1350        &self,
1351        mentions: &[RankedMention],
1352        text: &str,
1353    ) -> Vec<MentionCluster> {
1354        let mut mention_to_cluster: HashMap<usize, usize> = HashMap::new();
1355        let mut clusters: Vec<Vec<usize>> = Vec::new();
1356
1357        for (i, mention) in mentions.iter().enumerate() {
1358            let mut best_antecedent: Option<usize> = None;
1359            let mut best_score = self.config.link_threshold;
1360
1361            // Type-specific antecedent limit
1362            let max_antecedents = self.config.max_antecedents_for_type(mention.mention_type);
1363
1364            // Score against previous mentions with type-specific limit
1365            for j in (0..i).rev().take(max_antecedents) {
1366                let antecedent = &mentions[j];
1367
1368                // Also check character distance as a fallback
1369                let distance = mention.start.saturating_sub(antecedent.end);
1370                if distance > self.config.max_distance {
1371                    break;
1372                }
1373
1374                let score = self.score_pair(mention, antecedent, distance, Some(text));
1375                if score > best_score {
1376                    best_score = score;
1377                    best_antecedent = Some(j);
1378                }
1379            }
1380
1381            if let Some(ant_idx) = best_antecedent {
1382                // Link to antecedent's cluster
1383                if let Some(&cluster_id) = mention_to_cluster.get(&ant_idx) {
1384                    clusters[cluster_id].push(i);
1385                    mention_to_cluster.insert(i, cluster_id);
1386                } else {
1387                    // New cluster
1388                    let cluster_id = clusters.len();
1389                    clusters.push(vec![ant_idx, i]);
1390                    mention_to_cluster.insert(ant_idx, cluster_id);
1391                    mention_to_cluster.insert(i, cluster_id);
1392                }
1393            }
1394        }
1395
1396        // Apply global proper noun coreference if enabled
1397        let clusters = if self.config.enable_global_proper_coref {
1398            self.apply_global_proper_coref(mentions, clusters)
1399        } else {
1400            clusters
1401        };
1402
1403        // Convert to MentionCluster
1404        clusters
1405            .into_iter()
1406            .enumerate()
1407            .map(|(id, indices)| MentionCluster {
1408                id,
1409                mentions: indices.into_iter().map(|i| mentions[i].clone()).collect(),
1410            })
1411            .collect()
1412    }
1413
1414    /// Easy-first clustering: process high-confidence decisions first.
1415    ///
1416    /// Based on Clark & Manning (2016) and Bourgois & Poibeau (2025).
1417    /// High-confidence decisions constrain later decisions.
1418    fn link_mentions_easy_first(
1419        &self,
1420        mentions: &[RankedMention],
1421        text: &str,
1422    ) -> Vec<MentionCluster> {
1423        // Step 1: Compute all pairwise scores
1424        let mut scored_pairs: Vec<ScoredPair> = Vec::new();
1425        let mut non_coref_pairs: HashSet<(usize, usize)> = HashSet::new();
1426
1427        for (i, mention) in mentions.iter().enumerate() {
1428            let max_antecedents = self.config.max_antecedents_for_type(mention.mention_type);
1429
1430            for j in (0..i).rev().take(max_antecedents) {
1431                let antecedent = &mentions[j];
1432                let distance = mention.start.saturating_sub(antecedent.end);
1433                if distance > self.config.max_distance {
1434                    break;
1435                }
1436
1437                let score = self.score_pair(mention, antecedent, distance, Some(text));
1438
1439                // Track non-coreference constraints
1440                if self.config.use_non_coref_constraints && score < self.config.non_coref_threshold
1441                {
1442                    // Check for coordinating conjunction pattern
1443                    // (mentions connected by "and"/"or" are likely non-coreferent)
1444                    non_coref_pairs.insert((j.min(i), j.max(i)));
1445                }
1446
1447                if score > self.config.link_threshold {
1448                    scored_pairs.push(ScoredPair {
1449                        mention_idx: i,
1450                        antecedent_idx: j,
1451                        score,
1452                    });
1453                }
1454            }
1455        }
1456
1457        // Step 2: Sort by confidence (highest first)
1458        scored_pairs.sort_by(|a, b| {
1459            b.score
1460                .partial_cmp(&a.score)
1461                .unwrap_or(std::cmp::Ordering::Equal)
1462        });
1463
1464        // Step 3: Process in confidence order, respecting constraints
1465        let mut mention_to_cluster: HashMap<usize, usize> = HashMap::new();
1466        let mut clusters: Vec<Vec<usize>> = Vec::new();
1467        let mut processed: HashSet<usize> = HashSet::new();
1468
1469        for pair in scored_pairs {
1470            // Skip if mention already has an antecedent
1471            if processed.contains(&pair.mention_idx) {
1472                continue;
1473            }
1474
1475            // Check non-coreference constraint
1476            let key = (
1477                pair.antecedent_idx.min(pair.mention_idx),
1478                pair.antecedent_idx.max(pair.mention_idx),
1479            );
1480            if self.config.use_non_coref_constraints && non_coref_pairs.contains(&key) {
1481                continue;
1482            }
1483
1484            // Check cluster-level constraint: would this merge violate any non-coref?
1485            let would_violate = if self.config.use_non_coref_constraints {
1486                self.would_violate_constraint(
1487                    pair.mention_idx,
1488                    pair.antecedent_idx,
1489                    &mention_to_cluster,
1490                    &clusters,
1491                    &non_coref_pairs,
1492                )
1493            } else {
1494                false
1495            };
1496
1497            if would_violate {
1498                continue;
1499            }
1500
1501            // Link mention to antecedent's cluster
1502            processed.insert(pair.mention_idx);
1503
1504            if let Some(&cluster_id) = mention_to_cluster.get(&pair.antecedent_idx) {
1505                clusters[cluster_id].push(pair.mention_idx);
1506                mention_to_cluster.insert(pair.mention_idx, cluster_id);
1507            } else {
1508                let cluster_id = clusters.len();
1509                clusters.push(vec![pair.antecedent_idx, pair.mention_idx]);
1510                mention_to_cluster.insert(pair.antecedent_idx, cluster_id);
1511                mention_to_cluster.insert(pair.mention_idx, cluster_id);
1512            }
1513        }
1514
1515        // Apply global proper noun coreference if enabled
1516        let clusters = if self.config.enable_global_proper_coref {
1517            self.apply_global_proper_coref(mentions, clusters)
1518        } else {
1519            clusters
1520        };
1521
1522        // Convert to MentionCluster
1523        clusters
1524            .into_iter()
1525            .enumerate()
1526            .map(|(id, indices)| MentionCluster {
1527                id,
1528                mentions: indices.into_iter().map(|i| mentions[i].clone()).collect(),
1529            })
1530            .collect()
1531    }
1532
1533    /// Check if linking would violate non-coreference constraints.
1534    fn would_violate_constraint(
1535        &self,
1536        mention_idx: usize,
1537        antecedent_idx: usize,
1538        mention_to_cluster: &HashMap<usize, usize>,
1539        clusters: &[Vec<usize>],
1540        non_coref_pairs: &HashSet<(usize, usize)>,
1541    ) -> bool {
1542        // Get cluster members that would be merged
1543        let mut members = vec![mention_idx];
1544        if let Some(&cluster_id) = mention_to_cluster.get(&antecedent_idx) {
1545            members.extend(clusters[cluster_id].iter().copied());
1546        } else {
1547            members.push(antecedent_idx);
1548        }
1549
1550        // Check all pairs in merged cluster for violations
1551        for i in 0..members.len() {
1552            for j in (i + 1)..members.len() {
1553                let key = (members[i].min(members[j]), members[i].max(members[j]));
1554                if non_coref_pairs.contains(&key) {
1555                    return true;
1556                }
1557            }
1558        }
1559
1560        false
1561    }
1562
1563    /// Apply global proper noun coreference propagation.
1564    ///
1565    /// For each pair of proper nouns that are locally predicted coreferent,
1566    /// propagate this decision to all document-wide pairs involving those strings.
1567    fn apply_global_proper_coref(
1568        &self,
1569        mentions: &[RankedMention],
1570        mut clusters: Vec<Vec<usize>>,
1571    ) -> Vec<Vec<usize>> {
1572        // Collect proper noun clusters and their normalized forms
1573        let mut proper_to_cluster: HashMap<String, usize> = HashMap::new();
1574        let mut cluster_to_propers: HashMap<usize, Vec<String>> = HashMap::new();
1575
1576        for (cluster_idx, cluster) in clusters.iter().enumerate() {
1577            for &mention_idx in cluster {
1578                let mention = &mentions[mention_idx];
1579                if mention.mention_type == MentionType::Proper {
1580                    let normalized = mention.text.to_lowercase();
1581                    proper_to_cluster.insert(normalized.clone(), cluster_idx);
1582                    cluster_to_propers
1583                        .entry(cluster_idx)
1584                        .or_default()
1585                        .push(normalized);
1586                }
1587            }
1588        }
1589
1590        // Find all proper mentions not yet clustered
1591        let mut unclustered_propers: Vec<(usize, String)> = Vec::new();
1592        let mut mention_to_cluster: HashMap<usize, usize> = HashMap::new();
1593
1594        for (cluster_idx, cluster) in clusters.iter().enumerate() {
1595            for &mention_idx in cluster {
1596                mention_to_cluster.insert(mention_idx, cluster_idx);
1597            }
1598        }
1599
1600        for (i, mention) in mentions.iter().enumerate() {
1601            if mention.mention_type == MentionType::Proper && !mention_to_cluster.contains_key(&i) {
1602                unclustered_propers.push((i, mention.text.to_lowercase()));
1603            }
1604        }
1605
1606        // Link unclustered proper nouns to matching clusters
1607        for (mention_idx, normalized) in unclustered_propers {
1608            if let Some(&cluster_idx) = proper_to_cluster.get(&normalized) {
1609                clusters[cluster_idx].push(mention_idx);
1610            }
1611        }
1612
1613        // Merge clusters that share proper noun strings
1614        // This handles cases like "Sir Ralph Brown" and "Raphael" being in same cluster
1615        let mut merged = vec![false; clusters.len()];
1616        let mut merge_map: HashMap<usize, usize> = HashMap::new();
1617
1618        for (idx, cluster) in clusters.iter().enumerate() {
1619            if merged[idx] {
1620                continue;
1621            }
1622
1623            let propers: Vec<_> = cluster
1624                .iter()
1625                .filter_map(|&i| {
1626                    let m = &mentions[i];
1627                    if m.mention_type == MentionType::Proper {
1628                        Some(m.text.to_lowercase())
1629                    } else {
1630                        None
1631                    }
1632                })
1633                .collect();
1634
1635            // Find other clusters with matching propers
1636            for (other_idx, other_cluster) in clusters.iter().enumerate() {
1637                if other_idx <= idx || merged[other_idx] {
1638                    continue;
1639                }
1640
1641                let other_propers: Vec<_> = other_cluster
1642                    .iter()
1643                    .filter_map(|&i| {
1644                        let m = &mentions[i];
1645                        if m.mention_type == MentionType::Proper {
1646                            Some(m.text.to_lowercase())
1647                        } else {
1648                            None
1649                        }
1650                    })
1651                    .collect();
1652
1653                // Check for overlap
1654                if propers.iter().any(|p| other_propers.contains(p)) {
1655                    merged[other_idx] = true;
1656                    merge_map.insert(other_idx, idx);
1657                }
1658            }
1659        }
1660
1661        // Apply merges
1662        if !merge_map.is_empty() {
1663            let mut final_clusters: Vec<Vec<usize>> = Vec::new();
1664            let mut old_to_new: HashMap<usize, usize> = HashMap::new();
1665
1666            for (old_idx, cluster) in clusters.into_iter().enumerate() {
1667                if merged[old_idx] {
1668                    // Find target cluster
1669                    let mut target = merge_map[&old_idx];
1670                    while let Some(&next) = merge_map.get(&target) {
1671                        target = next;
1672                    }
1673                    if let Some(&new_idx) = old_to_new.get(&target) {
1674                        final_clusters[new_idx].extend(cluster);
1675                    }
1676                } else {
1677                    let new_idx = final_clusters.len();
1678                    old_to_new.insert(old_idx, new_idx);
1679                    final_clusters.push(cluster);
1680                }
1681            }
1682
1683            final_clusters
1684        } else {
1685            clusters
1686        }
1687    }
1688
1689    /// Score a (mention, antecedent) pair.
1690    ///
1691    /// # Arguments
1692    ///
1693    /// * `mention` - The anaphor being resolved
1694    /// * `antecedent` - Candidate antecedent
1695    /// * `distance` - Character distance between mentions
1696    /// * `text` - Optional source text for context-aware features
1697    fn score_pair(
1698        &self,
1699        mention: &RankedMention,
1700        antecedent: &RankedMention,
1701        distance: usize,
1702        text: Option<&str>,
1703    ) -> f64 {
1704        let mut score = 0.0;
1705
1706        // =========================================================================
1707        // i2b2-inspired context filtering (Chen et al. 2011)
1708        // Check this first - if context filtering rejects the pair, return low score
1709        // =========================================================================
1710        if self.config.enable_context_filtering {
1711            if let Some(txt) = text {
1712                if self.should_filter_by_context(txt, mention, antecedent) {
1713                    return -1.0; // Strong negative signal to reject this pair
1714                }
1715            }
1716        }
1717
1718        // =========================================================================
1719        // String match features
1720        // =========================================================================
1721        let m_lower = mention.text.to_lowercase();
1722        let a_lower = antecedent.text.to_lowercase();
1723
1724        // Exact match
1725        if m_lower == a_lower {
1726            score += self.config.string_match_weight * 1.0;
1727        }
1728        // Head match
1729        else if mention.head.to_lowercase() == antecedent.head.to_lowercase() {
1730            score += self.config.string_match_weight * 0.6;
1731        }
1732        // Substring
1733        else if m_lower.contains(&a_lower) || a_lower.contains(&m_lower) {
1734            score += self.config.string_match_weight * 0.3;
1735        }
1736
1737        // =========================================================================
1738        // i2b2-inspired "be phrase" detection (Chen et al. 2011)
1739        // "Resolution of X is Y" → X and Y are coreferent
1740        // =========================================================================
1741        if self.config.enable_be_phrase_detection {
1742            if let Some(txt) = text {
1743                if self.is_be_phrase_link(txt, mention, antecedent) {
1744                    score += self.config.be_phrase_weight;
1745                }
1746            }
1747        }
1748
1749        // =========================================================================
1750        // i2b2-inspired acronym matching (Chen et al. 2011)
1751        // "MRSA" ↔ "Methicillin-resistant Staphylococcus aureus"
1752        // =========================================================================
1753        if self.config.enable_acronym_matching && self.is_acronym_match(mention, antecedent) {
1754            score += self.config.acronym_weight;
1755        }
1756
1757        // =========================================================================
1758        // i2b2-inspired synonym matching (Chen et al. 2011)
1759        // Uses UMLS concept matching in original; we use a basic synonym table
1760        // =========================================================================
1761        if self.config.enable_synonym_matching && self.are_synonyms(mention, antecedent) {
1762            score += self.config.synonym_weight;
1763        }
1764
1765        // =========================================================================
1766        // Type compatibility
1767        // =========================================================================
1768        match (mention.mention_type, antecedent.mention_type) {
1769            (MentionType::Pronominal, MentionType::Proper) => {
1770                score += self.config.type_compat_weight * 0.5;
1771            }
1772            (MentionType::Pronominal, MentionType::Pronominal) => {
1773                // Same pronoun
1774                if mention.text.to_lowercase() == antecedent.text.to_lowercase() {
1775                    score += self.config.type_compat_weight * 0.3;
1776                }
1777            }
1778            (MentionType::Proper, MentionType::Proper) => {
1779                score += self.config.type_compat_weight * 0.4;
1780            }
1781            _ => {}
1782        }
1783
1784        // =========================================================================
1785        // Gender agreement
1786        // =========================================================================
1787        if let (Some(m_gender), Some(a_gender)) = (mention.gender, antecedent.gender) {
1788            if m_gender == a_gender {
1789                score += self.config.type_compat_weight * 0.3;
1790            } else if m_gender != Gender::Unknown && a_gender != Gender::Unknown {
1791                score -= self.config.type_compat_weight * 0.5; // Penalty for mismatch
1792            }
1793        }
1794
1795        // =========================================================================
1796        // Number agreement
1797        //
1798        // Uses Number::is_compatible() from anno_core which handles:
1799        // - Unknown is compatible with anything (singular they, "you")
1800        // - Dual is compatible with Plural (Arabic/Hebrew/Sanskrit dual numbers)
1801        // - Exact matches are preferred
1802        // =========================================================================
1803        if let (Some(m_number), Some(a_number)) = (mention.number, antecedent.number) {
1804            if m_number == a_number {
1805                // Exact match: strongest bonus
1806                score += self.config.type_compat_weight * 0.2;
1807            } else if m_number.is_compatible(&a_number) {
1808                // Compatible but not exact (e.g., Unknown with Singular, Dual with Plural)
1809                // Small bonus - compatible but less certain
1810                score += self.config.type_compat_weight * 0.05;
1811            } else {
1812                // Incompatible numbers (e.g., Singular vs Plural)
1813                score -= self.config.type_compat_weight * 0.4;
1814            }
1815        }
1816
1817        // =========================================================================
1818        // Distance penalty
1819        // =========================================================================
1820        score -= self.config.distance_weight * (distance as f64).ln().max(0.0);
1821
1822        // =========================================================================
1823        // Salience boost
1824        // =========================================================================
1825        if self.config.salience_weight > 0.0 {
1826            let salience = self.get_salience(&antecedent.text);
1827            score += self.config.salience_weight * salience;
1828        }
1829
1830        score
1831    }
1832}
1833
1834impl Default for MentionRankingCoref {
1835    fn default() -> Self {
1836        Self::new()
1837    }
1838}
1839
1840// =============================================================================
1841// Integration with GroundedDocument (Signal → Track → Identity hierarchy)
1842// =============================================================================
1843
1844impl MentionRankingCoref {
1845    /// Resolve coreferences and produce Signals and Tracks for a GroundedDocument.
1846    ///
1847    /// This is the bridge between mention-ranking output and the canonical
1848    /// `Signal → Track → Identity` hierarchy in `anno-core::grounded`.
1849    ///
1850    /// # Returns
1851    ///
1852    /// A tuple of (signals, tracks) that can be added to a GroundedDocument:
1853    /// - `signals`: Individual mention detections with locations
1854    /// - `tracks`: Clusters of signals referring to the same entity
1855    ///
1856    /// # Example
1857    ///
1858    /// ```rust,ignore
1859    /// use anno::backends::mention_ranking::MentionRankingCoref;
1860    /// use anno_core::GroundedDocument;
1861    ///
1862    /// let coref = MentionRankingCoref::new();
1863    /// let (signals, tracks) = coref.resolve_to_grounded("John saw Mary. He waved.")?;
1864    ///
1865    /// let mut doc = GroundedDocument::new("doc1");
1866    /// for signal in signals {
1867    ///     doc.add_signal(signal);
1868    /// }
1869    /// for track in tracks {
1870    ///     doc.add_track(track);
1871    /// }
1872    /// ```
1873    pub fn resolve_to_grounded(
1874        &self,
1875        text: &str,
1876    ) -> Result<(
1877        Vec<anno_core::Signal<anno_core::Location>>,
1878        Vec<anno_core::Track>,
1879    )> {
1880        let clusters = self.resolve(text)?;
1881
1882        let mut all_signals = Vec::new();
1883        let mut all_tracks = Vec::new();
1884        let mut signal_id_offset = anno_core::SignalId::ZERO;
1885
1886        for cluster in clusters {
1887            let (track, signals) = cluster.to_track(signal_id_offset);
1888            signal_id_offset += signals.len() as u64;
1889            all_signals.extend(signals);
1890            all_tracks.push(track);
1891        }
1892
1893        Ok((all_signals, all_tracks))
1894    }
1895
1896    /// Resolve coreferences and add results directly to a GroundedDocument.
1897    ///
1898    /// This is a convenience method that calls `resolve_to_grounded` and
1899    /// adds the signals and tracks to the document.
1900    ///
1901    /// # Returns
1902    ///
1903    /// Vector of TrackIds for the created tracks.
1904    pub fn resolve_into_document(
1905        &self,
1906        text: &str,
1907        doc: &mut anno_core::GroundedDocument,
1908    ) -> Result<Vec<anno_core::TrackId>> {
1909        let (signals, tracks) = self.resolve_to_grounded(text)?;
1910        let mut track_ids = Vec::new();
1911
1912        // Add signals to document
1913        for signal in signals {
1914            doc.signals.push(signal);
1915        }
1916
1917        // Add tracks to document
1918        for track in tracks {
1919            track_ids.push(track.id);
1920            doc.tracks.insert(track.id, track);
1921        }
1922
1923        Ok(track_ids)
1924    }
1925}
1926
1927// =============================================================================
1928// CoreferenceResolver trait implementation
1929// =============================================================================
1930
1931use crate::Entity;
1932use anno_core::CoreferenceResolver;
1933
1934impl CoreferenceResolver for MentionRankingCoref {
1935    fn resolve(&self, entities: &[Entity]) -> Vec<Entity> {
1936        if entities.is_empty() {
1937            return vec![];
1938        }
1939
1940        // Convert Entity to RankedMention
1941        let mut mentions: Vec<RankedMention> = entities
1942            .iter()
1943            .map(|e| {
1944                let mention_type = if e.text.chars().all(|c| c.is_lowercase()) {
1945                    MentionType::Pronominal
1946                } else if e.text.chars().next().is_some_and(|c| c.is_uppercase()) {
1947                    MentionType::Proper
1948                } else {
1949                    MentionType::Nominal
1950                };
1951
1952                let gender = self.guess_gender(&e.text);
1953                // Infer number from pronoun or surface form
1954                // Note: they/them/their can be singular or plural (singular they)
1955                let lower = e.text.to_lowercase();
1956                let number = if ["we", "us"].iter().any(|p| lower == *p) {
1957                    Some(Number::Plural)
1958                } else if ["they", "them", "their", "you"].iter().any(|p| lower == *p) {
1959                    Some(Number::Unknown) // Singular or plural
1960                } else {
1961                    Some(Number::Singular)
1962                };
1963
1964                RankedMention {
1965                    start: e.start,
1966                    end: e.end,
1967                    text: e.text.clone(),
1968                    mention_type,
1969                    gender,
1970                    number,
1971                    head: self.get_head(&e.text),
1972                }
1973            })
1974            .collect();
1975
1976        // Sort by position
1977        mentions.sort_by_key(|m| (m.start, m.end));
1978
1979        // Extract features
1980        for mention in &mut mentions {
1981            self.extract_features(mention);
1982        }
1983
1984        // Link mentions into clusters
1985        // Note: CoreferenceResolver trait doesn't provide source text,
1986        // so context-aware features (be-phrase, filtering) are disabled
1987        let clusters = self.link_mentions(&mentions, "");
1988
1989        // Build canonical ID mapping: mention_key -> cluster_id
1990        let mut canonical_map: HashMap<(usize, usize), usize> = HashMap::new();
1991        for cluster in &clusters {
1992            for mention in &cluster.mentions {
1993                canonical_map.insert((mention.start, mention.end), cluster.id);
1994            }
1995        }
1996
1997        // Assign unique IDs to singletons (entities not in any cluster)
1998        let max_cluster_id = clusters.iter().map(|c| c.id).max().unwrap_or(0);
1999        let mut next_singleton_id = max_cluster_id + 1;
2000
2001        // Apply canonical IDs to entities
2002        entities
2003            .iter()
2004            .map(|e| {
2005                let mut entity = e.clone();
2006                if let Some(&cluster_id) = canonical_map.get(&(e.start, e.end)) {
2007                    entity.canonical_id = Some(anno_core::CanonicalId::new(cluster_id as u64));
2008                } else {
2009                    // Assign unique ID to singleton
2010                    entity.canonical_id =
2011                        Some(anno_core::CanonicalId::new(next_singleton_id as u64));
2012                    next_singleton_id += 1;
2013                }
2014                entity
2015            })
2016            .collect()
2017    }
2018
2019    fn name(&self) -> &'static str {
2020        "MentionRankingCoref"
2021    }
2022}
2023
2024#[cfg(test)]
2025mod tests {
2026    use super::*;
2027
2028    #[test]
2029    fn test_basic_resolution() {
2030        let coref = MentionRankingCoref::new();
2031        let clusters = coref.resolve("John saw Mary. He waved to her.").unwrap();
2032
2033        // Check structure is valid
2034        for cluster in &clusters {
2035            assert!(!cluster.mentions.is_empty());
2036            for mention in &cluster.mentions {
2037                assert!(mention.start <= mention.end);
2038            }
2039        }
2040    }
2041
2042    #[test]
2043    fn test_empty_input() {
2044        let coref = MentionRankingCoref::new();
2045        let clusters = coref.resolve("").unwrap();
2046        assert!(clusters.is_empty());
2047    }
2048
2049    #[test]
2050    fn test_pronoun_detection() {
2051        let coref = MentionRankingCoref::new();
2052        let mentions = coref.detect_mentions("He saw her.").unwrap();
2053
2054        let pronouns: Vec<_> = mentions
2055            .iter()
2056            .filter(|m| m.mention_type == MentionType::Pronominal)
2057            .collect();
2058
2059        assert!(
2060            pronouns.len() >= 2,
2061            "Should detect 'He' and 'her' as pronouns"
2062        );
2063    }
2064
2065    #[test]
2066    fn test_gender_inference() {
2067        let coref = MentionRankingCoref::new();
2068
2069        assert_eq!(coref.guess_gender("John"), Some(Gender::Masculine));
2070        assert_eq!(coref.guess_gender("Mary Smith"), Some(Gender::Feminine));
2071        assert_eq!(coref.guess_gender("Google"), None);
2072    }
2073
2074    #[test]
2075    fn test_pair_scoring() {
2076        let coref = MentionRankingCoref::new();
2077
2078        let m1 = RankedMention {
2079            start: 0,
2080            end: 4,
2081            text: "John".to_string(),
2082            mention_type: MentionType::Proper,
2083            gender: Some(Gender::Masculine),
2084            number: Some(Number::Singular),
2085            head: "John".to_string(),
2086        };
2087
2088        let m2 = RankedMention {
2089            start: 10,
2090            end: 12,
2091            text: "He".to_string(),
2092            mention_type: MentionType::Pronominal,
2093            gender: Some(Gender::Masculine),
2094            number: Some(Number::Singular),
2095            head: "He".to_string(),
2096        };
2097
2098        let score = coref.score_pair(&m2, &m1, 6, None);
2099        assert!(score > 0.0, "Pronoun with matching gender should link");
2100    }
2101
2102    #[test]
2103    fn test_gender_mismatch_penalty() {
2104        let coref = MentionRankingCoref::new();
2105
2106        let m1 = RankedMention {
2107            start: 0,
2108            end: 4,
2109            text: "Mary".to_string(),
2110            mention_type: MentionType::Proper,
2111            gender: Some(Gender::Feminine),
2112            number: Some(Number::Singular),
2113            head: "Mary".to_string(),
2114        };
2115
2116        let m2 = RankedMention {
2117            start: 10,
2118            end: 12,
2119            text: "He".to_string(),
2120            mention_type: MentionType::Pronominal,
2121            gender: Some(Gender::Masculine),
2122            number: Some(Number::Singular),
2123            head: "He".to_string(),
2124        };
2125
2126        let score = coref.score_pair(&m2, &m1, 6, None);
2127        assert!(
2128            score < 0.5,
2129            "Gender mismatch should have low/negative score"
2130        );
2131    }
2132
2133    #[test]
2134    fn test_config() {
2135        let config = MentionRankingConfig {
2136            link_threshold: 0.5,
2137            ..Default::default()
2138        };
2139
2140        let coref = MentionRankingCoref::with_config(config);
2141        assert_eq!(coref.config.link_threshold, 0.5);
2142    }
2143
2144    #[test]
2145    fn test_unicode_offsets() {
2146        let coref = MentionRankingCoref::new();
2147        let text = "北京很美. He likes it.";
2148        let char_count = text.chars().count();
2149
2150        let clusters = coref.resolve(text).unwrap();
2151
2152        for cluster in &clusters {
2153            for mention in &cluster.mentions {
2154                assert!(mention.start <= mention.end);
2155                assert!(mention.end <= char_count);
2156            }
2157        }
2158    }
2159
2160    // =========================================================================
2161    // Tests for type-specific antecedent limits (Bourgois & Poibeau 2025)
2162    // =========================================================================
2163
2164    #[test]
2165    fn test_type_specific_antecedent_limits() {
2166        let config = MentionRankingConfig::default();
2167
2168        // Default limits from paper
2169        assert_eq!(config.pronoun_max_antecedents, 30);
2170        assert_eq!(config.proper_max_antecedents, 300);
2171        assert_eq!(config.nominal_max_antecedents, 300);
2172
2173        // Type-specific getter
2174        assert_eq!(config.max_antecedents_for_type(MentionType::Pronominal), 30);
2175        assert_eq!(config.max_antecedents_for_type(MentionType::Proper), 300);
2176        assert_eq!(config.max_antecedents_for_type(MentionType::Nominal), 300);
2177        assert_eq!(config.max_antecedents_for_type(MentionType::Zero), 300);
2178        assert_eq!(config.max_antecedents_for_type(MentionType::Unknown), 300);
2179    }
2180
2181    #[test]
2182    fn test_book_scale_config() {
2183        let config = MentionRankingConfig::book_scale();
2184
2185        // Book-scale optimizations enabled
2186        assert!(config.enable_global_proper_coref);
2187        assert_eq!(config.clustering_strategy, ClusteringStrategy::EasyFirst);
2188        assert!(config.use_non_coref_constraints);
2189
2190        // Larger distance for book-scale
2191        assert!(config.max_distance > 100);
2192    }
2193
2194    #[test]
2195    fn test_pronoun_antecedent_limit_enforced() {
2196        // Create config with very small pronoun limit
2197        let config = MentionRankingConfig {
2198            pronoun_max_antecedents: 2,
2199            ..Default::default()
2200        };
2201        let coref = MentionRankingCoref::with_config(config);
2202
2203        // With a pronoun limit of 2, it should only consider 2 antecedents
2204        // This is a structural test - the limit is enforced in link_mentions
2205        assert_eq!(coref.config.pronoun_max_antecedents, 2);
2206    }
2207
2208    // =========================================================================
2209    // Tests for clustering strategies
2210    // =========================================================================
2211
2212    #[test]
2213    fn test_clustering_strategy_default() {
2214        let config = MentionRankingConfig::default();
2215        assert_eq!(config.clustering_strategy, ClusteringStrategy::LeftToRight);
2216    }
2217
2218    #[test]
2219    fn test_easy_first_clustering() {
2220        let config = MentionRankingConfig {
2221            clustering_strategy: ClusteringStrategy::EasyFirst,
2222            ..Default::default()
2223        };
2224        let coref = MentionRankingCoref::with_config(config);
2225
2226        // Should produce valid clusters
2227        let clusters = coref.resolve("John went home. He was tired.").unwrap();
2228        for cluster in &clusters {
2229            assert!(!cluster.mentions.is_empty());
2230        }
2231    }
2232
2233    #[test]
2234    fn test_left_to_right_vs_easy_first_produces_clusters() {
2235        let text = "John met Mary. He greeted her warmly. She smiled at him.";
2236
2237        // Left-to-right clustering
2238        let l2r_config = MentionRankingConfig {
2239            clustering_strategy: ClusteringStrategy::LeftToRight,
2240            ..Default::default()
2241        };
2242        let l2r_coref = MentionRankingCoref::with_config(l2r_config);
2243        let l2r_clusters = l2r_coref.resolve(text).unwrap();
2244
2245        // Easy-first clustering
2246        let ef_config = MentionRankingConfig {
2247            clustering_strategy: ClusteringStrategy::EasyFirst,
2248            ..Default::default()
2249        };
2250        let ef_coref = MentionRankingCoref::with_config(ef_config);
2251        let ef_clusters = ef_coref.resolve(text).unwrap();
2252
2253        // Both should produce some clusters
2254        assert!(
2255            !l2r_clusters.is_empty() || !ef_clusters.is_empty(),
2256            "At least one strategy should produce clusters"
2257        );
2258    }
2259
2260    // =========================================================================
2261    // Tests for global proper noun coreference
2262    // =========================================================================
2263
2264    #[test]
2265    fn test_global_proper_coref_config() {
2266        let config = MentionRankingConfig {
2267            enable_global_proper_coref: true,
2268            global_proper_threshold: 0.8,
2269            ..Default::default()
2270        };
2271
2272        assert!(config.enable_global_proper_coref);
2273        assert!((config.global_proper_threshold - 0.8).abs() < 0.001);
2274    }
2275
2276    #[test]
2277    fn test_global_proper_coref_same_name() {
2278        // Test that repeated proper nouns get clustered globally
2279        let config = MentionRankingConfig {
2280            enable_global_proper_coref: true,
2281            ..Default::default()
2282        };
2283        let coref = MentionRankingCoref::with_config(config);
2284
2285        // Use a text with pronouns to ensure we get clusters
2286        // "John" -> "he" should link, then global proper coref can propagate
2287        let text = "John arrived. He was happy. Later John left.";
2288        let clusters = coref.resolve(text).unwrap();
2289
2290        // The global proper coref feature is mainly for linking distant proper nouns
2291        // Here we just verify it doesn't break normal clustering
2292        // Check valid structure is produced
2293        for cluster in &clusters {
2294            for mention in &cluster.mentions {
2295                assert!(mention.start <= mention.end);
2296            }
2297        }
2298    }
2299
2300    // =========================================================================
2301    // Tests for non-coreference constraints
2302    // =========================================================================
2303
2304    #[test]
2305    fn test_non_coref_constraints_config() {
2306        let config = MentionRankingConfig {
2307            use_non_coref_constraints: true,
2308            non_coref_threshold: 0.1,
2309            ..Default::default()
2310        };
2311
2312        assert!(config.use_non_coref_constraints);
2313        assert!((config.non_coref_threshold - 0.1).abs() < 0.001);
2314    }
2315
2316    #[test]
2317    fn test_easy_first_with_non_coref_constraints() {
2318        let config = MentionRankingConfig {
2319            clustering_strategy: ClusteringStrategy::EasyFirst,
2320            use_non_coref_constraints: true,
2321            ..Default::default()
2322        };
2323        let coref = MentionRankingCoref::with_config(config);
2324
2325        // "John and Mary" - the "and" should prevent merging John and Mary
2326        let clusters = coref.resolve("John and Mary went to the store.").unwrap();
2327
2328        // Should produce valid structure regardless of specific clustering
2329        for cluster in &clusters {
2330            for mention in &cluster.mentions {
2331                assert!(mention.start <= mention.end);
2332            }
2333        }
2334    }
2335
2336    // =========================================================================
2337    // Integration tests
2338    // =========================================================================
2339
2340    #[test]
2341    fn test_full_book_scale_pipeline() {
2342        let config = MentionRankingConfig::book_scale();
2343        let coref = MentionRankingCoref::with_config(config);
2344
2345        // A longer text simulating literary content
2346        let text = "Elizabeth Bennett was a spirited young woman. She lived at Longbourn \
2347                    with her family. Her mother, Mrs. Bennett, was determined to see her \
2348                    daughters married well. Elizabeth often walked in the countryside. \
2349                    She enjoyed the solitude it offered.";
2350
2351        let clusters = coref.resolve(text).unwrap();
2352
2353        // Validate cluster structure
2354        for cluster in &clusters {
2355            assert!(!cluster.mentions.is_empty());
2356            for mention in &cluster.mentions {
2357                assert!(mention.start <= mention.end);
2358                assert!(mention.end <= text.chars().count());
2359            }
2360        }
2361    }
2362
2363    #[test]
2364    fn test_mention_type_distribution() {
2365        let coref = MentionRankingCoref::new();
2366        let text = "Dr. Smith saw John. He examined him carefully.";
2367        let mentions = coref.detect_mentions(text).unwrap();
2368
2369        let pronoun_count = mentions
2370            .iter()
2371            .filter(|m| m.mention_type == MentionType::Pronominal)
2372            .count();
2373        let proper_count = mentions
2374            .iter()
2375            .filter(|m| m.mention_type == MentionType::Proper)
2376            .count();
2377
2378        // Should detect both pronouns and proper nouns
2379        assert!(pronoun_count > 0, "Should detect pronouns");
2380        assert!(proper_count > 0, "Should detect proper nouns");
2381    }
2382
2383    // =========================================================================
2384    // Tests for salience integration
2385    // =========================================================================
2386
2387    #[test]
2388    fn test_salience_config_default() {
2389        let config = MentionRankingConfig::default();
2390        // Disabled by default for backward compatibility
2391        assert!((config.salience_weight - 0.0).abs() < 0.001);
2392    }
2393
2394    #[test]
2395    fn test_salience_config_builder() {
2396        let config = MentionRankingConfig::default().with_salience(0.25);
2397        assert!((config.salience_weight - 0.25).abs() < 0.001);
2398
2399        // Clamped to [0, 1]
2400        let clamped = MentionRankingConfig::default().with_salience(1.5);
2401        assert!((clamped.salience_weight - 1.0).abs() < 0.001);
2402    }
2403
2404    #[test]
2405    fn test_salience_book_scale_enabled() {
2406        let config = MentionRankingConfig::book_scale();
2407        assert!(
2408            config.salience_weight > 0.0,
2409            "Book-scale should enable salience"
2410        );
2411    }
2412
2413    #[test]
2414    fn test_with_salience_scores() {
2415        let mut scores = HashMap::new();
2416        scores.insert("john".to_string(), 0.8);
2417        scores.insert("Mary".to_string(), 0.6); // Mixed case
2418
2419        let coref = MentionRankingCoref::new().with_salience(scores);
2420
2421        // Lookup should be case-insensitive
2422        assert!((coref.get_salience("john") - 0.8).abs() < 0.001);
2423        assert!((coref.get_salience("John") - 0.8).abs() < 0.001);
2424        assert!((coref.get_salience("JOHN") - 0.8).abs() < 0.001);
2425        assert!((coref.get_salience("mary") - 0.6).abs() < 0.001);
2426
2427        // Unknown entity returns 0.0
2428        assert!((coref.get_salience("unknown") - 0.0).abs() < 0.001);
2429    }
2430
2431    #[test]
2432    fn test_salience_boosts_antecedent_score() {
2433        // Create config with salience enabled
2434        let config = MentionRankingConfig {
2435            salience_weight: 0.3,
2436            ..Default::default()
2437        };
2438
2439        // Scores: John is salient, Mary is not
2440        let mut scores = HashMap::new();
2441        scores.insert("john".to_string(), 1.0);
2442        scores.insert("mary".to_string(), 0.0);
2443
2444        let coref = MentionRankingCoref::with_config(config).with_salience(scores);
2445
2446        let mention = RankedMention {
2447            start: 20,
2448            end: 22,
2449            text: "He".to_string(),
2450            mention_type: MentionType::Pronominal,
2451            gender: Some(Gender::Masculine),
2452            number: Some(Number::Singular),
2453            head: "He".to_string(),
2454        };
2455
2456        let john = RankedMention {
2457            start: 0,
2458            end: 4,
2459            text: "John".to_string(),
2460            mention_type: MentionType::Proper,
2461            gender: Some(Gender::Masculine),
2462            number: Some(Number::Singular),
2463            head: "John".to_string(),
2464        };
2465
2466        let bob = RankedMention {
2467            start: 10,
2468            end: 13,
2469            text: "Bob".to_string(), // Not in salience scores
2470            mention_type: MentionType::Proper,
2471            gender: Some(Gender::Masculine),
2472            number: Some(Number::Singular),
2473            head: "Bob".to_string(),
2474        };
2475
2476        let score_john = coref.score_pair(&mention, &john, 16, None);
2477        let score_bob = coref.score_pair(&mention, &bob, 7, None);
2478
2479        // John should get a salience boost of 0.3 * 1.0 = 0.3
2480        // Both have same gender agreement, but John is salient
2481        // Despite Bob being closer (distance 7 vs 16), John's salience should help
2482        assert!(
2483            score_john > score_bob - 0.1, // Allow some margin for distance penalty
2484            "Salient antecedent should score higher: john={}, bob={}",
2485            score_john,
2486            score_bob
2487        );
2488    }
2489
2490    #[test]
2491    fn test_salience_no_effect_when_disabled() {
2492        let config = MentionRankingConfig {
2493            salience_weight: 0.0, // Disabled
2494            ..Default::default()
2495        };
2496
2497        let mut scores = HashMap::new();
2498        scores.insert("john".to_string(), 1.0);
2499
2500        let coref = MentionRankingCoref::with_config(config.clone()).with_salience(scores);
2501
2502        let mention = RankedMention {
2503            start: 10,
2504            end: 12,
2505            text: "He".to_string(),
2506            mention_type: MentionType::Pronominal,
2507            gender: Some(Gender::Masculine),
2508            number: Some(Number::Singular),
2509            head: "He".to_string(),
2510        };
2511
2512        let antecedent = RankedMention {
2513            start: 0,
2514            end: 4,
2515            text: "John".to_string(),
2516            mention_type: MentionType::Proper,
2517            gender: Some(Gender::Masculine),
2518            number: Some(Number::Singular),
2519            head: "John".to_string(),
2520        };
2521
2522        // Without salience scores
2523        let coref_no_salience = MentionRankingCoref::with_config(config);
2524        let score_without = coref_no_salience.score_pair(&mention, &antecedent, 6, None);
2525
2526        // With salience scores but weight=0
2527        let score_with = coref.score_pair(&mention, &antecedent, 6, None);
2528
2529        // Scores should be equal when weight is 0
2530        assert!(
2531            (score_without - score_with).abs() < 0.001,
2532            "Salience should have no effect when weight=0"
2533        );
2534    }
2535
2536    #[test]
2537    fn test_salience_resolution_integration() {
2538        // Full resolution with salience
2539        let config = MentionRankingConfig {
2540            salience_weight: 0.2,
2541            ..Default::default()
2542        };
2543
2544        let mut scores = HashMap::new();
2545        scores.insert("president".to_string(), 0.9);
2546        scores.insert("john".to_string(), 0.7);
2547        scores.insert("meeting".to_string(), 0.3);
2548
2549        let coref = MentionRankingCoref::with_config(config).with_salience(scores);
2550
2551        let text = "John met the President. He was nervous.";
2552        let clusters = coref.resolve(text).unwrap();
2553
2554        // Should produce valid clusters
2555        for cluster in &clusters {
2556            assert!(!cluster.mentions.is_empty());
2557            for mention in &cluster.mentions {
2558                assert!(mention.start <= mention.end);
2559                assert!(mention.end <= text.chars().count());
2560            }
2561        }
2562    }
2563
2564    #[test]
2565    fn test_salience_with_multilingual_text() {
2566        let config = MentionRankingConfig {
2567            salience_weight: 0.2,
2568            ..Default::default()
2569        };
2570
2571        let mut scores = HashMap::new();
2572        scores.insert("北京".to_string(), 0.8);
2573        scores.insert("習近平".to_string(), 0.9);
2574
2575        let coref = MentionRankingCoref::with_config(config).with_salience(scores);
2576
2577        // Case-insensitive lookup (though CJK doesn't have case)
2578        assert!((coref.get_salience("北京") - 0.8).abs() < 0.001);
2579        assert!((coref.get_salience("習近平") - 0.9).abs() < 0.001);
2580    }
2581
2582    // =========================================================================
2583    // Tests for GroundedDocument integration (Signal → Track → Identity)
2584    // =========================================================================
2585
2586    #[test]
2587    fn test_mention_cluster_to_signals() {
2588        let cluster = MentionCluster {
2589            id: 0,
2590            mentions: vec![
2591                RankedMention {
2592                    start: 0,
2593                    end: 4,
2594                    text: "John".to_string(),
2595                    mention_type: MentionType::Proper,
2596                    gender: Some(Gender::Masculine),
2597                    number: Some(Number::Singular),
2598                    head: "John".to_string(),
2599                },
2600                RankedMention {
2601                    start: 15,
2602                    end: 17,
2603                    text: "He".to_string(),
2604                    mention_type: MentionType::Pronominal,
2605                    gender: Some(Gender::Masculine),
2606                    number: Some(Number::Singular),
2607                    head: "He".to_string(),
2608                },
2609            ],
2610        };
2611
2612        let signals = cluster.to_signals(anno_core::SignalId::new(100));
2613
2614        assert_eq!(signals.len(), 2);
2615        assert_eq!(signals[0].id, anno_core::SignalId::new(100));
2616        assert_eq!(signals[1].id, anno_core::SignalId::new(101));
2617        assert_eq!(signals[0].surface, "John");
2618        assert_eq!(signals[1].surface, "He");
2619
2620        // Check location is correct
2621        if let anno_core::Location::Text { start, end } = &signals[0].location {
2622            assert_eq!(*start, 0);
2623            assert_eq!(*end, 4);
2624        } else {
2625            panic!("Expected Text location");
2626        }
2627    }
2628
2629    #[test]
2630    fn test_mention_cluster_to_track() {
2631        let cluster = MentionCluster {
2632            id: 42,
2633            mentions: vec![
2634                RankedMention {
2635                    start: 0,
2636                    end: 4,
2637                    text: "John".to_string(),
2638                    mention_type: MentionType::Proper,
2639                    gender: Some(Gender::Masculine),
2640                    number: Some(Number::Singular),
2641                    head: "John".to_string(),
2642                },
2643                RankedMention {
2644                    start: 15,
2645                    end: 17,
2646                    text: "He".to_string(),
2647                    mention_type: MentionType::Pronominal,
2648                    gender: Some(Gender::Masculine),
2649                    number: Some(Number::Singular),
2650                    head: "He".to_string(),
2651                },
2652            ],
2653        };
2654
2655        let (track, signals) = cluster.to_track(anno_core::SignalId::new(0));
2656
2657        // Track should have correct structure
2658        assert_eq!(track.id, anno_core::TrackId::new(42));
2659        assert_eq!(track.canonical_surface, "John"); // Proper noun preferred
2660        assert_eq!(track.signals.len(), 2);
2661
2662        // Signals should be correct
2663        assert_eq!(signals.len(), 2);
2664        assert_eq!(signals[0].surface, "John");
2665        assert_eq!(signals[1].surface, "He");
2666    }
2667
2668    #[test]
2669    fn test_canonical_mention_prefers_proper() {
2670        // Cluster with pronoun first, proper noun second
2671        let cluster = MentionCluster {
2672            id: 0,
2673            mentions: vec![
2674                RankedMention {
2675                    start: 0,
2676                    end: 2,
2677                    text: "He".to_string(),
2678                    mention_type: MentionType::Pronominal,
2679                    gender: Some(Gender::Masculine),
2680                    number: Some(Number::Singular),
2681                    head: "He".to_string(),
2682                },
2683                RankedMention {
2684                    start: 10,
2685                    end: 14,
2686                    text: "John".to_string(),
2687                    mention_type: MentionType::Proper,
2688                    gender: Some(Gender::Masculine),
2689                    number: Some(Number::Singular),
2690                    head: "John".to_string(),
2691                },
2692            ],
2693        };
2694
2695        // Should prefer proper noun even though it's second
2696        let canonical = cluster.canonical_mention().unwrap();
2697        assert_eq!(canonical.text, "John");
2698    }
2699
2700    #[test]
2701    fn test_resolve_to_grounded() {
2702        let coref = MentionRankingCoref::new();
2703        let (signals, tracks) = coref
2704            .resolve_to_grounded("John saw Mary. He waved.")
2705            .unwrap();
2706
2707        // Should have signals
2708        assert!(!signals.is_empty());
2709
2710        // All signals should have valid locations
2711        for signal in &signals {
2712            if let anno_core::Location::Text { start, end } = &signal.location {
2713                assert!(start <= end);
2714            } else {
2715                panic!("Expected Text location");
2716            }
2717        }
2718
2719        // Tracks should reference signals correctly
2720        for track in &tracks {
2721            assert!(!track.signals.is_empty());
2722            assert!(!track.canonical_surface.is_empty());
2723        }
2724    }
2725
2726    #[test]
2727    fn test_resolve_into_document() {
2728        let coref = MentionRankingCoref::new();
2729        let text = "John saw Mary. He waved to her.";
2730        let mut doc = anno_core::GroundedDocument::new("test_doc", text);
2731
2732        let track_ids = coref.resolve_into_document(text, &mut doc).unwrap();
2733
2734        // Document should have signals and tracks
2735        assert!(!doc.signals.is_empty());
2736        assert!(!doc.tracks.is_empty());
2737
2738        // Returned track IDs should match document
2739        for track_id in &track_ids {
2740            assert!(doc.tracks.contains_key(track_id));
2741        }
2742    }
2743
2744    #[test]
2745    fn test_ranked_mention_to_signal() {
2746        let mention = RankedMention {
2747            start: 10,
2748            end: 20,
2749            text: "the company".to_string(),
2750            mention_type: MentionType::Nominal,
2751            gender: None,
2752            number: Some(Number::Singular),
2753            head: "company".to_string(),
2754        };
2755
2756        let signal = mention.to_signal(anno_core::SignalId::new(999));
2757
2758        assert_eq!(signal.id, anno_core::SignalId::new(999));
2759        assert_eq!(signal.surface, "the company");
2760        assert_eq!(signal.label, "nominal".into());
2761        assert_eq!(signal.modality, anno_core::Modality::Symbolic);
2762
2763        if let anno_core::Location::Text { start, end } = signal.location {
2764            assert_eq!(start, 10);
2765            assert_eq!(end, 20);
2766        } else {
2767            panic!("Expected Text location");
2768        }
2769    }
2770
2771    #[test]
2772    fn test_grounded_integration_unicode() {
2773        let coref = MentionRankingCoref::new();
2774        let text = "習近平在北京。他很忙。"; // "Xi Jinping is in Beijing. He is busy."
2775
2776        let (signals, _tracks) = coref.resolve_to_grounded(text).unwrap();
2777        let char_count = text.chars().count();
2778
2779        // All signal locations should be within text bounds (character offsets)
2780        for signal in &signals {
2781            if let anno_core::Location::Text { start, end } = &signal.location {
2782                assert!(*start <= *end);
2783                assert!(
2784                    *end <= char_count,
2785                    "Signal end {} exceeds char count {}",
2786                    end,
2787                    char_count
2788                );
2789            }
2790        }
2791    }
2792
2793    // =========================================================================
2794    // Tests for i2b2-inspired features (Chen et al. 2011)
2795    // =========================================================================
2796
2797    #[test]
2798    fn test_be_phrase_detection() {
2799        let config = MentionRankingConfig::clinical();
2800        let coref = MentionRankingCoref::with_config(config);
2801
2802        let text = "The patient is John Smith. He was seen by Dr. Jones.";
2803
2804        // "patient" (0-11) is "John Smith" (15-25) via "is"
2805        let m1 = RankedMention {
2806            start: 4,
2807            end: 11,
2808            text: "patient".to_string(),
2809            mention_type: MentionType::Nominal,
2810            gender: None,
2811            number: Some(Number::Singular),
2812            head: "patient".to_string(),
2813        };
2814
2815        let m2 = RankedMention {
2816            start: 15,
2817            end: 25,
2818            text: "John Smith".to_string(),
2819            mention_type: MentionType::Proper,
2820            gender: Some(Gender::Masculine),
2821            number: Some(Number::Singular),
2822            head: "Smith".to_string(),
2823        };
2824
2825        // Should detect be-phrase link
2826        assert!(
2827            coref.is_be_phrase_link(text, &m1, &m2),
2828            "Should detect 'is' between patient and John Smith"
2829        );
2830
2831        // Score should be higher due to be-phrase
2832        let score = coref.score_pair(&m1, &m2, 4, Some(text));
2833        assert!(score > 0.5, "Be-phrase should boost score: got {}", score);
2834    }
2835
2836    #[test]
2837    fn test_be_phrase_detection_negative() {
2838        let coref = MentionRankingCoref::new();
2839
2840        let text = "John saw Mary at the store.";
2841
2842        let m1 = RankedMention {
2843            start: 0,
2844            end: 4,
2845            text: "John".to_string(),
2846            mention_type: MentionType::Proper,
2847            gender: Some(Gender::Masculine),
2848            number: Some(Number::Singular),
2849            head: "John".to_string(),
2850        };
2851
2852        let m2 = RankedMention {
2853            start: 9,
2854            end: 13,
2855            text: "Mary".to_string(),
2856            mention_type: MentionType::Proper,
2857            gender: Some(Gender::Feminine),
2858            number: Some(Number::Singular),
2859            head: "Mary".to_string(),
2860        };
2861
2862        // "saw" is not a be-phrase
2863        assert!(
2864            !coref.is_be_phrase_link(text, &m1, &m2),
2865            "Should not detect be-phrase between John and Mary"
2866        );
2867    }
2868
2869    #[test]
2870    fn test_acronym_matching() {
2871        let coref = MentionRankingCoref::new();
2872
2873        let mrsa = RankedMention {
2874            start: 0,
2875            end: 4,
2876            text: "MRSA".to_string(),
2877            mention_type: MentionType::Proper,
2878            gender: None,
2879            number: Some(Number::Singular),
2880            head: "MRSA".to_string(),
2881        };
2882
2883        let full = RankedMention {
2884            start: 20,
2885            end: 65,
2886            text: "Methicillin-resistant Staphylococcus aureus".to_string(),
2887            mention_type: MentionType::Proper,
2888            gender: None,
2889            number: Some(Number::Singular),
2890            head: "aureus".to_string(),
2891        };
2892
2893        assert!(
2894            coref.is_acronym_match(&mrsa, &full),
2895            "MRSA should match Methicillin-resistant Staphylococcus aureus"
2896        );
2897    }
2898
2899    #[test]
2900    fn test_acronym_matching_who() {
2901        let coref = MentionRankingCoref::new();
2902
2903        let who = RankedMention {
2904            start: 0,
2905            end: 3,
2906            text: "WHO".to_string(),
2907            mention_type: MentionType::Proper,
2908            gender: None,
2909            number: Some(Number::Singular),
2910            head: "WHO".to_string(),
2911        };
2912
2913        let full = RankedMention {
2914            start: 10,
2915            end: 35,
2916            text: "World Health Organization".to_string(),
2917            mention_type: MentionType::Proper,
2918            gender: None,
2919            number: Some(Number::Singular),
2920            head: "Organization".to_string(),
2921        };
2922
2923        assert!(
2924            coref.is_acronym_match(&who, &full),
2925            "WHO should match World Health Organization"
2926        );
2927    }
2928
2929    #[test]
2930    fn test_acronym_matching_negative() {
2931        let coref = MentionRankingCoref::new();
2932
2933        let ibm = RankedMention {
2934            start: 0,
2935            end: 3,
2936            text: "IBM".to_string(),
2937            mention_type: MentionType::Proper,
2938            gender: None,
2939            number: Some(Number::Singular),
2940            head: "IBM".to_string(),
2941        };
2942
2943        let apple = RankedMention {
2944            start: 10,
2945            end: 25,
2946            text: "Apple Inc".to_string(),
2947            mention_type: MentionType::Proper,
2948            gender: None,
2949            number: Some(Number::Singular),
2950            head: "Apple".to_string(),
2951        };
2952
2953        assert!(
2954            !coref.is_acronym_match(&ibm, &apple),
2955            "IBM should not match Apple Inc"
2956        );
2957    }
2958
2959    #[test]
2960    fn test_context_filtering_different_dates() {
2961        let config = MentionRankingConfig::clinical();
2962        let coref = MentionRankingCoref::with_config(config);
2963
2964        // Two mentions with different dates in their context
2965        let text = "On 2024-01-15 the patient presented. On 2024-02-20 the patient returned.";
2966
2967        let m1 = RankedMention {
2968            start: 17,
2969            end: 24,
2970            text: "patient".to_string(),
2971            mention_type: MentionType::Nominal,
2972            gender: None,
2973            number: Some(Number::Singular),
2974            head: "patient".to_string(),
2975        };
2976
2977        let m2 = RankedMention {
2978            start: 50,
2979            end: 57,
2980            text: "patient".to_string(),
2981            mention_type: MentionType::Nominal,
2982            gender: None,
2983            number: Some(Number::Singular),
2984            head: "patient".to_string(),
2985        };
2986
2987        // Should filter due to different dates (different visits = potentially different patients)
2988        assert!(
2989            coref.should_filter_by_context(text, &m1, &m2),
2990            "Should filter link between patients with different dates"
2991        );
2992    }
2993
2994    #[test]
2995    fn test_context_filtering_negation() {
2996        let config = MentionRankingConfig::clinical();
2997        let coref = MentionRankingCoref::with_config(config);
2998
2999        // Use longer text to ensure contexts don't overlap
3000        // The context window is 20 chars before the mention start
3001        let text = "Patient is not a diabetic. This is important. The diabetic protocol was used.";
3002        //          0         1         2         3         4         5         6         7
3003        //          0123456789012345678901234567890123456789012345678901234567890123456789012345
3004
3005        // First "diabetic" at position 17-25 (after "not a")
3006        let m1 = RankedMention {
3007            start: 17,
3008            end: 25,
3009            text: "diabetic".to_string(),
3010            mention_type: MentionType::Nominal,
3011            gender: None,
3012            number: Some(Number::Singular),
3013            head: "diabetic".to_string(),
3014        };
3015
3016        // Second "diabetic" at position 50-58 (far enough that context won't include "not")
3017        let m2 = RankedMention {
3018            start: 50,
3019            end: 58,
3020            text: "diabetic".to_string(),
3021            mention_type: MentionType::Nominal,
3022            gender: None,
3023            number: Some(Number::Singular),
3024            head: "diabetic".to_string(),
3025        };
3026
3027        // Verify context windows include the right context
3028        let text_chars: Vec<char> = text.chars().collect();
3029        let m1_context: String = text_chars
3030            [m1.start.saturating_sub(20)..m1.end.min(text_chars.len())]
3031            .iter()
3032            .collect();
3033        let m2_context: String = text_chars
3034            [m2.start.saturating_sub(20)..m2.end.min(text_chars.len())]
3035            .iter()
3036            .collect();
3037        eprintln!("m1 context: '{}'", m1_context);
3038        eprintln!("m2 context: '{}'", m2_context);
3039
3040        // m1 should have "not" in context, m2 should not
3041        assert!(
3042            m1_context.contains("not"),
3043            "m1 context should contain 'not'"
3044        );
3045        assert!(
3046            !m2_context.contains("not"),
3047            "m2 context should not contain 'not'"
3048        );
3049
3050        // Should filter due to negation mismatch
3051        assert!(
3052            coref.should_filter_by_context(text, &m1, &m2),
3053            "Should filter link between negated ('{}') and non-negated ('{}') mentions",
3054            m1_context,
3055            m2_context
3056        );
3057    }
3058
3059    #[test]
3060    fn test_synonym_matching_high_similarity() {
3061        // Synonym matching now uses string similarity (>0.8) rather than
3062        // a hardcoded table. This tests that high-similarity strings match.
3063        let coref = MentionRankingCoref::new();
3064
3065        let obama = RankedMention {
3066            start: 0,
3067            end: 5,
3068            text: "Obama".to_string(),
3069            mention_type: MentionType::Proper,
3070            gender: None,
3071            number: Some(Number::Singular),
3072            head: "Obama".to_string(),
3073        };
3074
3075        let obama_lower = RankedMention {
3076            start: 10,
3077            end: 15,
3078            text: "obama".to_string(),
3079            mention_type: MentionType::Proper,
3080            gender: None,
3081            number: Some(Number::Singular),
3082            head: "obama".to_string(),
3083        };
3084
3085        // Case-insensitive match should work
3086        assert!(
3087            coref.are_synonyms(&obama, &obama_lower),
3088            "Obama and obama should match (case-insensitive)"
3089        );
3090    }
3091
3092    #[test]
3093    fn test_synonym_matching_low_similarity_no_match() {
3094        // Domain-specific synonyms like heart/cardiac require external
3095        // SynonymSource implementations. The default uses string similarity,
3096        // which won't match semantically related but lexically different terms.
3097        let coref = MentionRankingCoref::new();
3098
3099        let heart = RankedMention {
3100            start: 0,
3101            end: 5,
3102            text: "heart".to_string(),
3103            mention_type: MentionType::Nominal,
3104            gender: None,
3105            number: Some(Number::Singular),
3106            head: "heart".to_string(),
3107        };
3108
3109        let cardiac = RankedMention {
3110            start: 10,
3111            end: 17,
3112            text: "cardiac".to_string(),
3113            mention_type: MentionType::Nominal,
3114            gender: None,
3115            number: Some(Number::Singular),
3116            head: "cardiac".to_string(),
3117        };
3118
3119        // Without a domain-specific SynonymSource, these won't match
3120        // because "heart" and "cardiac" have low string similarity.
3121        // This is the expected behavior - use anno::coalesce::SynonymSource
3122        // for domain-specific synonym matching.
3123        assert!(
3124            !coref.are_synonyms(&heart, &cardiac),
3125            "heart/cardiac require domain-specific SynonymSource"
3126        );
3127    }
3128
3129    #[test]
3130    fn test_clinical_config() {
3131        let config = MentionRankingConfig::clinical();
3132
3133        // Verify i2b2-inspired features are enabled
3134        assert!(config.enable_be_phrase_detection);
3135        assert!(config.enable_acronym_matching);
3136        assert!(config.enable_context_filtering);
3137        assert!(config.enable_synonym_matching);
3138
3139        // Verify reasonable weights
3140        assert!(config.be_phrase_weight > 0.5);
3141        assert!(config.acronym_weight > 0.5);
3142        assert!(config.synonym_weight > 0.3);
3143    }
3144
3145    #[test]
3146    fn test_clinical_resolution_integration() {
3147        let config = MentionRankingConfig::clinical();
3148        let coref = MentionRankingCoref::with_config(config);
3149
3150        // Clinical text with various coreference patterns
3151        let text = "The patient is John Smith. Pt was admitted with MRSA. \
3152                    Methicillin-resistant Staphylococcus aureus was treated.";
3153
3154        let clusters = coref.resolve(text).unwrap();
3155
3156        // Should create meaningful clusters
3157        assert!(
3158            !clusters.is_empty(),
3159            "Should find clusters in clinical text"
3160        );
3161
3162        // Print clusters for debugging
3163        for cluster in &clusters {
3164            let texts: Vec<_> = cluster.mentions.iter().map(|m| &m.text).collect();
3165            eprintln!("Cluster {}: {:?}", cluster.id, texts);
3166        }
3167    }
3168
3169    #[test]
3170    fn test_i2b2_scoring_with_all_features() {
3171        let config = MentionRankingConfig::clinical();
3172        let coref = MentionRankingCoref::with_config(config);
3173
3174        // Text with be-phrase pattern
3175        let text = "Resolution of organism is MRSA.";
3176
3177        let m1 = RankedMention {
3178            start: 14,
3179            end: 22,
3180            text: "organism".to_string(),
3181            mention_type: MentionType::Nominal,
3182            gender: None,
3183            number: Some(Number::Singular),
3184            head: "organism".to_string(),
3185        };
3186
3187        let m2 = RankedMention {
3188            start: 26,
3189            end: 30,
3190            text: "MRSA".to_string(),
3191            mention_type: MentionType::Proper,
3192            gender: None,
3193            number: Some(Number::Singular),
3194            head: "MRSA".to_string(),
3195        };
3196
3197        // Score should be high due to be-phrase
3198        let score = coref.score_pair(&m1, &m2, 4, Some(text));
3199        assert!(
3200            score > 0.7,
3201            "Be-phrase pattern should yield high score, got {}",
3202            score
3203        );
3204    }
3205
3206    // =========================================================================
3207    // Nominal adjective detection tests (J2N: arXiv:2409.14374)
3208    // =========================================================================
3209
3210    #[test]
3211    fn test_nominal_adjective_detection_basic() {
3212        let config = MentionRankingConfig {
3213            enable_nominal_adjective_detection: true,
3214            ..Default::default()
3215        };
3216        let coref = MentionRankingCoref::with_config(config);
3217
3218        let text = "The poor are struggling while the rich get richer.";
3219        let mentions = coref.detect_mentions(text).unwrap();
3220
3221        let texts: Vec<_> = mentions.iter().map(|m| m.text.as_str()).collect();
3222        assert!(
3223            texts.contains(&"The poor"),
3224            "Should detect 'The poor': {:?}",
3225            texts
3226        );
3227        assert!(
3228            texts.contains(&"the rich"),
3229            "Should detect 'the rich': {:?}",
3230            texts
3231        );
3232
3233        // Check grammatical number is plural
3234        let poor_mention = mentions
3235            .iter()
3236            .find(|m| m.text.to_lowercase() == "the poor");
3237        assert!(poor_mention.is_some());
3238        assert_eq!(poor_mention.unwrap().number, Some(Number::Plural));
3239        assert_eq!(poor_mention.unwrap().mention_type, MentionType::Nominal);
3240    }
3241
3242    #[test]
3243    fn test_nominal_adjective_not_before_noun() {
3244        // "the poor performance" should NOT detect "the poor" as a mention
3245        // because "poor" modifies "performance", not a nominalized group
3246        let config = MentionRankingConfig {
3247            enable_nominal_adjective_detection: true,
3248            ..Default::default()
3249        };
3250        let coref = MentionRankingCoref::with_config(config);
3251
3252        let text = "The poor performance was criticized.";
3253        let mentions = coref.detect_mentions(text).unwrap();
3254
3255        let texts: Vec<_> = mentions.iter().map(|m| m.text.as_str()).collect();
3256        assert!(
3257            !texts.contains(&"The poor"),
3258            "Should NOT detect 'The poor' when followed by noun: {:?}",
3259            texts
3260        );
3261    }
3262
3263    #[test]
3264    fn test_nominal_adjective_at_sentence_end() {
3265        let config = MentionRankingConfig {
3266            enable_nominal_adjective_detection: true,
3267            ..Default::default()
3268        };
3269        let coref = MentionRankingCoref::with_config(config);
3270
3271        let text = "We must help the elderly.";
3272        let mentions = coref.detect_mentions(text).unwrap();
3273
3274        let texts: Vec<_> = mentions.iter().map(|m| m.text.as_str()).collect();
3275        assert!(
3276            texts.contains(&"the elderly"),
3277            "Should detect 'the elderly' at end: {:?}",
3278            texts
3279        );
3280    }
3281
3282    #[test]
3283    fn test_nominal_adjective_with_punctuation() {
3284        let config = MentionRankingConfig {
3285            enable_nominal_adjective_detection: true,
3286            ..Default::default()
3287        };
3288        let coref = MentionRankingCoref::with_config(config);
3289
3290        let text = "The accused, the condemned, and the guilty were present.";
3291        let mentions = coref.detect_mentions(text).unwrap();
3292
3293        let texts: Vec<_> = mentions.iter().map(|m| m.text.as_str()).collect();
3294        assert!(
3295            texts.contains(&"The accused"),
3296            "Should detect 'The accused': {:?}",
3297            texts
3298        );
3299        assert!(
3300            texts.contains(&"the condemned"),
3301            "Should detect 'the condemned': {:?}",
3302            texts
3303        );
3304        assert!(
3305            texts.contains(&"the guilty"),
3306            "Should detect 'the guilty': {:?}",
3307            texts
3308        );
3309    }
3310
3311    #[test]
3312    fn test_nominal_adjective_these_those() {
3313        let config = MentionRankingConfig {
3314            enable_nominal_adjective_detection: true,
3315            ..Default::default()
3316        };
3317        let coref = MentionRankingCoref::with_config(config);
3318
3319        let text = "These homeless need shelter. Those unemployed seek work.";
3320        let mentions = coref.detect_mentions(text).unwrap();
3321
3322        let texts: Vec<_> = mentions.iter().map(|m| m.text.as_str()).collect();
3323        assert!(
3324            texts.contains(&"These homeless"),
3325            "Should detect 'These homeless': {:?}",
3326            texts
3327        );
3328        assert!(
3329            texts.contains(&"Those unemployed"),
3330            "Should detect 'Those unemployed': {:?}",
3331            texts
3332        );
3333    }
3334
3335    #[test]
3336    fn test_nominal_adjective_disabled_by_default() {
3337        let coref = MentionRankingCoref::new();
3338
3339        let text = "The poor are struggling.";
3340        let mentions = coref.detect_mentions(text).unwrap();
3341
3342        // With detection disabled, "the poor" should not be detected as a mention
3343        let has_the_poor = mentions.iter().any(|m| m.text.to_lowercase() == "the poor");
3344        assert!(
3345            !has_the_poor,
3346            "Nominal adjective detection should be disabled by default"
3347        );
3348    }
3349
3350    // =========================================================================
3351    // Singular "they" tests
3352    // =========================================================================
3353
3354    #[test]
3355    fn test_singular_they_number_unknown() {
3356        let coref = MentionRankingCoref::new();
3357
3358        // "they" should have Number::Unknown to support both singular and plural
3359        let text = "Alex said they would come. They brought their friends.";
3360        let mentions = coref.detect_mentions(text).unwrap();
3361
3362        // Find "they" mentions
3363        let they_mentions: Vec<_> = mentions
3364            .iter()
3365            .filter(|m| m.text.to_lowercase() == "they")
3366            .collect();
3367
3368        for they in &they_mentions {
3369            assert_eq!(
3370                they.number,
3371                Some(Number::Unknown),
3372                "'they' should have Number::Unknown for singular/plural ambiguity"
3373            );
3374        }
3375    }
3376
3377    #[test]
3378    fn test_their_number_unknown() {
3379        let coref = MentionRankingCoref::new();
3380
3381        let text = "Someone left their umbrella.";
3382        let mentions = coref.detect_mentions(text).unwrap();
3383
3384        let their = mentions.iter().find(|m| m.text.to_lowercase() == "their");
3385        assert!(their.is_some(), "Should detect 'their'");
3386        assert_eq!(
3387            their.unwrap().number,
3388            Some(Number::Unknown),
3389            "'their' should have Number::Unknown"
3390        );
3391    }
3392
3393    #[test]
3394    fn test_themself_vs_themselves() {
3395        // "themself" is explicitly singular (singular they reflexive)
3396        // "themselves" is explicitly plural
3397        let coref = MentionRankingCoref::new();
3398
3399        let text = "The student prepared themself. The students prepared themselves.";
3400        let mentions = coref.detect_mentions(text).unwrap();
3401
3402        let themself = mentions
3403            .iter()
3404            .find(|m| m.text.to_lowercase() == "themself");
3405        let themselves = mentions
3406            .iter()
3407            .find(|m| m.text.to_lowercase() == "themselves");
3408
3409        assert!(themself.is_some(), "Should detect 'themself'");
3410        assert!(themselves.is_some(), "Should detect 'themselves'");
3411
3412        assert_eq!(
3413            themself.unwrap().number,
3414            Some(Number::Singular),
3415            "'themself' is explicitly singular"
3416        );
3417        assert_eq!(
3418            themselves.unwrap().number,
3419            Some(Number::Plural),
3420            "'themselves' is explicitly plural"
3421        );
3422    }
3423
3424    // =========================================================================
3425    // Neopronoun tests
3426    // =========================================================================
3427
3428    #[test]
3429    fn test_neopronoun_ze_hir() {
3430        let coref = MentionRankingCoref::new();
3431
3432        let text = "Ze told me to text hir, but I don't have hirs number.";
3433        let mentions = coref.detect_mentions(text).unwrap();
3434
3435        let ze = mentions.iter().find(|m| m.text.to_lowercase() == "ze");
3436        let hir = mentions.iter().find(|m| m.text.to_lowercase() == "hir");
3437        let hirs = mentions.iter().find(|m| m.text.to_lowercase() == "hirs");
3438
3439        assert!(ze.is_some(), "Should detect 'ze'");
3440        assert!(hir.is_some(), "Should detect 'hir'");
3441        assert!(hirs.is_some(), "Should detect 'hirs'");
3442
3443        // All neopronouns are grammatically singular
3444        assert_eq!(ze.unwrap().number, Some(Number::Singular));
3445        assert_eq!(hir.unwrap().number, Some(Number::Singular));
3446        assert_eq!(hirs.unwrap().number, Some(Number::Singular));
3447
3448        // All use Gender::Unknown (nonbinary)
3449        assert_eq!(ze.unwrap().gender, Some(Gender::Unknown));
3450    }
3451
3452    #[test]
3453    fn test_neopronoun_xe_xem() {
3454        let coref = MentionRankingCoref::new();
3455
3456        let text = "Xe said xem would bring xyr notes.";
3457        let mentions = coref.detect_mentions(text).unwrap();
3458
3459        let xe = mentions.iter().find(|m| m.text.to_lowercase() == "xe");
3460        let xem = mentions.iter().find(|m| m.text.to_lowercase() == "xem");
3461        let xyr = mentions.iter().find(|m| m.text.to_lowercase() == "xyr");
3462
3463        assert!(xe.is_some(), "Should detect 'xe'");
3464        assert!(xem.is_some(), "Should detect 'xem'");
3465        assert!(xyr.is_some(), "Should detect 'xyr'");
3466
3467        assert_eq!(xe.unwrap().number, Some(Number::Singular));
3468        assert_eq!(xe.unwrap().gender, Some(Gender::Unknown));
3469    }
3470
3471    #[test]
3472    fn test_neopronoun_spivak_ey_em() {
3473        let coref = MentionRankingCoref::new();
3474
3475        let text = "Ey told me to call em later.";
3476        let mentions = coref.detect_mentions(text).unwrap();
3477
3478        let ey = mentions.iter().find(|m| m.text.to_lowercase() == "ey");
3479        let em = mentions.iter().find(|m| m.text.to_lowercase() == "em");
3480
3481        assert!(ey.is_some(), "Should detect 'ey' (Spivak pronoun)");
3482        assert!(em.is_some(), "Should detect 'em' (Spivak pronoun)");
3483
3484        assert_eq!(ey.unwrap().number, Some(Number::Singular));
3485    }
3486
3487    #[test]
3488    fn test_neopronoun_fae_faer() {
3489        let coref = MentionRankingCoref::new();
3490
3491        let text = "Fae said faer class was cancelled.";
3492        let mentions = coref.detect_mentions(text).unwrap();
3493
3494        let fae = mentions.iter().find(|m| m.text.to_lowercase() == "fae");
3495        let faer = mentions.iter().find(|m| m.text.to_lowercase() == "faer");
3496
3497        assert!(fae.is_some(), "Should detect 'fae'");
3498        assert!(faer.is_some(), "Should detect 'faer'");
3499
3500        assert_eq!(fae.unwrap().number, Some(Number::Singular));
3501    }
3502
3503    // =========================================================================
3504    // From implementation tests
3505    // =========================================================================
3506
3507    #[test]
3508    fn test_ranked_mention_from_entity() {
3509        let entity = crate::Entity::new("Barack Obama", crate::EntityType::Person, 0, 12, 0.95);
3510        let mention = RankedMention::from(&entity);
3511
3512        assert_eq!(mention.start, 0);
3513        assert_eq!(mention.end, 12);
3514        assert_eq!(mention.text, "Barack Obama");
3515        assert_eq!(mention.head, "Obama"); // Last word
3516        assert_eq!(mention.mention_type, MentionType::Proper);
3517    }
3518
3519    #[test]
3520    fn test_ranked_mention_to_coref_mention() {
3521        let mention = RankedMention {
3522            start: 10,
3523            end: 20,
3524            text: "the patient".to_string(),
3525            mention_type: MentionType::Nominal,
3526            gender: Some(Gender::Unknown),
3527            number: Some(Number::Singular),
3528            head: "patient".to_string(),
3529        };
3530
3531        let coref_mention: anno_core::Mention = (&mention).into();
3532
3533        assert_eq!(coref_mention.start, 10);
3534        assert_eq!(coref_mention.end, 20);
3535        assert_eq!(coref_mention.text, "the patient");
3536        assert_eq!(coref_mention.mention_type, Some(MentionType::Nominal));
3537    }
3538
3539    #[test]
3540    fn test_ranked_mention_span() {
3541        let mention = RankedMention {
3542            start: 5,
3543            end: 15,
3544            text: "test".to_string(),
3545            mention_type: MentionType::Nominal,
3546            gender: None,
3547            number: None,
3548            head: "test".to_string(),
3549        };
3550
3551        assert_eq!(mention.span(), (5, 15));
3552    }
3553
3554    // =========================================================================
3555    // Pronoun coreference with nominal adjectives
3556    // =========================================================================
3557
3558    #[test]
3559    fn test_nominal_adjective_pronoun_resolution() {
3560        // This tests the key insight from J2N: detecting "the poor" enables
3561        // resolving "they" that refers to this group.
3562        let config = MentionRankingConfig {
3563            enable_nominal_adjective_detection: true,
3564            link_threshold: 0.1, // Low threshold for pronoun linking
3565            ..Default::default()
3566        };
3567        let coref = MentionRankingCoref::with_config(config);
3568
3569        // Use sentence-final position for "the poor" to ensure detection
3570        let text = "We must help the poor. They deserve better.";
3571
3572        // First verify detection works
3573        let detected = coref.detect_mentions(text).unwrap();
3574        let detected_texts: Vec<_> = detected.iter().map(|m| m.text.as_str()).collect();
3575
3576        assert!(
3577            detected.iter().any(|m| m.text.to_lowercase() == "the poor"),
3578            "Should detect 'the poor' in detect_mentions: {:?}",
3579            detected_texts
3580        );
3581        assert!(
3582            detected.iter().any(|m| m.text.to_lowercase() == "they"),
3583            "Should detect 'They' in detect_mentions: {:?}",
3584            detected_texts
3585        );
3586
3587        // Verify scoring: "They" should have positive score with "the poor"
3588        let the_poor = detected
3589            .iter()
3590            .find(|m| m.text.to_lowercase() == "the poor")
3591            .unwrap();
3592        let they = detected
3593            .iter()
3594            .find(|m| m.text.to_lowercase() == "they")
3595            .unwrap();
3596
3597        let distance = they.start.saturating_sub(the_poor.end);
3598        let score = coref.score_pair(they, the_poor, distance, Some(text));
3599
3600        // With Number::Unknown for "they" and Number::Plural for "the poor",
3601        // there should be no number mismatch penalty (Unknown is compatible with any)
3602        assert!(
3603            score > -0.5,
3604            "Score between 'They' and 'the poor' should not be strongly negative, got {}",
3605            score
3606        );
3607
3608        // Note: Clustering only includes mentions that form links.
3609        // If the score is above threshold, they'll be clustered together.
3610        // If not, they remain singletons (not in any cluster).
3611        // This is expected behavior - the key benefit is detection, not guaranteed linking.
3612    }
3613
3614    // =========================================================================
3615    // Neopronoun detection tests (GICoref/MISGENDERED datasets)
3616    // =========================================================================
3617
3618    #[test]
3619    fn test_neopronoun_xe_detection() {
3620        let coref = MentionRankingCoref::new();
3621        let text = "Alex introduced xemself. Xe said xe was happy to be here.";
3622        let mentions = coref.detect_mentions(text).unwrap();
3623
3624        let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
3625        assert!(
3626            texts.contains(&"xemself".to_string()),
3627            "Should detect 'xemself': {:?}",
3628            texts
3629        );
3630        assert!(
3631            texts.contains(&"xe".to_string()),
3632            "Should detect 'xe': {:?}",
3633            texts
3634        );
3635    }
3636
3637    #[test]
3638    fn test_neopronoun_ze_detection() {
3639        let coref = MentionRankingCoref::new();
3640        let text = "Jordan uses ze/hir pronouns. Hir presentation was excellent.";
3641        let mentions = coref.detect_mentions(text).unwrap();
3642
3643        let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
3644        assert!(
3645            texts.contains(&"ze".to_string()),
3646            "Should detect 'ze': {:?}",
3647            texts
3648        );
3649        assert!(
3650            texts.contains(&"hir".to_string()),
3651            "Should detect 'hir': {:?}",
3652            texts
3653        );
3654    }
3655
3656    #[test]
3657    fn test_neopronoun_ey_detection() {
3658        let coref = MentionRankingCoref::new();
3659        let text = "Sam asked em to pass eir notebook.";
3660        let mentions = coref.detect_mentions(text).unwrap();
3661
3662        let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
3663        assert!(
3664            texts.contains(&"em".to_string()),
3665            "Should detect 'em': {:?}",
3666            texts
3667        );
3668        assert!(
3669            texts.contains(&"eir".to_string()),
3670            "Should detect 'eir': {:?}",
3671            texts
3672        );
3673    }
3674
3675    #[test]
3676    fn test_neopronoun_fae_detection() {
3677        let coref = MentionRankingCoref::new();
3678        let text = "River explained faer perspective. Fae was very articulate.";
3679        let mentions = coref.detect_mentions(text).unwrap();
3680
3681        let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
3682        assert!(
3683            texts.contains(&"faer".to_string()),
3684            "Should detect 'faer': {:?}",
3685            texts
3686        );
3687        assert!(
3688            texts.contains(&"fae".to_string()),
3689            "Should detect 'fae': {:?}",
3690            texts
3691        );
3692    }
3693
3694    #[test]
3695    fn test_neopronoun_gender_and_number() {
3696        let coref = MentionRankingCoref::new();
3697        let text = "Xe arrived early.";
3698        let mentions = coref.detect_mentions(text).unwrap();
3699
3700        let xe_mention = mentions.iter().find(|m| m.text.to_lowercase() == "xe");
3701        assert!(xe_mention.is_some(), "Should detect 'xe'");
3702
3703        let xe = xe_mention.unwrap();
3704        // Neopronouns are singular and gender-unknown (non-binary)
3705        assert_eq!(
3706            xe.number,
3707            Some(Number::Singular),
3708            "Neopronouns are singular"
3709        );
3710        assert_eq!(
3711            xe.gender,
3712            Some(Gender::Unknown),
3713            "Neopronouns use Unknown gender"
3714        );
3715    }
3716
3717    #[test]
3718    fn test_neopronoun_coreference_linking() {
3719        // Test that neopronouns are detected and have correct properties
3720        // for coreference linking (proper noun detection requires NER,
3721        // which is beyond mention_ranking's scope)
3722        let coref = MentionRankingCoref::new();
3723        let text = "Xe said xe would be late. Xem was right.";
3724        let mentions = coref.detect_mentions(text).unwrap();
3725
3726        // All neopronouns should be detected
3727        let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
3728        assert!(
3729            texts.iter().filter(|t| *t == "xe").count() >= 2,
3730            "Should detect multiple 'xe': {:?}",
3731            texts
3732        );
3733        assert!(
3734            texts.contains(&"xem".to_string()),
3735            "Should detect 'xem': {:?}",
3736            texts
3737        );
3738
3739        // All should be pronominal type
3740        for m in &mentions {
3741            if ["xe", "xem"].contains(&m.text.to_lowercase().as_str()) {
3742                assert_eq!(
3743                    m.mention_type,
3744                    MentionType::Pronominal,
3745                    "Neopronouns should be Pronominal type"
3746                );
3747            }
3748        }
3749    }
3750
3751    // =========================================================================
3752    // Number::Dual compatibility tests (Arabic, Hebrew, Sanskrit)
3753    // =========================================================================
3754
3755    #[test]
3756    fn test_dual_number_compatibility_scoring() {
3757        // Dual should be compatible with Plural (but not exact match)
3758        // This is important for languages like Arabic, Hebrew, Sanskrit
3759        // where dual forms are distinct from plural
3760        let coref = MentionRankingCoref::new();
3761
3762        // Create mentions manually to test scoring
3763        let dual_mention = RankedMention {
3764            start: 0,
3765            end: 5,
3766            text: "كتابان".to_string(), // Arabic dual: "two books"
3767            mention_type: MentionType::Nominal,
3768            gender: Some(Gender::Neutral),
3769            number: Some(Number::Dual),
3770            head: "كتابان".to_string(),
3771        };
3772
3773        let plural_mention = RankedMention {
3774            start: 10,
3775            end: 15,
3776            text: "هم".to_string(), // Arabic plural pronoun: "they"
3777            mention_type: MentionType::Pronominal,
3778            gender: Some(Gender::Unknown),
3779            number: Some(Number::Plural),
3780            head: "هم".to_string(),
3781        };
3782
3783        let singular_mention = RankedMention {
3784            start: 20,
3785            end: 22,
3786            text: "هو".to_string(), // Arabic singular: "he"
3787            mention_type: MentionType::Pronominal,
3788            gender: Some(Gender::Masculine),
3789            number: Some(Number::Singular),
3790            head: "هو".to_string(),
3791        };
3792
3793        // Test Number::is_compatible directly
3794        assert!(
3795            Number::Dual.is_compatible(&Number::Plural),
3796            "Dual should be compatible with Plural"
3797        );
3798        assert!(
3799            !Number::Dual.is_compatible(&Number::Singular),
3800            "Dual should NOT be compatible with Singular"
3801        );
3802
3803        // Dual ↔ Plural should score better than Dual ↔ Singular
3804        let score_dual_plural = coref.score_pair(&plural_mention, &dual_mention, 5, None);
3805        let score_dual_singular = coref.score_pair(&singular_mention, &dual_mention, 5, None);
3806
3807        assert!(
3808            score_dual_plural > score_dual_singular,
3809            "Dual-Plural score ({}) should be higher than Dual-Singular ({})",
3810            score_dual_plural,
3811            score_dual_singular
3812        );
3813    }
3814
3815    #[test]
3816    fn test_number_compatibility_unknown() {
3817        // Number::Unknown should be compatible with all other values
3818        // This is critical for singular they, "you", etc.
3819        assert!(Number::Unknown.is_compatible(&Number::Singular));
3820        assert!(Number::Unknown.is_compatible(&Number::Plural));
3821        assert!(Number::Unknown.is_compatible(&Number::Dual));
3822        assert!(Number::Unknown.is_compatible(&Number::Unknown));
3823
3824        // The coreference scorer should not penalize Unknown mismatches
3825        let coref = MentionRankingCoref::new();
3826
3827        let they_mention = RankedMention {
3828            start: 0,
3829            end: 4,
3830            text: "They".to_string(),
3831            mention_type: MentionType::Pronominal,
3832            gender: Some(Gender::Unknown),
3833            number: Some(Number::Unknown), // Singular or plural
3834            head: "They".to_string(),
3835        };
3836
3837        let singular_mention = RankedMention {
3838            start: 10,
3839            end: 14,
3840            text: "Alex".to_string(),
3841            mention_type: MentionType::Proper,
3842            gender: Some(Gender::Unknown),
3843            number: Some(Number::Singular),
3844            head: "Alex".to_string(),
3845        };
3846
3847        let plural_mention = RankedMention {
3848            start: 20,
3849            end: 30,
3850            text: "the students".to_string(),
3851            mention_type: MentionType::Nominal,
3852            gender: Some(Gender::Unknown),
3853            number: Some(Number::Plural),
3854            head: "students".to_string(),
3855        };
3856
3857        // Both should get non-negative scores (Unknown is compatible with both)
3858        let score_they_singular = coref.score_pair(&they_mention, &singular_mention, 5, None);
3859        let score_they_plural = coref.score_pair(&they_mention, &plural_mention, 5, None);
3860
3861        // Neither should be penalized for number mismatch
3862        assert!(
3863            score_they_singular > -1.0,
3864            "'They' ↔ singular should not be heavily penalized: {}",
3865            score_they_singular
3866        );
3867        assert!(
3868            score_they_plural > -1.0,
3869            "'They' ↔ plural should not be heavily penalized: {}",
3870            score_they_plural
3871        );
3872    }
3873
3874    // =========================================================================
3875    // Pleonastic "it" detection tests
3876    // =========================================================================
3877
3878    #[test]
3879    fn test_pleonastic_it_weather() {
3880        // Weather expressions should NOT detect "it" as a referring pronoun
3881        let coref = MentionRankingCoref::new();
3882
3883        let weather_texts = [
3884            "It rains every day in Seattle.",
3885            "It is raining outside.",
3886            "It snows heavily in winter.",
3887            "It was snowing when we arrived.",
3888            "It thundered all night.",
3889        ];
3890
3891        for text in weather_texts {
3892            let mentions = coref.detect_mentions(text).unwrap();
3893            let has_it = mentions.iter().any(|m| m.text.to_lowercase() == "it");
3894            assert!(
3895                !has_it,
3896                "Weather 'it' should be filtered as pleonastic in: '{}'\nDetected: {:?}",
3897                text,
3898                mentions.iter().map(|m| &m.text).collect::<Vec<_>>()
3899            );
3900        }
3901    }
3902
3903    #[test]
3904    fn test_pleonastic_it_weather_adjectives() {
3905        let coref = MentionRankingCoref::new();
3906
3907        let weather_adj_texts = [
3908            "It is sunny today.",
3909            "It was cold last night.",
3910            "It's foggy this morning.",
3911            "It will be warm tomorrow.",
3912        ];
3913
3914        for text in weather_adj_texts {
3915            let mentions = coref.detect_mentions(text).unwrap();
3916            let has_it = mentions.iter().any(|m| m.text.to_lowercase() == "it");
3917            assert!(
3918                !has_it,
3919                "Weather adjective 'it' should be filtered: '{}'\nDetected: {:?}",
3920                text,
3921                mentions.iter().map(|m| &m.text).collect::<Vec<_>>()
3922            );
3923        }
3924    }
3925
3926    #[test]
3927    fn test_pleonastic_it_modal() {
3928        let coref = MentionRankingCoref::new();
3929
3930        let modal_texts = [
3931            "It is important that we finish on time.",
3932            "It is likely that he will arrive late.",
3933            "It was clear that something was wrong.",
3934            "It is necessary to complete the form.",
3935            "It's obvious that she was upset.",
3936        ];
3937
3938        for text in modal_texts {
3939            let mentions = coref.detect_mentions(text).unwrap();
3940            let has_it = mentions.iter().any(|m| m.text.to_lowercase() == "it");
3941            assert!(
3942                !has_it,
3943                "Modal 'it' should be filtered: '{}'\nDetected: {:?}",
3944                text,
3945                mentions.iter().map(|m| &m.text).collect::<Vec<_>>()
3946            );
3947        }
3948    }
3949
3950    #[test]
3951    fn test_pleonastic_it_cognitive_verbs() {
3952        let coref = MentionRankingCoref::new();
3953
3954        let cognitive_texts = [
3955            "It seems that the project is delayed.",
3956            "It appears he was mistaken.",
3957            "It turns out she was right.",
3958            "It happened that we met by chance.",
3959        ];
3960
3961        for text in cognitive_texts {
3962            let mentions = coref.detect_mentions(text).unwrap();
3963            let has_it = mentions.iter().any(|m| m.text.to_lowercase() == "it");
3964            assert!(
3965                !has_it,
3966                "Cognitive verb 'it' should be filtered: '{}'\nDetected: {:?}",
3967                text,
3968                mentions.iter().map(|m| &m.text).collect::<Vec<_>>()
3969            );
3970        }
3971    }
3972
3973    #[test]
3974    fn test_referential_it_not_filtered() {
3975        // Referential "it" should still be detected
3976        let coref = MentionRankingCoref::new();
3977
3978        let referential_texts = [
3979            "I read the book. It was fascinating.",
3980            "The car broke down. We had to push it.",
3981            "She gave him a gift. He loved it.",
3982        ];
3983
3984        for text in referential_texts {
3985            let mentions = coref.detect_mentions(text).unwrap();
3986            let has_it = mentions.iter().any(|m| m.text.to_lowercase() == "it");
3987            assert!(
3988                has_it,
3989                "Referential 'it' should be detected: '{}'\nDetected: {:?}",
3990                text,
3991                mentions.iter().map(|m| &m.text).collect::<Vec<_>>()
3992            );
3993        }
3994    }
3995
3996    #[test]
3997    fn test_pleonastic_it_time_expressions() {
3998        let coref = MentionRankingCoref::new();
3999
4000        let time_texts = [
4001            "It is midnight.",
4002            "It was noon when we left.",
4003            "It is 5 o'clock.",
4004        ];
4005
4006        for text in time_texts {
4007            let mentions = coref.detect_mentions(text).unwrap();
4008            let has_it = mentions.iter().any(|m| m.text.to_lowercase() == "it");
4009            assert!(
4010                !has_it,
4011                "Time expression 'it' should be filtered: '{}'\nDetected: {:?}",
4012                text,
4013                mentions.iter().map(|m| &m.text).collect::<Vec<_>>()
4014            );
4015        }
4016    }
4017
4018    // =========================================================================
4019    // Demonstrative pronoun tests
4020    // =========================================================================
4021
4022    #[test]
4023    fn test_demonstrative_pronoun_detection() {
4024        let coref = MentionRankingCoref::new();
4025
4026        let text = "I saw the problem. This was unexpected. Those are the facts.";
4027        let mentions = coref.detect_mentions(text).unwrap();
4028        let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
4029
4030        assert!(
4031            texts.contains(&"this".to_string()),
4032            "Should detect 'This': {:?}",
4033            texts
4034        );
4035        assert!(
4036            texts.contains(&"those".to_string()),
4037            "Should detect 'Those': {:?}",
4038            texts
4039        );
4040    }
4041
4042    #[test]
4043    fn test_demonstrative_pronoun_number() {
4044        let coref = MentionRankingCoref::new();
4045
4046        // "this" and "that" are singular; "these" and "those" are plural
4047        let text = "This is important. These are facts. That was clear. Those were obvious.";
4048        let mentions = coref.detect_mentions(text).unwrap();
4049
4050        let this_m = mentions.iter().find(|m| m.text.to_lowercase() == "this");
4051        let these_m = mentions.iter().find(|m| m.text.to_lowercase() == "these");
4052        let that_m = mentions.iter().find(|m| m.text.to_lowercase() == "that");
4053        let those_m = mentions.iter().find(|m| m.text.to_lowercase() == "those");
4054
4055        assert_eq!(this_m.map(|m| m.number), Some(Some(Number::Singular)));
4056        assert_eq!(these_m.map(|m| m.number), Some(Some(Number::Plural)));
4057        assert_eq!(that_m.map(|m| m.number), Some(Some(Number::Singular)));
4058        assert_eq!(those_m.map(|m| m.number), Some(Some(Number::Plural)));
4059    }
4060
4061    // =========================================================================
4062    // Indefinite pronoun tests
4063    // =========================================================================
4064
4065    #[test]
4066    fn test_indefinite_pronoun_detection() {
4067        let coref = MentionRankingCoref::new();
4068
4069        let text = "Someone called yesterday. Everyone was surprised.";
4070        let mentions = coref.detect_mentions(text).unwrap();
4071        let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
4072
4073        assert!(
4074            texts.contains(&"someone".to_string()),
4075            "Should detect 'Someone': {:?}",
4076            texts
4077        );
4078        assert!(
4079            texts.contains(&"everyone".to_string()),
4080            "Should detect 'Everyone': {:?}",
4081            texts
4082        );
4083    }
4084
4085    #[test]
4086    fn test_indefinite_pronouns_are_singular() {
4087        // "Everyone", "someone", "nobody" are grammatically singular
4088        // even though they can refer to multiple people conceptually
4089        let coref = MentionRankingCoref::new();
4090
4091        let text = "Everyone was there. Nobody left early.";
4092        let mentions = coref.detect_mentions(text).unwrap();
4093
4094        let everyone_m = mentions
4095            .iter()
4096            .find(|m| m.text.to_lowercase() == "everyone");
4097        let nobody_m = mentions.iter().find(|m| m.text.to_lowercase() == "nobody");
4098
4099        assert!(everyone_m.is_some(), "Should detect 'Everyone'");
4100        assert!(nobody_m.is_some(), "Should detect 'Nobody'");
4101
4102        assert_eq!(
4103            everyone_m.unwrap().number,
4104            Some(Number::Singular),
4105            "'everyone' is grammatically singular"
4106        );
4107        assert_eq!(
4108            nobody_m.unwrap().number,
4109            Some(Number::Singular),
4110            "'nobody' is grammatically singular"
4111        );
4112    }
4113
4114    #[test]
4115    fn test_impersonal_one_detection() {
4116        // Generic "one" is an impersonal pronoun
4117        let coref = MentionRankingCoref::new();
4118
4119        let text = "One should always be prepared. One never knows what might happen.";
4120        let mentions = coref.detect_mentions(text).unwrap();
4121        let one_count = mentions
4122            .iter()
4123            .filter(|m| m.text.to_lowercase() == "one")
4124            .count();
4125
4126        assert!(
4127            one_count >= 2,
4128            "Should detect impersonal 'one': {:?}",
4129            mentions.iter().map(|m| &m.text).collect::<Vec<_>>()
4130        );
4131    }
4132
4133    // =========================================================================
4134    // Reflexive pronoun tests
4135    // =========================================================================
4136
4137    #[test]
4138    fn test_reflexive_pronoun_detection() {
4139        let coref = MentionRankingCoref::new();
4140
4141        let text = "John saw himself in the mirror. Mary hurt herself.";
4142        let mentions = coref.detect_mentions(text).unwrap();
4143        let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
4144
4145        assert!(
4146            texts.contains(&"himself".to_string()),
4147            "Should detect 'himself': {:?}",
4148            texts
4149        );
4150        assert!(
4151            texts.contains(&"herself".to_string()),
4152            "Should detect 'herself': {:?}",
4153            texts
4154        );
4155    }
4156
4157    #[test]
4158    fn test_reflexive_pronoun_gender() {
4159        let coref = MentionRankingCoref::new();
4160
4161        let text = "He saw himself. She saw herself. It fixed itself.";
4162        let mentions = coref.detect_mentions(text).unwrap();
4163
4164        let himself = mentions.iter().find(|m| m.text.to_lowercase() == "himself");
4165        let herself = mentions.iter().find(|m| m.text.to_lowercase() == "herself");
4166        let itself = mentions.iter().find(|m| m.text.to_lowercase() == "itself");
4167
4168        assert!(himself.is_some(), "Should detect 'himself'");
4169        assert!(herself.is_some(), "Should detect 'herself'");
4170        assert!(itself.is_some(), "Should detect 'itself'");
4171
4172        assert_eq!(himself.unwrap().gender, Some(Gender::Masculine));
4173        assert_eq!(herself.unwrap().gender, Some(Gender::Feminine));
4174        assert_eq!(itself.unwrap().gender, Some(Gender::Neutral));
4175    }
4176
4177    // =========================================================================
4178    // Reciprocal pronoun tests
4179    // =========================================================================
4180
4181    #[test]
4182    fn test_reciprocal_pronoun_detection() {
4183        let coref = MentionRankingCoref::new();
4184
4185        let text = "John and Mary looked at each other. The teams competed against one another.";
4186        let mentions = coref.detect_mentions(text).unwrap();
4187        let texts: Vec<_> = mentions.iter().map(|m| m.text.to_lowercase()).collect();
4188
4189        assert!(
4190            texts.contains(&"each other".to_string()),
4191            "Should detect 'each other': {:?}",
4192            texts
4193        );
4194        assert!(
4195            texts.contains(&"one another".to_string()),
4196            "Should detect 'one another': {:?}",
4197            texts
4198        );
4199    }
4200
4201    #[test]
4202    fn test_reciprocal_pronouns_are_plural() {
4203        // Reciprocals require plural antecedents
4204        let coref = MentionRankingCoref::new();
4205
4206        let text = "They helped each other.";
4207        let mentions = coref.detect_mentions(text).unwrap();
4208
4209        let each_other = mentions
4210            .iter()
4211            .find(|m| m.text.to_lowercase() == "each other");
4212        assert!(each_other.is_some(), "Should detect 'each other'");
4213        assert_eq!(
4214            each_other.unwrap().number,
4215            Some(Number::Plural),
4216            "Reciprocals are grammatically plural"
4217        );
4218    }
4219
4220    // =========================================================================
4221    // Property-based tests for mention detection invariants
4222    // =========================================================================
4223    //
4224    // These test real invariants that catch actual bugs:
4225    // - Spans within bounds (prevents panics)
4226    // - Valid Unicode (no slicing mid-character)
4227    // - Phi-feature consistency (catches logic errors)
4228
4229    use proptest::prelude::*;
4230
4231    /// Generate ASCII text with some pronouns embedded
4232    fn text_with_pronouns() -> impl Strategy<Value = String> {
4233        prop::collection::vec(
4234            prop_oneof![
4235                Just("he".to_string()),
4236                Just("she".to_string()),
4237                Just("they".to_string()),
4238                Just("it".to_string()),
4239                Just("the dog".to_string()),
4240                Just("John".to_string()),
4241                "[a-z]{3,10}".prop_map(|s| s),
4242            ],
4243            3..15,
4244        )
4245        .prop_map(|words| words.join(" ") + ".")
4246    }
4247
4248    // =========================================================================
4249    // Multilingual Nominal Adjective Tests
4250    // =========================================================================
4251
4252    #[test]
4253    fn test_multilingual_nominal_adjective_german() {
4254        let config = MentionRankingConfig {
4255            enable_nominal_adjective_detection: true,
4256            language: "de".to_string(),
4257            ..Default::default()
4258        };
4259
4260        let coref = MentionRankingCoref::with_config(config);
4261        let text = "Die Armen leiden unter der Krise.";
4262        let mentions = coref.detect_mentions(text).unwrap();
4263
4264        let has_armen = mentions
4265            .iter()
4266            .any(|m| m.text.to_lowercase().contains("armen"));
4267        assert!(
4268            has_armen,
4269            "Should detect 'die Armen' as a nominal adjective in German"
4270        );
4271    }
4272
4273    #[test]
4274    fn test_multilingual_nominal_adjective_french() {
4275        let config = MentionRankingConfig {
4276            enable_nominal_adjective_detection: true,
4277            language: "fr".to_string(),
4278            ..Default::default()
4279        };
4280
4281        let coref = MentionRankingCoref::with_config(config);
4282        let text = "Les pauvres ont besoin d'aide.";
4283        let mentions = coref.detect_mentions(text).unwrap();
4284
4285        let has_pauvres = mentions
4286            .iter()
4287            .any(|m| m.text.to_lowercase().contains("pauvres"));
4288        assert!(
4289            has_pauvres,
4290            "Should detect 'les pauvres' as a nominal adjective in French"
4291        );
4292    }
4293
4294    #[test]
4295    fn test_multilingual_nominal_adjective_spanish() {
4296        let config = MentionRankingConfig {
4297            enable_nominal_adjective_detection: true,
4298            language: "es".to_string(),
4299            ..Default::default()
4300        };
4301
4302        let coref = MentionRankingCoref::with_config(config);
4303        let text = "Los pobres necesitan ayuda.";
4304        let mentions = coref.detect_mentions(text).unwrap();
4305
4306        let has_pobres = mentions
4307            .iter()
4308            .any(|m| m.text.to_lowercase().contains("pobres"));
4309        assert!(
4310            has_pobres,
4311            "Should detect 'los pobres' as a nominal adjective in Spanish"
4312        );
4313    }
4314
4315    #[test]
4316    fn test_config_language_field() {
4317        // Default should be English
4318        let config = MentionRankingConfig::default();
4319        assert_eq!(config.language, "en");
4320
4321        // Book scale should default to English
4322        let book_config = MentionRankingConfig::book_scale();
4323        assert_eq!(book_config.language, "en");
4324
4325        // Clinical should default to English
4326        let clinical_config = MentionRankingConfig::clinical();
4327        assert_eq!(clinical_config.language, "en");
4328    }
4329
4330    proptest! {
4331        #![proptest_config(ProptestConfig::with_cases(50))]
4332
4333        /// All detected mentions have spans within text bounds
4334        ///
4335        /// This catches off-by-one errors and Unicode slicing bugs.
4336        #[test]
4337        fn mention_spans_within_bounds(text in text_with_pronouns()) {
4338            let coref = MentionRankingCoref::new();
4339            if let Ok(mentions) = coref.detect_mentions(&text) {
4340                let char_count = text.chars().count();
4341                for mention in &mentions {
4342                    prop_assert!(
4343                        mention.start <= mention.end,
4344                        "Start {} > end {} for '{}'",
4345                        mention.start, mention.end, mention.text
4346                    );
4347                    prop_assert!(
4348                        mention.end <= char_count,
4349                        "End {} > text length {} for '{}'",
4350                        mention.end, char_count, mention.text
4351                    );
4352                }
4353            }
4354        }
4355
4356        /// Extracted mention text matches the span
4357        ///
4358        /// Verifies we're using character offsets correctly.
4359        #[test]
4360        fn mention_text_matches_span(text in text_with_pronouns()) {
4361            let coref = MentionRankingCoref::new();
4362            if let Ok(mentions) = coref.detect_mentions(&text) {
4363                for mention in &mentions {
4364                    let extracted: String = text.chars()
4365                        .skip(mention.start)
4366                        .take(mention.end - mention.start)
4367                        .collect();
4368                    // Case-insensitive comparison (we lowercase during detection)
4369                    prop_assert_eq!(
4370                        extracted.to_lowercase(),
4371                        mention.text.to_lowercase(),
4372                        "Extracted text doesn't match stored text"
4373                    );
4374                }
4375            }
4376        }
4377
4378        /// Pronouns always have MentionType::Pronominal
4379        #[test]
4380        fn pronouns_are_pronominal(text in text_with_pronouns()) {
4381            let coref = MentionRankingCoref::new();
4382            if let Ok(mentions) = coref.detect_mentions(&text) {
4383                let pronouns = ["he", "she", "it", "they", "him", "her", "them"];
4384                for mention in &mentions {
4385                    if pronouns.contains(&mention.text.to_lowercase().as_str()) {
4386                        prop_assert_eq!(
4387                            mention.mention_type,
4388                            MentionType::Pronominal,
4389                            "'{}' should be Pronominal",
4390                            mention.text
4391                        );
4392                    }
4393                }
4394            }
4395        }
4396
4397        /// Gender is always set for detected pronouns
4398        #[test]
4399        fn pronouns_have_gender(text in text_with_pronouns()) {
4400            let coref = MentionRankingCoref::new();
4401            if let Ok(mentions) = coref.detect_mentions(&text) {
4402                for mention in &mentions {
4403                    if mention.mention_type == MentionType::Pronominal {
4404                        prop_assert!(
4405                            mention.gender.is_some(),
4406                            "Pronoun '{}' should have gender",
4407                            mention.text
4408                        );
4409                    }
4410                }
4411            }
4412        }
4413
4414        /// Number is always set for detected pronouns
4415        #[test]
4416        fn pronouns_have_number(text in text_with_pronouns()) {
4417            let coref = MentionRankingCoref::new();
4418            if let Ok(mentions) = coref.detect_mentions(&text) {
4419                for mention in &mentions {
4420                    if mention.mention_type == MentionType::Pronominal {
4421                        prop_assert!(
4422                            mention.number.is_some(),
4423                            "Pronoun '{}' should have number",
4424                            mention.text
4425                        );
4426                    }
4427                }
4428            }
4429        }
4430
4431        /// Coreference clusters partition mentions (no overlaps, no orphans)
4432        #[test]
4433        fn clusters_partition_mentions(text in text_with_pronouns()) {
4434            let coref = MentionRankingCoref::new();
4435            if let Ok(clusters) = coref.resolve(&text) {
4436                // Flatten all mentions from clusters
4437                let mut all_mentions: Vec<_> = clusters.iter()
4438                    .flat_map(|c| &c.mentions)
4439                    .collect();
4440
4441                // Check no duplicates (by span)
4442                let original_len = all_mentions.len();
4443                all_mentions.sort_by_key(|m| (m.start, m.end));
4444                all_mentions.dedup_by_key(|m| (m.start, m.end));
4445                prop_assert_eq!(
4446                    all_mentions.len(),
4447                    original_len,
4448                    "Duplicate mentions across clusters"
4449                );
4450            }
4451        }
4452
4453        /// Score pair is deterministic
4454        ///
4455        /// Same inputs should always produce same score.
4456        #[test]
4457        fn score_pair_deterministic(text in text_with_pronouns()) {
4458            let coref = MentionRankingCoref::new();
4459            if let Ok(mentions) = coref.detect_mentions(&text) {
4460                if mentions.len() >= 2 {
4461                    let distance = mentions[1].start.saturating_sub(mentions[0].end);
4462                    let score1 = coref.score_pair(&mentions[0], &mentions[1], distance, Some(&text));
4463                    let score2 = coref.score_pair(&mentions[0], &mentions[1], distance, Some(&text));
4464                    prop_assert!(
4465                        (score1 - score2).abs() < 0.0001,
4466                        "Scoring should be deterministic"
4467                    );
4468                }
4469            }
4470        }
4471    }
4472}