anno/backends/mention_ranking/types.rs
1//! Configuration and data types for mention-ranking coreference.
2
3#[allow(unused_imports)]
4use super::*;
5
6#[allow(unused_imports)]
7use crate::{Model, Result};
8use anno_core::{Gender, MentionType};
9#[allow(unused_imports)]
10use std::collections::{HashMap, HashSet};
11
12/// A scored mention pair for easy-first clustering.
13#[derive(Debug, Clone)]
14pub(super) struct ScoredPair {
15 pub(super) mention_idx: usize,
16 pub(super) antecedent_idx: usize,
17 pub(super) score: f64,
18}
19
/// How mentions are linked together into clusters.
///
/// # Research Context (Bourgois & Poibeau 2025)
///
/// The paper contrasts two ways of ordering linking decisions:
/// - **Left-to-right**: the traditional approach, handling mentions in document order
/// - **Easy-first**: high-confidence decisions are taken first and constrain later ones
///
/// Easy-first combined with global proper noun coreference can improve outcomes on long documents.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum ClusteringStrategy {
    /// Traditional strategy: mentions are processed left-to-right in document order.
    ///
    /// This is the default variant.
    #[default]
    LeftToRight,
    /// Mentions are processed by confidence score, highest confidence first.
    ///
    /// High-confidence decisions constrain the decisions made after them, and
    /// non-coreference predictions can prevent incorrect merges.
    EasyFirst,
}
39
40/// Configuration for mention-ranking coref.
41///
42/// # Research-Informed Defaults
43///
44/// The defaults are informed by findings from Bourgois & Poibeau (2025):
45/// - Pronouns tend to have shorter antecedent distances than proper nouns
46/// - Proper/common nouns can span thousands of mentions
47/// - Type-specific limits outperform uniform limits
48///
49/// # Example
50///
51/// ```rust
52/// use anno::backends::mention_ranking::{MentionRankingConfig, ClusteringStrategy};
53///
54/// // Book-scale configuration
55/// let config = MentionRankingConfig {
56/// pronoun_max_antecedents: 30, // 95% of pronouns within 7 mentions
57/// proper_max_antecedents: 300, // Proper nouns span further
58/// nominal_max_antecedents: 300, // Common nouns similar to proper
59/// enable_global_proper_coref: true, // Bridge long-distance proper nouns
60/// clustering_strategy: ClusteringStrategy::EasyFirst,
61/// ..Default::default()
62/// };
63/// ```
64#[derive(Debug, Clone)]
65pub struct MentionRankingConfig {
66 /// Minimum score to link mentions.
67 pub link_threshold: f64,
68
69 // =========================================================================
70 // Type-specific antecedent limits (Bourgois & Poibeau 2025)
71 // =========================================================================
72 /// Maximum number of antecedent candidates for pronouns.
73 /// Research shows 95% of pronouns are within 7 mentions of antecedent.
74 /// Default: 30 (conservative buffer above 95th percentile).
75 pub pronoun_max_antecedents: usize,
76
77 /// Maximum number of antecedent candidates for proper nouns.
78 /// Proper nouns can span 1700+ mentions in long documents.
79 /// Default: 300 (covers 99th percentile while remaining tractable).
80 pub proper_max_antecedents: usize,
81
82 /// Maximum number of antecedent candidates for nominal mentions.
83 /// Similar distribution to proper nouns.
84 /// Default: 300.
85 pub nominal_max_antecedents: usize,
86
87 /// Legacy uniform max distance (in characters). Used as fallback.
88 /// Prefer type-specific limits for better accuracy.
89 pub max_distance: usize,
90
91 // =========================================================================
92 // Global proper noun coreference (Bourgois & Poibeau 2025)
93 // =========================================================================
94 /// Enable global proper noun coreference propagation.
95 /// When enabled, high-confidence proper noun coreference decisions are
96 /// propagated document-wide, bridging mentions that exceed local windows.
97 /// Gains 5-10 B³ points on documents >20k tokens.
98 pub enable_global_proper_coref: bool,
99
100 /// Minimum confidence to propagate proper noun coreference globally.
101 /// Only pairs with scores above this threshold are propagated.
102 pub global_proper_threshold: f64,
103
104 // =========================================================================
105 // Easy-first clustering (Clark & Manning 2016, Bourgois & Poibeau 2025)
106 // =========================================================================
107 /// Clustering strategy to use.
108 pub clustering_strategy: ClusteringStrategy,
109
110 /// Enable non-coreference constraints in easy-first clustering.
111 /// High-confidence non-coreference predictions prevent incorrect merges.
112 pub use_non_coref_constraints: bool,
113
114 /// Threshold for non-coreference constraints.
115 /// Pairs with scores below this are treated as definitely non-coreferent.
116 pub non_coref_threshold: f64,
117
118 // =========================================================================
119 // Feature weights
120 // =========================================================================
121 /// Weight for string match features.
122 pub string_match_weight: f64,
123 /// Weight for type compatibility features.
124 pub type_compat_weight: f64,
125 /// Weight for distance feature.
126 pub distance_weight: f64,
127
128 // =========================================================================
129 // Salience integration
130 // =========================================================================
131 /// Weight for salience boost when scoring antecedent candidates.
132 ///
133 /// When > 0, antecedents with higher salience scores receive a boost.
134 /// This helps prefer linking to important/central entities in the document.
135 ///
136 /// Typical values: 0.0 (disabled) to 0.3 (moderate boost).
137 pub salience_weight: f64,
138
139 // =========================================================================
140 // i2b2-inspired rule-based features (Chen et al. 2011)
141 // =========================================================================
142 /// Enable "be phrase" detection for identity linking.
143 /// Patterns like "X is Y" or "resolution of X is Y" strongly indicate coreference.
144 /// From i2b2 clinical coref: achieved high precision on medical texts.
145 pub enable_be_phrase_detection: bool,
146
147 /// Weight for be-phrase identity signal.
148 pub be_phrase_weight: f64,
149
150 /// Enable acronym matching (e.g., "MRSA" ↔ "Methicillin-resistant Staphylococcus aureus").
151 pub enable_acronym_matching: bool,
152
153 /// Weight for acronym match signal.
154 pub acronym_weight: f64,
155
156 /// Enable context-based link filtering.
157 /// Uses surrounding context (dates, locations, modifiers) to filter false links.
158 pub enable_context_filtering: bool,
159
160 /// Enable synonym matching for related terms.
161 ///
162 /// When enabled, uses string similarity (from `anno::coalesce`) as a proxy
163 /// for synonym relationships. High similarity (>0.8) indicates likely synonyms.
164 ///
165 /// For domain-specific synonyms (medical, legal, etc.), implement a custom
166 /// `anno::coalesce::SynonymSource` and integrate it with the resolver.
167 pub enable_synonym_matching: bool,
168
169 /// Weight for synonym match signal.
170 pub synonym_weight: f64,
171
172 // =========================================================================
173 // Nominal adjective detection (J2N: arXiv:2409.14374)
174 // =========================================================================
175 /// Enable detection of nominal adjectives as mentions.
176 ///
177 /// Nominal adjectives are phrases like "the poor", "the elderly", "the accused"
178 /// where an adjective functions as a noun phrase referring to a group of people.
179 ///
180 /// # Linguistic Background
181 ///
182 /// In English, certain adjectives can be "nominalized" when preceded by a
183 /// definite article: "The rich get richer while the poor get poorer."
184 /// Here, "the poor" refers to poor people as a collective group.
185 ///
186 /// # Coreference Impact (J2N Paper)
187 ///
188 /// Qi, Han & Xie (arXiv:2409.14374) showed that correctly detecting these
189 /// as mentions can improve coreference metrics slightly. Without detection, pronouns
190 /// like "they" that refer back to "the poor" become orphaned.
191 ///
192 /// # Grammatical Number
193 ///
194 /// Nominal adjectives are grammatically plural in English:
195 /// - "The poor ARE struggling" (not "is")
196 /// - "The elderly NEED support" (not "needs")
197 ///
198 /// Default: false (for backward compatibility)
199 pub enable_nominal_adjective_detection: bool,
200
201 /// Language for language-specific features (ISO 639-1 code).
202 ///
203 /// When set, enables language-specific patterns for:
204 /// - Nominal adjective detection (German "die Armen", French "les pauvres", etc.)
205 /// - Pronoun resolution rules
206 /// - Gender/number agreement
207 ///
208 /// Supported languages:
209 /// - "en" (default): English
210 /// - "de": German
211 /// - "fr": French
212 /// - "es": Spanish
213 ///
214 /// Default: "en"
215 pub language: String,
216}
217
218impl Default for MentionRankingConfig {
219 fn default() -> Self {
220 Self {
221 link_threshold: 0.3,
222
223 // Type-specific limits (Bourgois & Poibeau 2025)
224 pronoun_max_antecedents: 30, // 95% within 7 mentions
225 proper_max_antecedents: 300, // Can span 1700+ mentions
226 nominal_max_antecedents: 300, // Similar to proper nouns
227
228 // Legacy uniform limit (fallback)
229 max_distance: 100,
230
231 // Global proper noun coreference
232 enable_global_proper_coref: false, // Off by default for compatibility
233 global_proper_threshold: 0.7,
234
235 // Clustering strategy
236 clustering_strategy: ClusteringStrategy::LeftToRight,
237 use_non_coref_constraints: false,
238 non_coref_threshold: 0.2,
239
240 // Feature weights
241 string_match_weight: 1.0,
242 type_compat_weight: 0.5,
243 distance_weight: 0.1,
244
245 // Salience (disabled by default for backward compatibility)
246 salience_weight: 0.0,
247
248 // i2b2-inspired features (off by default for backward compatibility)
249 enable_be_phrase_detection: false,
250 be_phrase_weight: 0.8,
251 enable_acronym_matching: false,
252 acronym_weight: 0.7,
253 enable_context_filtering: false,
254 enable_synonym_matching: false,
255 synonym_weight: 0.5,
256
257 // Nominal adjective detection (J2N: arXiv:2409.14374)
258 enable_nominal_adjective_detection: false,
259
260 // Language (English by default)
261 language: "en".to_string(),
262 }
263 }
264}
265
266impl MentionRankingConfig {
267 /// Create a configuration optimized for book-scale documents.
268 ///
269 /// Based on findings from Bourgois & Poibeau (2025):
270 /// - Type-specific antecedent limits
271 /// - Global proper noun coreference enabled
272 /// - Easy-first clustering
273 #[must_use]
274 pub fn book_scale() -> Self {
275 Self {
276 link_threshold: 0.3,
277
278 // Type-specific limits
279 pronoun_max_antecedents: 30,
280 proper_max_antecedents: 300,
281 nominal_max_antecedents: 300,
282
283 max_distance: 500, // Larger for book-scale
284
285 // Enable book-scale optimizations
286 enable_global_proper_coref: true,
287 global_proper_threshold: 0.7,
288
289 clustering_strategy: ClusteringStrategy::EasyFirst,
290 use_non_coref_constraints: true,
291 non_coref_threshold: 0.2,
292
293 // Feature weights
294 string_match_weight: 1.0,
295 type_compat_weight: 0.5,
296 distance_weight: 0.05, // Lower weight for distance in long docs
297
298 // Salience helps in long documents where context is limited
299 salience_weight: 0.2,
300
301 // i2b2-inspired features (useful for long documents)
302 enable_be_phrase_detection: true,
303 be_phrase_weight: 0.8,
304 enable_acronym_matching: true,
305 acronym_weight: 0.7,
306 enable_context_filtering: true,
307 enable_synonym_matching: false, // Off by default, requires domain synonyms
308 synonym_weight: 0.5,
309 enable_nominal_adjective_detection: false,
310 language: "en".to_string(),
311 }
312 }
313
314 /// Create a configuration optimized for clinical/biomedical text.
315 ///
316 /// Based on Chen et al. (2011) "A Rule Based Solution to Co-reference
317 /// Resolution in Clinical Text" from i2b2 NLP Challenge:
318 /// - "Be phrase" detection for identity linking
319 /// - Acronym matching (e.g., MRSA ↔ Methicillin-resistant...)
320 /// - Context-based link filtering
321 /// - Synonym matching for medical terms
322 ///
323 /// # Example
324 ///
325 /// ```rust
326 /// use anno::backends::mention_ranking::MentionRankingConfig;
327 ///
328 /// let config = MentionRankingConfig::clinical();
329 /// assert!(config.enable_be_phrase_detection);
330 /// assert!(config.enable_acronym_matching);
331 /// ```
332 #[must_use]
333 pub fn clinical() -> Self {
334 Self {
335 link_threshold: 0.3,
336
337 // Clinical documents are typically shorter than books
338 pronoun_max_antecedents: 30,
339 proper_max_antecedents: 100,
340 nominal_max_antecedents: 100,
341
342 max_distance: 200,
343
344 // Global proper coref helps with patient/doctor names
345 enable_global_proper_coref: true,
346 global_proper_threshold: 0.6,
347
348 // Easy-first clustering works well for clinical
349 clustering_strategy: ClusteringStrategy::EasyFirst,
350 use_non_coref_constraints: true,
351 non_coref_threshold: 0.2,
352
353 // Feature weights (slightly higher for string matching in clinical)
354 string_match_weight: 1.2,
355 type_compat_weight: 0.5,
356 distance_weight: 0.08,
357
358 // Salience moderate
359 salience_weight: 0.15,
360
361 // Enable all i2b2-inspired features
362 enable_be_phrase_detection: true,
363 be_phrase_weight: 0.9, // High weight for clinical "X is Y" patterns
364 enable_acronym_matching: true,
365 acronym_weight: 0.8, // Medical acronyms are reliable
366 enable_context_filtering: true,
367 enable_synonym_matching: true, // Enable with medical synonyms
368 synonym_weight: 0.6,
369 enable_nominal_adjective_detection: false,
370 language: "en".to_string(),
371 }
372 }
373
374 /// Create a configuration with salience integration enabled.
375 ///
376 /// Salience-weighted scoring boosts antecedents that are more
377 /// important/central in the document.
378 #[must_use]
379 pub fn with_salience(mut self, weight: f64) -> Self {
380 self.salience_weight = weight.clamp(0.0, 1.0);
381 self
382 }
383
384 /// Get maximum antecedents for a given mention type.
385 #[must_use]
386 pub fn max_antecedents_for_type(&self, mention_type: MentionType) -> usize {
387 match mention_type {
388 MentionType::Pronominal => self.pronoun_max_antecedents,
389 MentionType::Proper => self.proper_max_antecedents,
390 MentionType::Nominal => self.nominal_max_antecedents,
391 // Zero anaphora and unknown types use nominal limits as default
392 MentionType::Zero | MentionType::Unknown => self.nominal_max_antecedents,
393 }
394 }
395}
396
397// MentionType imported from anno_core
398
399/// A detected mention with phi-features for coreference resolution.
400///
401/// This is the core data structure for mention-ranking coreference. Each mention
402/// carries the linguistic features needed to determine coreference compatibility:
403///
404/// - **Span** (`start`, `end`): Character offsets in the source text
405/// - **Type** (`mention_type`): Proper/Nominal/Pronominal/Zero (affects salience)
406/// - **Phi-features** (`gender`, `number`): Agreement constraints
407/// - **Head** (`head`): Syntactic head for matching
408///
409/// # Phi-Features and Agreement
410///
411/// The `gender` and `number` fields encode phi-features (φ-features) from
412/// linguistic theory. These are the grammatical features that govern agreement:
413///
414/// | Feature | Purpose | Example constraint |
415/// |---------|---------|-------------------|
416/// | Gender | Pronoun resolution | "Mary... she" not "he" |
417/// | Number | Singular/plural match | "The dogs... they" not "it" |
418///
419/// `None` values indicate unknown features, which are treated as compatible
420/// with any value (permissive matching).
421///
422/// # Cross-Linguistic Notes
423///
424/// - **Person** is not stored here (would be 3rd for most mentions)
425/// - **Dual number** is supported via `Number::Dual` (Arabic, Sanskrit, Hebrew)
426/// - **Noun class** systems (Bantu, Dyirbal) would need extension beyond `Gender`
427/// - **Zero mentions** (pro-drop) have spans but no surface text
428#[derive(Debug, Clone)]
429pub struct RankedMention {
430 /// Character start offset (0-indexed, inclusive).
431 ///
432 /// Uses character offsets, not byte offsets, for Unicode safety.
433 pub start: usize,
434
435 /// Character end offset (exclusive).
436 ///
437 /// The span `[start, end)` extracts the mention text.
438 pub end: usize,
439
440 /// The mention text as it appears in the source.
441 ///
442 /// For zero pronouns (pro-drop), this may be empty or a placeholder.
443 pub text: String,
444
445 /// Mention type classification.
446 ///
447 /// Affects antecedent search: pronouns look locally, proper nouns globally.
448 /// See [`MentionType`] for the accessibility hierarchy.
449 pub mention_type: MentionType,
450
451 /// Grammatical gender (if determinable).
452 ///
453 /// - `Some(Masculine/Feminine)`: Gendered pronoun or name
454 /// - `Some(Neutral)`: "they"/"it" (compatible with any gender)
455 /// - `Some(Unknown)`: Neopronouns or ungendered names
456 /// - `None`: Feature not applicable or not detected
457 pub gender: Option<Gender>,
458
459 /// Grammatical number (if determinable).
460 ///
461 /// - `Some(Singular)`: "he", "she", "it", "the dog"
462 /// - `Some(Dual)`: Arabic/Sanskrit dual forms
463 /// - `Some(Plural)`: "they", "the dogs"
464 /// - `Some(Unknown)`: "you" (ambiguous), singular "they"
465 /// - `None`: Feature not detected
466 pub number: Option<Number>,
467
468 /// Syntactic head word of the mention.
469 ///
470 /// For "the former president", head = "president".
471 /// Used for head matching in coreference scoring.
472 pub head: String,
473}
474
475impl RankedMention {
476 /// Get the character span as a tuple.
477 #[must_use]
478 pub fn span(&self) -> (usize, usize) {
479 (self.start, self.end)
480 }
481}
482
483/// Convert RankedMention to eval::coref::Mention for evaluation.
484///
485/// This enables using mention-ranking output directly in coreference evaluation.
486impl From<&RankedMention> for anno_core::Mention {
487 fn from(mention: &RankedMention) -> Self {
488 Self {
489 text: mention.text.clone(),
490 start: mention.start,
491 end: mention.end,
492 head_start: None,
493 head_end: None,
494 entity_type: None,
495 mention_type: Some(mention.mention_type),
496 }
497 }
498}
499
500impl From<RankedMention> for anno_core::Mention {
501 fn from(mention: RankedMention) -> Self {
502 Self::from(&mention)
503 }
504}
505
506/// Convert Entity to RankedMention for coreference resolution.
507///
508/// This enables using NER output directly in mention-ranking coreference.
509impl From<&crate::Entity> for RankedMention {
510 fn from(entity: &crate::Entity) -> Self {
511 Self {
512 start: entity.start,
513 end: entity.end,
514 text: entity.text.clone(),
515 mention_type: MentionType::classify(&entity.text),
516 gender: None,
517 number: None,
518 head: extract_head(&entity.text),
519 }
520 }
521}
522
523impl From<crate::Entity> for RankedMention {
524 fn from(entity: crate::Entity) -> Self {
525 Self::from(&entity)
526 }
527}
528
/// Pick the head word of a mention using the last-word heuristic.
///
/// Returns the final whitespace-delimited token of `text`. When `text` has no
/// tokens at all (empty or whitespace-only), `text` itself is returned unchanged.
fn extract_head(text: &str) -> String {
    let head = match text.split_whitespace().next_back() {
        Some(last_word) => last_word,
        None => text,
    };
    head.to_owned()
}
533
534// Gender and Number imported from anno_core
535// Number includes Dual for Arabic, Hebrew, Sanskrit, etc.
536pub use anno_core::Number;
537
538/// Coreference cluster from mention ranking.
539#[derive(Debug, Clone)]
540pub struct MentionCluster {
541 /// Cluster ID.
542 pub id: usize,
543 /// Mentions in this cluster.
544 pub mentions: Vec<RankedMention>,
545}
546
547impl MentionCluster {
548 /// Convert this cluster's mentions to Signals for use with GroundedDocument.
549 ///
550 /// Returns a vector of Signals with Location::Text locations.
551 /// Signal IDs are assigned based on mention order within the cluster.
552 ///
553 /// # Arguments
554 /// * `signal_id_base` - Starting signal ID (to avoid collisions with other clusters)
555 #[must_use]
556 pub fn to_signals(
557 &self,
558 signal_id_base: anno_core::SignalId,
559 ) -> Vec<anno_core::Signal<anno_core::Location>> {
560 self.mentions
561 .iter()
562 .enumerate()
563 .map(|(idx, mention)| anno_core::Signal {
564 id: signal_id_base + idx as u64,
565 location: anno_core::Location::Text {
566 start: mention.start,
567 end: mention.end,
568 },
569 surface: mention.text.clone(),
570 label: anno_core::TypeLabel::from(mention.mention_type.as_label()),
571 confidence: 1.0,
572 hierarchical: None,
573 provenance: None,
574 modality: anno_core::Modality::Symbolic,
575 normalized: None,
576 negated: false,
577 quantifier: None,
578 })
579 .collect()
580 }
581
582 /// Convert this cluster to a Track for use with GroundedDocument.
583 ///
584 /// This bridges mention-ranking output to the canonical Signal→Track→Identity hierarchy.
585 ///
586 /// # Arguments
587 /// * `signal_id_base` - Starting signal ID for the signals in this track
588 ///
589 /// # Returns
590 /// A tuple of `(Track, Vec<Signal>)` containing the track and its signals.
591 /// The signals should be added to the GroundedDocument separately.
592 #[must_use]
593 pub fn to_track(
594 &self,
595 signal_id_base: anno_core::SignalId,
596 ) -> (
597 anno_core::Track,
598 Vec<anno_core::Signal<anno_core::Location>>,
599 ) {
600 let signals = self.to_signals(signal_id_base);
601
602 // Find the canonical surface: prefer proper nouns, else first mention
603 let canonical_surface = self
604 .mentions
605 .iter()
606 .find(|m| m.mention_type == MentionType::Proper)
607 .or_else(|| self.mentions.first())
608 .map(|m| m.text.clone())
609 .unwrap_or_default();
610
611 // Build track with signal references
612 let mut track =
613 anno_core::Track::new(anno_core::TrackId::new(self.id as u64), canonical_surface);
614 // Mention-ranking coref does not infer entity type; leave unset.
615 track.entity_type = None;
616
617 for (idx, _) in signals.iter().enumerate() {
618 track.add_signal(signal_id_base + idx as u64, idx as u32);
619 }
620
621 (track, signals)
622 }
623
624 /// Get the canonical mention (first proper noun, or first mention if none).
625 #[must_use]
626 pub fn canonical_mention(&self) -> Option<&RankedMention> {
627 self.mentions
628 .iter()
629 .find(|m| m.mention_type == MentionType::Proper)
630 .or_else(|| self.mentions.first())
631 }
632}
633
634impl RankedMention {
635 /// Convert to a Signal with Location::Text.
636 #[must_use]
637 pub fn to_signal(
638 &self,
639 signal_id: anno_core::SignalId,
640 ) -> anno_core::Signal<anno_core::Location> {
641 anno_core::Signal {
642 id: signal_id,
643 location: anno_core::Location::Text {
644 start: self.start,
645 end: self.end,
646 },
647 surface: self.text.clone(),
648 label: anno_core::TypeLabel::from(self.mention_type.as_label()),
649 confidence: 1.0,
650 hierarchical: None,
651 provenance: None,
652 modality: anno_core::Modality::Symbolic,
653 normalized: None,
654 negated: false,
655 quantifier: None,
656 }
657 }
658}