// anno/backends/mention_ranking/types.rs

1//! Configuration and data types for mention-ranking coreference.
2
3#[allow(unused_imports)]
4use super::*;
5
6#[allow(unused_imports)]
7use crate::{Model, Result};
8use anno_core::{Gender, MentionType};
9#[allow(unused_imports)]
10use std::collections::{HashMap, HashSet};
11
12/// A scored mention pair for easy-first clustering.
13#[derive(Debug, Clone)]
14pub(super) struct ScoredPair {
15    pub(super) mention_idx: usize,
16    pub(super) antecedent_idx: usize,
17    pub(super) score: f64,
18}
19
/// Order in which mentions are processed when they are linked into clusters.
///
/// # Research Context (Bourgois & Poibeau 2025)
///
/// Two clustering strategies are compared in the paper:
/// - **Left-to-right**: the traditional approach; mentions are handled in document order
/// - **Easy-first**: high-confidence decisions are taken first and constrain later ones
///
/// Pairing easy-first with global proper noun coreference can improve outcomes on long documents.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum ClusteringStrategy {
    /// Traditional ordering: mentions are processed left-to-right, in document order.
    #[default]
    LeftToRight,
    /// Confidence ordering: the highest-confidence decisions are processed first.
    /// Decisions taken early constrain the ones taken later, and
    /// non-coreference predictions can prevent incorrect merges.
    EasyFirst,
}
39
40/// Configuration for mention-ranking coref.
41///
42/// # Research-Informed Defaults
43///
44/// The defaults are informed by findings from Bourgois & Poibeau (2025):
45/// - Pronouns tend to have shorter antecedent distances than proper nouns
46/// - Proper/common nouns can span thousands of mentions
47/// - Type-specific limits outperform uniform limits
48///
49/// # Example
50///
51/// ```rust
52/// use anno::backends::mention_ranking::{MentionRankingConfig, ClusteringStrategy};
53///
54/// // Book-scale configuration
55/// let config = MentionRankingConfig {
56///     pronoun_max_antecedents: 30,     // 95% of pronouns within 7 mentions
57///     proper_max_antecedents: 300,     // Proper nouns span further
58///     nominal_max_antecedents: 300,    // Common nouns similar to proper
59///     enable_global_proper_coref: true, // Bridge long-distance proper nouns
60///     clustering_strategy: ClusteringStrategy::EasyFirst,
61///     ..Default::default()
62/// };
63/// ```
64#[derive(Debug, Clone)]
65pub struct MentionRankingConfig {
66    /// Minimum score to link mentions.
67    pub link_threshold: f64,
68
69    // =========================================================================
70    // Type-specific antecedent limits (Bourgois & Poibeau 2025)
71    // =========================================================================
72    /// Maximum number of antecedent candidates for pronouns.
73    /// Research shows 95% of pronouns are within 7 mentions of antecedent.
74    /// Default: 30 (conservative buffer above 95th percentile).
75    pub pronoun_max_antecedents: usize,
76
77    /// Maximum number of antecedent candidates for proper nouns.
78    /// Proper nouns can span 1700+ mentions in long documents.
79    /// Default: 300 (covers 99th percentile while remaining tractable).
80    pub proper_max_antecedents: usize,
81
82    /// Maximum number of antecedent candidates for nominal mentions.
83    /// Similar distribution to proper nouns.
84    /// Default: 300.
85    pub nominal_max_antecedents: usize,
86
87    /// Legacy uniform max distance (in characters). Used as fallback.
88    /// Prefer type-specific limits for better accuracy.
89    pub max_distance: usize,
90
91    // =========================================================================
92    // Global proper noun coreference (Bourgois & Poibeau 2025)
93    // =========================================================================
94    /// Enable global proper noun coreference propagation.
95    /// When enabled, high-confidence proper noun coreference decisions are
96    /// propagated document-wide, bridging mentions that exceed local windows.
97    /// Gains 5-10 B³ points on documents >20k tokens.
98    pub enable_global_proper_coref: bool,
99
100    /// Minimum confidence to propagate proper noun coreference globally.
101    /// Only pairs with scores above this threshold are propagated.
102    pub global_proper_threshold: f64,
103
104    // =========================================================================
105    // Easy-first clustering (Clark & Manning 2016, Bourgois & Poibeau 2025)
106    // =========================================================================
107    /// Clustering strategy to use.
108    pub clustering_strategy: ClusteringStrategy,
109
110    /// Enable non-coreference constraints in easy-first clustering.
111    /// High-confidence non-coreference predictions prevent incorrect merges.
112    pub use_non_coref_constraints: bool,
113
114    /// Threshold for non-coreference constraints.
115    /// Pairs with scores below this are treated as definitely non-coreferent.
116    pub non_coref_threshold: f64,
117
118    // =========================================================================
119    // Feature weights
120    // =========================================================================
121    /// Weight for string match features.
122    pub string_match_weight: f64,
123    /// Weight for type compatibility features.
124    pub type_compat_weight: f64,
125    /// Weight for distance feature.
126    pub distance_weight: f64,
127
128    // =========================================================================
129    // Salience integration
130    // =========================================================================
131    /// Weight for salience boost when scoring antecedent candidates.
132    ///
133    /// When > 0, antecedents with higher salience scores receive a boost.
134    /// This helps prefer linking to important/central entities in the document.
135    ///
136    /// Typical values: 0.0 (disabled) to 0.3 (moderate boost).
137    pub salience_weight: f64,
138
139    // =========================================================================
140    // i2b2-inspired rule-based features (Chen et al. 2011)
141    // =========================================================================
142    /// Enable "be phrase" detection for identity linking.
143    /// Patterns like "X is Y" or "resolution of X is Y" strongly indicate coreference.
144    /// From i2b2 clinical coref: achieved high precision on medical texts.
145    pub enable_be_phrase_detection: bool,
146
147    /// Weight for be-phrase identity signal.
148    pub be_phrase_weight: f64,
149
150    /// Enable acronym matching (e.g., "MRSA" ↔ "Methicillin-resistant Staphylococcus aureus").
151    pub enable_acronym_matching: bool,
152
153    /// Weight for acronym match signal.
154    pub acronym_weight: f64,
155
156    /// Enable context-based link filtering.
157    /// Uses surrounding context (dates, locations, modifiers) to filter false links.
158    pub enable_context_filtering: bool,
159
160    /// Enable synonym matching for related terms.
161    ///
162    /// When enabled, uses string similarity (from `anno::coalesce`) as a proxy
163    /// for synonym relationships. High similarity (>0.8) indicates likely synonyms.
164    ///
165    /// For domain-specific synonyms (medical, legal, etc.), implement a custom
166    /// `anno::coalesce::SynonymSource` and integrate it with the resolver.
167    pub enable_synonym_matching: bool,
168
169    /// Weight for synonym match signal.
170    pub synonym_weight: f64,
171
172    // =========================================================================
173    // Nominal adjective detection (J2N: arXiv:2409.14374)
174    // =========================================================================
175    /// Enable detection of nominal adjectives as mentions.
176    ///
177    /// Nominal adjectives are phrases like "the poor", "the elderly", "the accused"
178    /// where an adjective functions as a noun phrase referring to a group of people.
179    ///
180    /// # Linguistic Background
181    ///
182    /// In English, certain adjectives can be "nominalized" when preceded by a
183    /// definite article: "The rich get richer while the poor get poorer."
184    /// Here, "the poor" refers to poor people as a collective group.
185    ///
186    /// # Coreference Impact (J2N Paper)
187    ///
188    /// Qi, Han & Xie (arXiv:2409.14374) showed that correctly detecting these
189    /// as mentions can improve coreference metrics slightly. Without detection, pronouns
190    /// like "they" that refer back to "the poor" become orphaned.
191    ///
192    /// # Grammatical Number
193    ///
194    /// Nominal adjectives are grammatically plural in English:
195    /// - "The poor ARE struggling" (not "is")
196    /// - "The elderly NEED support" (not "needs")
197    ///
198    /// Default: false (for backward compatibility)
199    pub enable_nominal_adjective_detection: bool,
200
201    /// Language for language-specific features (ISO 639-1 code).
202    ///
203    /// When set, enables language-specific patterns for:
204    /// - Nominal adjective detection (German "die Armen", French "les pauvres", etc.)
205    /// - Pronoun resolution rules
206    /// - Gender/number agreement
207    ///
208    /// Supported languages:
209    /// - "en" (default): English
210    /// - "de": German
211    /// - "fr": French
212    /// - "es": Spanish
213    ///
214    /// Default: "en"
215    pub language: String,
216}
217
218impl Default for MentionRankingConfig {
219    fn default() -> Self {
220        Self {
221            link_threshold: 0.3,
222
223            // Type-specific limits (Bourgois & Poibeau 2025)
224            pronoun_max_antecedents: 30,  // 95% within 7 mentions
225            proper_max_antecedents: 300,  // Can span 1700+ mentions
226            nominal_max_antecedents: 300, // Similar to proper nouns
227
228            // Legacy uniform limit (fallback)
229            max_distance: 100,
230
231            // Global proper noun coreference
232            enable_global_proper_coref: false, // Off by default for compatibility
233            global_proper_threshold: 0.7,
234
235            // Clustering strategy
236            clustering_strategy: ClusteringStrategy::LeftToRight,
237            use_non_coref_constraints: false,
238            non_coref_threshold: 0.2,
239
240            // Feature weights
241            string_match_weight: 1.0,
242            type_compat_weight: 0.5,
243            distance_weight: 0.1,
244
245            // Salience (disabled by default for backward compatibility)
246            salience_weight: 0.0,
247
248            // i2b2-inspired features (off by default for backward compatibility)
249            enable_be_phrase_detection: false,
250            be_phrase_weight: 0.8,
251            enable_acronym_matching: false,
252            acronym_weight: 0.7,
253            enable_context_filtering: false,
254            enable_synonym_matching: false,
255            synonym_weight: 0.5,
256
257            // Nominal adjective detection (J2N: arXiv:2409.14374)
258            enable_nominal_adjective_detection: false,
259
260            // Language (English by default)
261            language: "en".to_string(),
262        }
263    }
264}
265
266impl MentionRankingConfig {
267    /// Create a configuration optimized for book-scale documents.
268    ///
269    /// Based on findings from Bourgois & Poibeau (2025):
270    /// - Type-specific antecedent limits
271    /// - Global proper noun coreference enabled
272    /// - Easy-first clustering
273    #[must_use]
274    pub fn book_scale() -> Self {
275        Self {
276            link_threshold: 0.3,
277
278            // Type-specific limits
279            pronoun_max_antecedents: 30,
280            proper_max_antecedents: 300,
281            nominal_max_antecedents: 300,
282
283            max_distance: 500, // Larger for book-scale
284
285            // Enable book-scale optimizations
286            enable_global_proper_coref: true,
287            global_proper_threshold: 0.7,
288
289            clustering_strategy: ClusteringStrategy::EasyFirst,
290            use_non_coref_constraints: true,
291            non_coref_threshold: 0.2,
292
293            // Feature weights
294            string_match_weight: 1.0,
295            type_compat_weight: 0.5,
296            distance_weight: 0.05, // Lower weight for distance in long docs
297
298            // Salience helps in long documents where context is limited
299            salience_weight: 0.2,
300
301            // i2b2-inspired features (useful for long documents)
302            enable_be_phrase_detection: true,
303            be_phrase_weight: 0.8,
304            enable_acronym_matching: true,
305            acronym_weight: 0.7,
306            enable_context_filtering: true,
307            enable_synonym_matching: false, // Off by default, requires domain synonyms
308            synonym_weight: 0.5,
309            enable_nominal_adjective_detection: false,
310            language: "en".to_string(),
311        }
312    }
313
314    /// Create a configuration optimized for clinical/biomedical text.
315    ///
316    /// Based on Chen et al. (2011) "A Rule Based Solution to Co-reference
317    /// Resolution in Clinical Text" from i2b2 NLP Challenge:
318    /// - "Be phrase" detection for identity linking
319    /// - Acronym matching (e.g., MRSA ↔ Methicillin-resistant...)
320    /// - Context-based link filtering
321    /// - Synonym matching for medical terms
322    ///
323    /// # Example
324    ///
325    /// ```rust
326    /// use anno::backends::mention_ranking::MentionRankingConfig;
327    ///
328    /// let config = MentionRankingConfig::clinical();
329    /// assert!(config.enable_be_phrase_detection);
330    /// assert!(config.enable_acronym_matching);
331    /// ```
332    #[must_use]
333    pub fn clinical() -> Self {
334        Self {
335            link_threshold: 0.3,
336
337            // Clinical documents are typically shorter than books
338            pronoun_max_antecedents: 30,
339            proper_max_antecedents: 100,
340            nominal_max_antecedents: 100,
341
342            max_distance: 200,
343
344            // Global proper coref helps with patient/doctor names
345            enable_global_proper_coref: true,
346            global_proper_threshold: 0.6,
347
348            // Easy-first clustering works well for clinical
349            clustering_strategy: ClusteringStrategy::EasyFirst,
350            use_non_coref_constraints: true,
351            non_coref_threshold: 0.2,
352
353            // Feature weights (slightly higher for string matching in clinical)
354            string_match_weight: 1.2,
355            type_compat_weight: 0.5,
356            distance_weight: 0.08,
357
358            // Salience moderate
359            salience_weight: 0.15,
360
361            // Enable all i2b2-inspired features
362            enable_be_phrase_detection: true,
363            be_phrase_weight: 0.9, // High weight for clinical "X is Y" patterns
364            enable_acronym_matching: true,
365            acronym_weight: 0.8, // Medical acronyms are reliable
366            enable_context_filtering: true,
367            enable_synonym_matching: true, // Enable with medical synonyms
368            synonym_weight: 0.6,
369            enable_nominal_adjective_detection: false,
370            language: "en".to_string(),
371        }
372    }
373
374    /// Create a configuration with salience integration enabled.
375    ///
376    /// Salience-weighted scoring boosts antecedents that are more
377    /// important/central in the document.
378    #[must_use]
379    pub fn with_salience(mut self, weight: f64) -> Self {
380        self.salience_weight = weight.clamp(0.0, 1.0);
381        self
382    }
383
384    /// Get maximum antecedents for a given mention type.
385    #[must_use]
386    pub fn max_antecedents_for_type(&self, mention_type: MentionType) -> usize {
387        match mention_type {
388            MentionType::Pronominal => self.pronoun_max_antecedents,
389            MentionType::Proper => self.proper_max_antecedents,
390            MentionType::Nominal => self.nominal_max_antecedents,
391            // Zero anaphora and unknown types use nominal limits as default
392            MentionType::Zero | MentionType::Unknown => self.nominal_max_antecedents,
393        }
394    }
395}
396
397// MentionType imported from anno_core
398
399/// A detected mention with phi-features for coreference resolution.
400///
401/// This is the core data structure for mention-ranking coreference. Each mention
402/// carries the linguistic features needed to determine coreference compatibility:
403///
404/// - **Span** (`start`, `end`): Character offsets in the source text
405/// - **Type** (`mention_type`): Proper/Nominal/Pronominal/Zero (affects salience)
406/// - **Phi-features** (`gender`, `number`): Agreement constraints
407/// - **Head** (`head`): Syntactic head for matching
408///
409/// # Phi-Features and Agreement
410///
411/// The `gender` and `number` fields encode phi-features (φ-features) from
412/// linguistic theory. These are the grammatical features that govern agreement:
413///
414/// | Feature | Purpose | Example constraint |
415/// |---------|---------|-------------------|
416/// | Gender | Pronoun resolution | "Mary... she" not "he" |
417/// | Number | Singular/plural match | "The dogs... they" not "it" |
418///
419/// `None` values indicate unknown features, which are treated as compatible
420/// with any value (permissive matching).
421///
422/// # Cross-Linguistic Notes
423///
424/// - **Person** is not stored here (would be 3rd for most mentions)
425/// - **Dual number** is supported via `Number::Dual` (Arabic, Sanskrit, Hebrew)
426/// - **Noun class** systems (Bantu, Dyirbal) would need extension beyond `Gender`
427/// - **Zero mentions** (pro-drop) have spans but no surface text
428#[derive(Debug, Clone)]
429pub struct RankedMention {
430    /// Character start offset (0-indexed, inclusive).
431    ///
432    /// Uses character offsets, not byte offsets, for Unicode safety.
433    pub start: usize,
434
435    /// Character end offset (exclusive).
436    ///
437    /// The span `[start, end)` extracts the mention text.
438    pub end: usize,
439
440    /// The mention text as it appears in the source.
441    ///
442    /// For zero pronouns (pro-drop), this may be empty or a placeholder.
443    pub text: String,
444
445    /// Mention type classification.
446    ///
447    /// Affects antecedent search: pronouns look locally, proper nouns globally.
448    /// See [`MentionType`] for the accessibility hierarchy.
449    pub mention_type: MentionType,
450
451    /// Grammatical gender (if determinable).
452    ///
453    /// - `Some(Masculine/Feminine)`: Gendered pronoun or name
454    /// - `Some(Neutral)`: "they"/"it" (compatible with any gender)
455    /// - `Some(Unknown)`: Neopronouns or ungendered names
456    /// - `None`: Feature not applicable or not detected
457    pub gender: Option<Gender>,
458
459    /// Grammatical number (if determinable).
460    ///
461    /// - `Some(Singular)`: "he", "she", "it", "the dog"
462    /// - `Some(Dual)`: Arabic/Sanskrit dual forms
463    /// - `Some(Plural)`: "they", "the dogs"
464    /// - `Some(Unknown)`: "you" (ambiguous), singular "they"
465    /// - `None`: Feature not detected
466    pub number: Option<Number>,
467
468    /// Syntactic head word of the mention.
469    ///
470    /// For "the former president", head = "president".
471    /// Used for head matching in coreference scoring.
472    pub head: String,
473}
474
475impl RankedMention {
476    /// Get the character span as a tuple.
477    #[must_use]
478    pub fn span(&self) -> (usize, usize) {
479        (self.start, self.end)
480    }
481}
482
483/// Convert RankedMention to eval::coref::Mention for evaluation.
484///
485/// This enables using mention-ranking output directly in coreference evaluation.
486impl From<&RankedMention> for anno_core::Mention {
487    fn from(mention: &RankedMention) -> Self {
488        Self {
489            text: mention.text.clone(),
490            start: mention.start,
491            end: mention.end,
492            head_start: None,
493            head_end: None,
494            entity_type: None,
495            mention_type: Some(mention.mention_type),
496        }
497    }
498}
499
500impl From<RankedMention> for anno_core::Mention {
501    fn from(mention: RankedMention) -> Self {
502        Self::from(&mention)
503    }
504}
505
506/// Convert Entity to RankedMention for coreference resolution.
507///
508/// This enables using NER output directly in mention-ranking coreference.
509impl From<&crate::Entity> for RankedMention {
510    fn from(entity: &crate::Entity) -> Self {
511        Self {
512            start: entity.start,
513            end: entity.end,
514            text: entity.text.clone(),
515            mention_type: MentionType::classify(&entity.text),
516            gender: None,
517            number: None,
518            head: extract_head(&entity.text),
519        }
520    }
521}
522
523impl From<crate::Entity> for RankedMention {
524    fn from(entity: crate::Entity) -> Self {
525        Self::from(&entity)
526    }
527}
528
/// Extract the head word from a mention (last word heuristic).
///
/// Returns the final whitespace-delimited token of `text`. When `text` holds
/// no token at all (it is empty or consists only of whitespace), `text` itself
/// is returned unchanged.
fn extract_head(text: &str) -> String {
    // `SplitWhitespace` is double-ended, so `next_back` reaches the final token
    // directly instead of walking every token the way `last` does.
    text.split_whitespace()
        .next_back()
        .unwrap_or(text)
        .to_string()
}
533
534// Gender and Number imported from anno_core
535// Number includes Dual for Arabic, Hebrew, Sanskrit, etc.
536pub use anno_core::Number;
537
538/// Coreference cluster from mention ranking.
539#[derive(Debug, Clone)]
540pub struct MentionCluster {
541    /// Cluster ID.
542    pub id: usize,
543    /// Mentions in this cluster.
544    pub mentions: Vec<RankedMention>,
545}
546
547impl MentionCluster {
548    /// Convert this cluster's mentions to Signals for use with GroundedDocument.
549    ///
550    /// Returns a vector of Signals with Location::Text locations.
551    /// Signal IDs are assigned based on mention order within the cluster.
552    ///
553    /// # Arguments
554    /// * `signal_id_base` - Starting signal ID (to avoid collisions with other clusters)
555    #[must_use]
556    pub fn to_signals(
557        &self,
558        signal_id_base: anno_core::SignalId,
559    ) -> Vec<anno_core::Signal<anno_core::Location>> {
560        self.mentions
561            .iter()
562            .enumerate()
563            .map(|(idx, mention)| anno_core::Signal {
564                id: signal_id_base + idx as u64,
565                location: anno_core::Location::Text {
566                    start: mention.start,
567                    end: mention.end,
568                },
569                surface: mention.text.clone(),
570                label: anno_core::TypeLabel::from(mention.mention_type.as_label()),
571                confidence: 1.0,
572                hierarchical: None,
573                provenance: None,
574                modality: anno_core::Modality::Symbolic,
575                normalized: None,
576                negated: false,
577                quantifier: None,
578            })
579            .collect()
580    }
581
582    /// Convert this cluster to a Track for use with GroundedDocument.
583    ///
584    /// This bridges mention-ranking output to the canonical Signal→Track→Identity hierarchy.
585    ///
586    /// # Arguments
587    /// * `signal_id_base` - Starting signal ID for the signals in this track
588    ///
589    /// # Returns
590    /// A tuple of `(Track, Vec<Signal>)` containing the track and its signals.
591    /// The signals should be added to the GroundedDocument separately.
592    #[must_use]
593    pub fn to_track(
594        &self,
595        signal_id_base: anno_core::SignalId,
596    ) -> (
597        anno_core::Track,
598        Vec<anno_core::Signal<anno_core::Location>>,
599    ) {
600        let signals = self.to_signals(signal_id_base);
601
602        // Find the canonical surface: prefer proper nouns, else first mention
603        let canonical_surface = self
604            .mentions
605            .iter()
606            .find(|m| m.mention_type == MentionType::Proper)
607            .or_else(|| self.mentions.first())
608            .map(|m| m.text.clone())
609            .unwrap_or_default();
610
611        // Build track with signal references
612        let mut track =
613            anno_core::Track::new(anno_core::TrackId::new(self.id as u64), canonical_surface);
614        // Mention-ranking coref does not infer entity type; leave unset.
615        track.entity_type = None;
616
617        for (idx, _) in signals.iter().enumerate() {
618            track.add_signal(signal_id_base + idx as u64, idx as u32);
619        }
620
621        (track, signals)
622    }
623
624    /// Get the canonical mention (first proper noun, or first mention if none).
625    #[must_use]
626    pub fn canonical_mention(&self) -> Option<&RankedMention> {
627        self.mentions
628            .iter()
629            .find(|m| m.mention_type == MentionType::Proper)
630            .or_else(|| self.mentions.first())
631    }
632}
633
634impl RankedMention {
635    /// Convert to a Signal with Location::Text.
636    #[must_use]
637    pub fn to_signal(
638        &self,
639        signal_id: anno_core::SignalId,
640    ) -> anno_core::Signal<anno_core::Location> {
641        anno_core::Signal {
642            id: signal_id,
643            location: anno_core::Location::Text {
644                start: self.start,
645                end: self.end,
646            },
647            surface: self.text.clone(),
648            label: anno_core::TypeLabel::from(self.mention_type.as_label()),
649            confidence: 1.0,
650            hierarchical: None,
651            provenance: None,
652            modality: anno_core::Modality::Symbolic,
653            normalized: None,
654            negated: false,
655            quantifier: None,
656        }
657    }
658}