scirs2_text/information_extraction/
coreference.rs

1//! Coreference resolution for pronoun and entity linking
2
3use super::entities::{Entity, EntityType};
4use crate::error::Result;
5use regex::Regex;
6
/// Kind of expression a coreference mention is realised as.
///
/// The enum is a plain tag without payload, so it derives `Eq` and `Hash` in
/// addition to `PartialEq`; this lets callers use it as a map/set key or group
/// mentions by kind.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MentionType {
    /// Named entity mention
    Entity,
    /// Pronoun mention
    Pronoun,
    /// Descriptive mention
    Description,
}
17
18/// Individual mention in a coreference chain
19#[derive(Debug, Clone)]
20pub struct CoreferenceMention {
21    /// Text content of the mention
22    pub text: String,
23    /// Start position in the document
24    pub start: usize,
25    /// End position in the document
26    pub end: usize,
27    /// Type of mention
28    pub mention_type: MentionType,
29}
30
31/// Coreference chain representing linked mentions
32#[derive(Debug, Clone)]
33pub struct CoreferenceChain {
34    /// List of mentions in this chain
35    pub mentions: Vec<CoreferenceMention>,
36    /// Confidence score for the coreference chain
37    pub confidence: f64,
38}
39
40/// Coreference resolver for basic pronoun resolution
41pub struct CoreferenceResolver {
42    pronoun_patterns: Vec<Regex>,
43}
44
45impl Default for CoreferenceResolver {
46    fn default() -> Self {
47        Self::new()
48    }
49}
50
51impl CoreferenceResolver {
52    /// Create new coreference resolver
53    pub fn new() -> Self {
54        let pronoun_patterns = vec![
55            Regex::new(r"\b(?i)(?:he|she|it|they|him|her|them|his|hers|its|their)\b")
56                .expect("Operation failed"),
57            Regex::new(r"\b(?i)(?:this|that|these|those)\b").expect("Operation failed"),
58            Regex::new(r"\b(?i)(?:the (?:company|organization|person|individual|entity))\b")
59                .expect("Operation failed"),
60        ];
61
62        Self { pronoun_patterns }
63    }
64
65    /// Resolve coreferences in text with entities
66    pub fn resolve(&self, text: &str, entities: &[Entity]) -> Result<Vec<CoreferenceChain>> {
67        let mut chains = Vec::new();
68        let sentences = self.split_into_sentences(text);
69
70        for (sent_idx, sentence) in sentences.iter().enumerate() {
71            // Find entities in this sentence
72            let _sentence_entities: Vec<&Entity> = entities
73                .iter()
74                .filter(|e| {
75                    text[e.start..e.end].trim() == sentence.trim() || sentence.contains(&e.text)
76                })
77                .collect();
78
79            // Find pronouns in this sentence
80            for pattern in &self.pronoun_patterns {
81                for mat in pattern.find_iter(sentence) {
82                    // Try to resolve to nearest appropriate entity in previous sentences
83                    if let Some(antecedent) = self.find_antecedent(
84                        &mat.as_str().to_lowercase(),
85                        &sentences[..sent_idx],
86                        entities,
87                    ) {
88                        chains.push(CoreferenceChain {
89                            mentions: vec![
90                                CoreferenceMention {
91                                    text: antecedent.text.clone(),
92                                    start: antecedent.start,
93                                    end: antecedent.end,
94                                    mention_type: MentionType::Entity,
95                                },
96                                CoreferenceMention {
97                                    text: mat.as_str().to_string(),
98                                    start: mat.start(),
99                                    end: mat.end(),
100                                    mention_type: MentionType::Pronoun,
101                                },
102                            ],
103                            confidence: 0.6,
104                        });
105                    }
106                }
107            }
108        }
109
110        Ok(chains)
111    }
112
113    /// Split text into sentences (simple implementation)
114    pub fn split_into_sentences(&self, text: &str) -> Vec<String> {
115        text.split(['.', '!', '?'])
116            .map(|s| s.trim().to_string())
117            .filter(|s| !s.is_empty())
118            .collect()
119    }
120
121    /// Find antecedent for a pronoun
122    fn find_antecedent<'a>(
123        &self,
124        pronoun: &str,
125        previous_sentences: &[String],
126        entities: &'a [Entity],
127    ) -> Option<&'a Entity> {
128        // Simple heuristic: find the closest person/organization entity
129        let target_type = match pronoun {
130            "he" | "him" | "his" => Some(EntityType::Person),
131            "she" | "her" | "hers" => Some(EntityType::Person),
132            "it" | "its" => Some(EntityType::Organization),
133            "they" | "them" | "their" => None, // Could be either
134            _ => None,
135        };
136
137        // Look for entities in reverse order (most recent first)
138        for sentence in previous_sentences.iter().rev() {
139            for entity in entities.iter().rev() {
140                if sentence.contains(&entity.text) {
141                    if let Some(expected_type) = &target_type {
142                        if entity.entity_type == *expected_type {
143                            return Some(entity);
144                        }
145                    } else {
146                        // For ambiguous pronouns, return any person or organization
147                        if matches!(
148                            entity.entity_type,
149                            EntityType::Person | EntityType::Organization
150                        ) {
151                            return Some(entity);
152                        }
153                    }
154                }
155            }
156        }
157
158        None
159    }
160}