scirs2_text/information_extraction/
coreference.rs

1//! Coreference resolution for pronoun and entity linking
2
3use super::entities::{Entity, EntityType};
4use crate::error::Result;
5use regex::Regex;
6
/// Kind of textual mention that can take part in a coreference chain.
///
/// The enum is a plain tag without payload, so it derives `Eq` and `Hash`
/// in addition to `PartialEq`; this lets callers use it as a map/set key
/// or group mentions by type.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MentionType {
    /// Named entity mention
    Entity,
    /// Pronoun mention
    Pronoun,
    /// Descriptive mention
    Description,
}
17
18/// Individual mention in a coreference chain
19#[derive(Debug, Clone)]
20pub struct CoreferenceMention {
21    /// Text content of the mention
22    pub text: String,
23    /// Start position in the document
24    pub start: usize,
25    /// End position in the document
26    pub end: usize,
27    /// Type of mention
28    pub mention_type: MentionType,
29}
30
31/// Coreference chain representing linked mentions
32#[derive(Debug, Clone)]
33pub struct CoreferenceChain {
34    /// List of mentions in this chain
35    pub mentions: Vec<CoreferenceMention>,
36    /// Confidence score for the coreference chain
37    pub confidence: f64,
38}
39
40/// Coreference resolver for basic pronoun resolution
41pub struct CoreferenceResolver {
42    pronoun_patterns: Vec<Regex>,
43}
44
45impl Default for CoreferenceResolver {
46    fn default() -> Self {
47        Self::new()
48    }
49}
50
51impl CoreferenceResolver {
52    /// Create new coreference resolver
53    pub fn new() -> Self {
54        let pronoun_patterns = vec![
55            Regex::new(r"\b(?i)(?:he|she|it|they|him|her|them|his|hers|its|their)\b").unwrap(),
56            Regex::new(r"\b(?i)(?:this|that|these|those)\b").unwrap(),
57            Regex::new(r"\b(?i)(?:the (?:company|organization|person|individual|entity))\b")
58                .unwrap(),
59        ];
60
61        Self { pronoun_patterns }
62    }
63
64    /// Resolve coreferences in text with entities
65    pub fn resolve(&self, text: &str, entities: &[Entity]) -> Result<Vec<CoreferenceChain>> {
66        let mut chains = Vec::new();
67        let sentences = self.split_into_sentences(text);
68
69        for (sent_idx, sentence) in sentences.iter().enumerate() {
70            // Find entities in this sentence
71            let _sentence_entities: Vec<&Entity> = entities
72                .iter()
73                .filter(|e| {
74                    text[e.start..e.end].trim() == sentence.trim() || sentence.contains(&e.text)
75                })
76                .collect();
77
78            // Find pronouns in this sentence
79            for pattern in &self.pronoun_patterns {
80                for mat in pattern.find_iter(sentence) {
81                    // Try to resolve to nearest appropriate entity in previous sentences
82                    if let Some(antecedent) = self.find_antecedent(
83                        &mat.as_str().to_lowercase(),
84                        &sentences[..sent_idx],
85                        entities,
86                    ) {
87                        chains.push(CoreferenceChain {
88                            mentions: vec![
89                                CoreferenceMention {
90                                    text: antecedent.text.clone(),
91                                    start: antecedent.start,
92                                    end: antecedent.end,
93                                    mention_type: MentionType::Entity,
94                                },
95                                CoreferenceMention {
96                                    text: mat.as_str().to_string(),
97                                    start: mat.start(),
98                                    end: mat.end(),
99                                    mention_type: MentionType::Pronoun,
100                                },
101                            ],
102                            confidence: 0.6,
103                        });
104                    }
105                }
106            }
107        }
108
109        Ok(chains)
110    }
111
112    /// Split text into sentences (simple implementation)
113    pub fn split_into_sentences(&self, text: &str) -> Vec<String> {
114        text.split(['.', '!', '?'])
115            .map(|s| s.trim().to_string())
116            .filter(|s| !s.is_empty())
117            .collect()
118    }
119
120    /// Find antecedent for a pronoun
121    fn find_antecedent<'a>(
122        &self,
123        pronoun: &str,
124        previous_sentences: &[String],
125        entities: &'a [Entity],
126    ) -> Option<&'a Entity> {
127        // Simple heuristic: find the closest person/organization entity
128        let target_type = match pronoun {
129            "he" | "him" | "his" => Some(EntityType::Person),
130            "she" | "her" | "hers" => Some(EntityType::Person),
131            "it" | "its" => Some(EntityType::Organization),
132            "they" | "them" | "their" => None, // Could be either
133            _ => None,
134        };
135
136        // Look for entities in reverse order (most recent first)
137        for sentence in previous_sentences.iter().rev() {
138            for entity in entities.iter().rev() {
139                if sentence.contains(&entity.text) {
140                    if let Some(expected_type) = &target_type {
141                        if entity.entity_type == *expected_type {
142                            return Some(entity);
143                        }
144                    } else {
145                        // For ambiguous pronouns, return any person or organization
146                        if matches!(
147                            entity.entity_type,
148                            EntityType::Person | EntityType::Organization
149                        ) {
150                            return Some(entity);
151                        }
152                    }
153                }
154            }
155        }
156
157        None
158    }
159}