scirs2_text/information_extraction/
coreference.rs1use super::entities::{Entity, EntityType};
4use crate::error::Result;
5use regex::Regex;
6
/// Kind of expression a [`CoreferenceMention`] represents.
///
/// The enum is a plain tag with no payload, so it is `Copy` and can be used
/// as a key in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum MentionType {
    /// A mention taken from a recognized entity.
    Entity,
    /// A pronoun mention (e.g. "he", "it", "they").
    Pronoun,
    /// A descriptive noun-phrase mention (e.g. "the company").
    Description,
}
17
18#[derive(Debug, Clone)]
20pub struct CoreferenceMention {
21 pub text: String,
23 pub start: usize,
25 pub end: usize,
27 pub mention_type: MentionType,
29}
30
31#[derive(Debug, Clone)]
33pub struct CoreferenceChain {
34 pub mentions: Vec<CoreferenceMention>,
36 pub confidence: f64,
38}
39
40pub struct CoreferenceResolver {
42 pronoun_patterns: Vec<Regex>,
43}
44
45impl Default for CoreferenceResolver {
46 fn default() -> Self {
47 Self::new()
48 }
49}
50
51impl CoreferenceResolver {
52 pub fn new() -> Self {
54 let pronoun_patterns = vec![
55 Regex::new(r"\b(?i)(?:he|she|it|they|him|her|them|his|hers|its|their)\b")
56 .expect("Operation failed"),
57 Regex::new(r"\b(?i)(?:this|that|these|those)\b").expect("Operation failed"),
58 Regex::new(r"\b(?i)(?:the (?:company|organization|person|individual|entity))\b")
59 .expect("Operation failed"),
60 ];
61
62 Self { pronoun_patterns }
63 }
64
65 pub fn resolve(&self, text: &str, entities: &[Entity]) -> Result<Vec<CoreferenceChain>> {
67 let mut chains = Vec::new();
68 let sentences = self.split_into_sentences(text);
69
70 for (sent_idx, sentence) in sentences.iter().enumerate() {
71 let _sentence_entities: Vec<&Entity> = entities
73 .iter()
74 .filter(|e| {
75 text[e.start..e.end].trim() == sentence.trim() || sentence.contains(&e.text)
76 })
77 .collect();
78
79 for pattern in &self.pronoun_patterns {
81 for mat in pattern.find_iter(sentence) {
82 if let Some(antecedent) = self.find_antecedent(
84 &mat.as_str().to_lowercase(),
85 &sentences[..sent_idx],
86 entities,
87 ) {
88 chains.push(CoreferenceChain {
89 mentions: vec![
90 CoreferenceMention {
91 text: antecedent.text.clone(),
92 start: antecedent.start,
93 end: antecedent.end,
94 mention_type: MentionType::Entity,
95 },
96 CoreferenceMention {
97 text: mat.as_str().to_string(),
98 start: mat.start(),
99 end: mat.end(),
100 mention_type: MentionType::Pronoun,
101 },
102 ],
103 confidence: 0.6,
104 });
105 }
106 }
107 }
108 }
109
110 Ok(chains)
111 }
112
113 pub fn split_into_sentences(&self, text: &str) -> Vec<String> {
115 text.split(['.', '!', '?'])
116 .map(|s| s.trim().to_string())
117 .filter(|s| !s.is_empty())
118 .collect()
119 }
120
121 fn find_antecedent<'a>(
123 &self,
124 pronoun: &str,
125 previous_sentences: &[String],
126 entities: &'a [Entity],
127 ) -> Option<&'a Entity> {
128 let target_type = match pronoun {
130 "he" | "him" | "his" => Some(EntityType::Person),
131 "she" | "her" | "hers" => Some(EntityType::Person),
132 "it" | "its" => Some(EntityType::Organization),
133 "they" | "them" | "their" => None, _ => None,
135 };
136
137 for sentence in previous_sentences.iter().rev() {
139 for entity in entities.iter().rev() {
140 if sentence.contains(&entity.text) {
141 if let Some(expected_type) = &target_type {
142 if entity.entity_type == *expected_type {
143 return Some(entity);
144 }
145 } else {
146 if matches!(
148 entity.entity_type,
149 EntityType::Person | EntityType::Organization
150 ) {
151 return Some(entity);
152 }
153 }
154 }
155 }
156 }
157
158 None
159 }
160}