scirs2_text/information_extraction/
coreference.rs1use super::entities::{Entity, EntityType};
4use crate::error::Result;
5use regex::Regex;
6
/// Category of a mention that participates in a coreference chain.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so the type can be
/// used as a key in hash-based collections (e.g. to count mentions by kind).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MentionType {
    /// The mention is a named entity (used for antecedents).
    Entity,
    /// The mention is a pronoun or other anaphoric expression.
    Pronoun,
    /// The mention is a descriptive phrase.
    // NOTE(review): not constructed anywhere in this file — presumably meant
    // for phrases such as "the company"; confirm against callers.
    Description,
}
17
18#[derive(Debug, Clone)]
20pub struct CoreferenceMention {
21 pub text: String,
23 pub start: usize,
25 pub end: usize,
27 pub mention_type: MentionType,
29}
30
31#[derive(Debug, Clone)]
33pub struct CoreferenceChain {
34 pub mentions: Vec<CoreferenceMention>,
36 pub confidence: f64,
38}
39
40pub struct CoreferenceResolver {
42 pronoun_patterns: Vec<Regex>,
43}
44
45impl Default for CoreferenceResolver {
46 fn default() -> Self {
47 Self::new()
48 }
49}
50
51impl CoreferenceResolver {
52 pub fn new() -> Self {
54 let pronoun_patterns = vec![
55 Regex::new(r"\b(?i)(?:he|she|it|they|him|her|them|his|hers|its|their)\b").unwrap(),
56 Regex::new(r"\b(?i)(?:this|that|these|those)\b").unwrap(),
57 Regex::new(r"\b(?i)(?:the (?:company|organization|person|individual|entity))\b")
58 .unwrap(),
59 ];
60
61 Self { pronoun_patterns }
62 }
63
64 pub fn resolve(&self, text: &str, entities: &[Entity]) -> Result<Vec<CoreferenceChain>> {
66 let mut chains = Vec::new();
67 let sentences = self.split_into_sentences(text);
68
69 for (sent_idx, sentence) in sentences.iter().enumerate() {
70 let _sentence_entities: Vec<&Entity> = entities
72 .iter()
73 .filter(|e| {
74 text[e.start..e.end].trim() == sentence.trim() || sentence.contains(&e.text)
75 })
76 .collect();
77
78 for pattern in &self.pronoun_patterns {
80 for mat in pattern.find_iter(sentence) {
81 if let Some(antecedent) = self.find_antecedent(
83 &mat.as_str().to_lowercase(),
84 &sentences[..sent_idx],
85 entities,
86 ) {
87 chains.push(CoreferenceChain {
88 mentions: vec![
89 CoreferenceMention {
90 text: antecedent.text.clone(),
91 start: antecedent.start,
92 end: antecedent.end,
93 mention_type: MentionType::Entity,
94 },
95 CoreferenceMention {
96 text: mat.as_str().to_string(),
97 start: mat.start(),
98 end: mat.end(),
99 mention_type: MentionType::Pronoun,
100 },
101 ],
102 confidence: 0.6,
103 });
104 }
105 }
106 }
107 }
108
109 Ok(chains)
110 }
111
112 pub fn split_into_sentences(&self, text: &str) -> Vec<String> {
114 text.split(['.', '!', '?'])
115 .map(|s| s.trim().to_string())
116 .filter(|s| !s.is_empty())
117 .collect()
118 }
119
120 fn find_antecedent<'a>(
122 &self,
123 pronoun: &str,
124 previous_sentences: &[String],
125 entities: &'a [Entity],
126 ) -> Option<&'a Entity> {
127 let target_type = match pronoun {
129 "he" | "him" | "his" => Some(EntityType::Person),
130 "she" | "her" | "hers" => Some(EntityType::Person),
131 "it" | "its" => Some(EntityType::Organization),
132 "they" | "them" | "their" => None, _ => None,
134 };
135
136 for sentence in previous_sentences.iter().rev() {
138 for entity in entities.iter().rev() {
139 if sentence.contains(&entity.text) {
140 if let Some(expected_type) = &target_type {
141 if entity.entity_type == *expected_type {
142 return Some(entity);
143 }
144 } else {
145 if matches!(
147 entity.entity_type,
148 EntityType::Person | EntityType::Organization
149 ) {
150 return Some(entity);
151 }
152 }
153 }
154 }
155 }
156
157 None
158 }
159}