1use crate::{
7 data::{AlignmentStatus, CharInterval, Extraction},
8 exceptions::LangExtractResult,
9};
10use std::cmp::min;
11
12#[derive(Debug, Clone)]
14pub struct AlignmentConfig {
15 pub enable_fuzzy_alignment: bool,
17 pub fuzzy_alignment_threshold: f32,
19 pub accept_match_lesser: bool,
21 pub case_sensitive: bool,
23 pub max_search_window: usize,
25}
26
27impl Default for AlignmentConfig {
28 fn default() -> Self {
29 Self {
30 enable_fuzzy_alignment: true,
31 fuzzy_alignment_threshold: 0.4, accept_match_lesser: true,
33 case_sensitive: false,
34 max_search_window: 100,
35 }
36 }
37}
38
39pub struct TextAligner {
41 config: AlignmentConfig,
42}
43
44impl TextAligner {
45 pub fn new() -> Self {
47 Self {
48 config: AlignmentConfig::default(),
49 }
50 }
51
52 pub fn with_config(config: AlignmentConfig) -> Self {
54 Self { config }
55 }
56
57 pub fn align_extractions(
59 &self,
60 extractions: &mut [Extraction],
61 source_text: &str,
62 char_offset: usize,
63 ) -> LangExtractResult<usize> {
64 let mut aligned_count = 0;
65
66 for extraction in extractions.iter_mut() {
67 if let Some(interval) = self.align_single_extraction(extraction, source_text, char_offset)? {
68 extraction.char_interval = Some(interval);
69 aligned_count += 1;
70 }
71 }
72
73 Ok(aligned_count)
74 }
75
76 pub fn align_single_extraction(
78 &self,
79 extraction: &mut Extraction,
80 source_text: &str,
81 char_offset: usize,
82 ) -> LangExtractResult<Option<CharInterval>> {
83 let extraction_text = if self.config.case_sensitive {
84 extraction.extraction_text.clone()
85 } else {
86 extraction.extraction_text.to_lowercase()
87 };
88
89 let search_text = if self.config.case_sensitive {
90 source_text.to_string()
91 } else {
92 source_text.to_lowercase()
93 };
94
95 if let Some((start, end, status)) = self.find_exact_match(&extraction_text, &search_text) {
97 extraction.alignment_status = Some(status);
98 return Ok(Some(CharInterval::new(
99 Some(start + char_offset),
100 Some(end + char_offset),
101 )));
102 }
103
104 if self.config.enable_fuzzy_alignment {
106 if let Some((start, end, status)) = self.find_fuzzy_match(&extraction_text, &search_text) {
107 extraction.alignment_status = Some(status);
108 return Ok(Some(CharInterval::new(
109 Some(start + char_offset),
110 Some(end + char_offset),
111 )));
112 }
113 }
114
115 extraction.alignment_status = None;
117 Ok(None)
118 }
119
120 fn find_exact_match(&self, extraction_text: &str, source_text: &str) -> Option<(usize, usize, AlignmentStatus)> {
122 if let Some(start) = source_text.find(extraction_text) {
124 let end = start + extraction_text.len();
125 return Some((start, end, AlignmentStatus::MatchExact));
126 }
127
128 if self.config.accept_match_lesser {
130 let extraction_words: Vec<&str> = extraction_text.split_whitespace().collect();
132 if extraction_words.len() > 1 {
133 if let (Some(first_word), Some(last_word)) = (extraction_words.first(), extraction_words.last()) {
135 if let Some(first_start) = source_text.find(first_word) {
136 if let Some(last_start) = source_text[first_start..].find(last_word) {
137 let last_absolute_start = first_start + last_start;
138 let last_end = last_absolute_start + last_word.len();
139
140 if last_end - first_start < extraction_text.len() * 2 {
142 return Some((first_start, last_end, AlignmentStatus::MatchLesser));
143 }
144 }
145 }
146 }
147 }
148 }
149
150 None
151 }
152
153 fn find_fuzzy_match(&self, extraction_text: &str, source_text: &str) -> Option<(usize, usize, AlignmentStatus)> {
155 let extraction_words: Vec<&str> = extraction_text.split_whitespace().collect();
156 let source_words: Vec<&str> = source_text.split_whitespace().collect();
157
158 if extraction_words.is_empty() || source_words.is_empty() {
159 return None;
160 }
161
162 let mut best_match: Option<(usize, usize, f32)> = None;
163
164 let max_window = min(source_words.len(), self.config.max_search_window);
166 let min_window = extraction_words.len();
167
168 for window_size in min_window..=max_window {
169 for start_idx in 0..=source_words.len().saturating_sub(window_size) {
170 let end_idx = start_idx + window_size;
171 let window = &source_words[start_idx..end_idx];
172
173 let similarity = self.calculate_word_similarity(&extraction_words, window);
174
175 if similarity >= self.config.fuzzy_alignment_threshold {
176 if let Some((_, _, current_best)) = best_match {
177 if similarity > current_best {
178 best_match = Some((start_idx, end_idx, similarity));
179 }
180 } else {
181 best_match = Some((start_idx, end_idx, similarity));
182 }
183 }
184 }
185
186 if best_match.is_some() {
188 break;
189 }
190 }
191
192 if let Some((start_word_idx, end_word_idx, _)) = best_match {
194 let char_start = if start_word_idx == 0 {
195 0
196 } else {
197 source_words[..start_word_idx].join(" ").len() + 1
198 };
199
200 let char_end = if end_word_idx >= source_words.len() {
201 source_text.len()
202 } else {
203 source_words[..end_word_idx].join(" ").len()
204 };
205
206 return Some((char_start, char_end, AlignmentStatus::MatchFuzzy));
207 }
208
209 None
210 }
211
212 fn calculate_word_similarity(&self, words1: &[&str], words2: &[&str]) -> f32 {
214 if words1.is_empty() && words2.is_empty() {
215 return 1.0;
216 }
217 if words1.is_empty() || words2.is_empty() {
218 return 0.0;
219 }
220
221 let normalized_words1: Vec<String> = words1.iter().map(|w| w.to_lowercase()).collect();
223 let normalized_words2: Vec<String> = words2.iter().map(|w| w.to_lowercase()).collect();
224
225 let mut found_count = 0;
227 for word1 in &normalized_words1 {
228 if normalized_words2.contains(word1) {
229 found_count += 1;
230 }
231 }
232
233 found_count as f32 / normalized_words1.len() as f32
235 }
236
237 pub fn align_chunk_extractions(
239 &self,
240 extractions: &mut [Extraction],
241 chunk_text: &str,
242 chunk_char_offset: usize,
243 ) -> LangExtractResult<usize> {
244 self.align_extractions(extractions, chunk_text, chunk_char_offset)
245 }
246
247 pub fn get_alignment_stats(&self, extractions: &[Extraction]) -> AlignmentStats {
249 let total = extractions.len();
250 let mut exact = 0;
251 let mut fuzzy = 0;
252 let mut lesser = 0;
253 let mut greater = 0;
254 let mut unaligned = 0;
255
256 for extraction in extractions {
257 match extraction.alignment_status {
258 Some(AlignmentStatus::MatchExact) => exact += 1,
259 Some(AlignmentStatus::MatchFuzzy) => fuzzy += 1,
260 Some(AlignmentStatus::MatchLesser) => lesser += 1,
261 Some(AlignmentStatus::MatchGreater) => greater += 1,
262 None => unaligned += 1,
263 }
264 }
265
266 AlignmentStats {
267 total,
268 exact,
269 fuzzy,
270 lesser,
271 greater,
272 unaligned,
273 }
274 }
275}
276
277impl Default for TextAligner {
278 fn default() -> Self {
279 Self::new()
280 }
281}
282
283#[derive(Debug, Clone)]
285pub struct AlignmentStats {
286 pub total: usize,
287 pub exact: usize,
288 pub fuzzy: usize,
289 pub lesser: usize,
290 pub greater: usize,
291 pub unaligned: usize,
292}
293
294impl AlignmentStats {
295 pub fn success_rate(&self) -> f32 {
297 if self.total == 0 {
298 1.0
299 } else {
300 (self.total - self.unaligned) as f32 / self.total as f32
301 }
302 }
303
304 pub fn exact_match_rate(&self) -> f32 {
306 if self.total == 0 {
307 0.0
308 } else {
309 self.exact as f32 / self.total as f32
310 }
311 }
312}
313
314#[cfg(test)]
315mod tests {
316 use super::*;
317
318 #[test]
319 fn test_exact_alignment() {
320 let aligner = TextAligner::new();
321 let mut extraction = Extraction::new("person".to_string(), "John Doe".to_string());
322 let source_text = "Hello, John Doe is a software engineer.";
323
324 let result = aligner.align_single_extraction(&mut extraction, source_text, 0).unwrap();
325
326 assert!(result.is_some());
327 let interval = result.unwrap();
328 assert_eq!(interval.start_pos, Some(7));
329 assert_eq!(interval.end_pos, Some(15));
330 assert_eq!(extraction.alignment_status, Some(AlignmentStatus::MatchExact));
331 }
332
333 #[test]
334 fn test_case_insensitive_alignment() {
335 let aligner = TextAligner::new();
336 let mut extraction = Extraction::new("person".to_string(), "JOHN DOE".to_string());
337 let source_text = "Hello, john doe is a software engineer.";
338
339 let result = aligner.align_single_extraction(&mut extraction, source_text, 0).unwrap();
340
341 assert!(result.is_some());
342 let interval = result.unwrap();
343 assert_eq!(interval.start_pos, Some(7));
344 assert_eq!(interval.end_pos, Some(15));
345 assert_eq!(extraction.alignment_status, Some(AlignmentStatus::MatchExact));
346 }
347
348 #[test]
349 fn test_fuzzy_alignment() {
350 let aligner = TextAligner::new();
351 let mut extraction = Extraction::new("person".to_string(), "John Smith".to_string());
352 let source_text = "Hello, John is a software engineer named Smith.";
353
354 let result = aligner.align_single_extraction(&mut extraction, source_text, 0).unwrap();
355
356 assert!(result.is_some());
357 assert_eq!(extraction.alignment_status, Some(AlignmentStatus::MatchFuzzy));
358 }
359
360 #[test]
361 fn test_no_alignment() {
362 let aligner = TextAligner::new();
363 let mut extraction = Extraction::new("person".to_string(), "Jane Doe".to_string());
364 let source_text = "Hello, John Smith is a software engineer.";
365
366 let result = aligner.align_single_extraction(&mut extraction, source_text, 0).unwrap();
367
368 assert!(result.is_none());
369 assert_eq!(extraction.alignment_status, None);
370 }
371
372 #[test]
373 fn test_chunk_offset() {
374 let aligner = TextAligner::new();
375 let mut extraction = Extraction::new("person".to_string(), "John Doe".to_string());
376 let chunk_text = "John Doe is here.";
377 let chunk_offset = 100;
378
379 let result = aligner.align_single_extraction(&mut extraction, chunk_text, chunk_offset).unwrap();
380
381 assert!(result.is_some());
382 let interval = result.unwrap();
383 assert_eq!(interval.start_pos, Some(100)); assert_eq!(interval.end_pos, Some(108)); }
386
387 #[test]
388 fn test_alignment_stats() {
389 let aligner = TextAligner::new();
390 let extractions = vec![
391 Extraction {
392 extraction_class: "test".to_string(),
393 extraction_text: "test".to_string(),
394 alignment_status: Some(AlignmentStatus::MatchExact),
395 ..Default::default()
396 },
397 Extraction {
398 extraction_class: "test".to_string(),
399 extraction_text: "test".to_string(),
400 alignment_status: Some(AlignmentStatus::MatchFuzzy),
401 ..Default::default()
402 },
403 Extraction {
404 extraction_class: "test".to_string(),
405 extraction_text: "test".to_string(),
406 alignment_status: None,
407 ..Default::default()
408 },
409 ];
410
411 let stats = aligner.get_alignment_stats(&extractions);
412 assert_eq!(stats.total, 3);
413 assert_eq!(stats.exact, 1);
414 assert_eq!(stats.fuzzy, 1);
415 assert_eq!(stats.unaligned, 1);
416 assert_eq!(stats.success_rate(), 2.0 / 3.0);
417 assert_eq!(stats.exact_match_rate(), 1.0 / 3.0);
418 }
419}