langextract_rust/
alignment.rs

1//! Text alignment functionality for mapping extractions to source text positions.
2//!
3//! This module provides algorithms to align extracted text with their positions
4//! in the original source text, supporting both exact and fuzzy matching.
5
6use crate::{
7    data::{AlignmentStatus, CharInterval, Extraction},
8    exceptions::LangExtractResult,
9};
10use std::cmp::min;
11
12/// Configuration for text alignment
13#[derive(Debug, Clone)]
14pub struct AlignmentConfig {
15    /// Enable fuzzy alignment when exact matching fails
16    pub enable_fuzzy_alignment: bool,
17    /// Minimum overlap ratio for fuzzy alignment (0.0 to 1.0)
18    pub fuzzy_alignment_threshold: f32,
19    /// Accept partial exact matches (MATCH_LESSER status)
20    pub accept_match_lesser: bool,
21    /// Case-sensitive matching
22    pub case_sensitive: bool,
23    /// Maximum search window size for fuzzy matching
24    pub max_search_window: usize,
25}
26
27impl Default for AlignmentConfig {
28    fn default() -> Self {
29        Self {
30            enable_fuzzy_alignment: true,
31            fuzzy_alignment_threshold: 0.4, // Lower threshold for better fuzzy matching
32            accept_match_lesser: true,
33            case_sensitive: false,
34            max_search_window: 100,
35        }
36    }
37}
38
39/// Text aligner for mapping extractions to source text positions
40pub struct TextAligner {
41    config: AlignmentConfig,
42}
43
44impl TextAligner {
45    /// Create a new text aligner with default configuration
46    pub fn new() -> Self {
47        Self {
48            config: AlignmentConfig::default(),
49        }
50    }
51
52    /// Create a new text aligner with custom configuration
53    pub fn with_config(config: AlignmentConfig) -> Self {
54        Self { config }
55    }
56
57    /// Align extractions with the source text
58    pub fn align_extractions(
59        &self,
60        extractions: &mut [Extraction],
61        source_text: &str,
62        char_offset: usize,
63    ) -> LangExtractResult<usize> {
64        let mut aligned_count = 0;
65
66        for extraction in extractions.iter_mut() {
67            if let Some(interval) = self.align_single_extraction(extraction, source_text, char_offset)? {
68                extraction.char_interval = Some(interval);
69                aligned_count += 1;
70            }
71        }
72
73        Ok(aligned_count)
74    }
75
76    /// Align a single extraction with the source text
77    pub fn align_single_extraction(
78        &self,
79        extraction: &mut Extraction,
80        source_text: &str,
81        char_offset: usize,
82    ) -> LangExtractResult<Option<CharInterval>> {
83        let extraction_text = if self.config.case_sensitive {
84            extraction.extraction_text.clone()
85        } else {
86            extraction.extraction_text.to_lowercase()
87        };
88
89        let search_text = if self.config.case_sensitive {
90            source_text.to_string()
91        } else {
92            source_text.to_lowercase()
93        };
94
95        // Try exact matching first
96        if let Some((start, end, status)) = self.find_exact_match(&extraction_text, &search_text) {
97            extraction.alignment_status = Some(status);
98            return Ok(Some(CharInterval::new(
99                Some(start + char_offset),
100                Some(end + char_offset),
101            )));
102        }
103
104        // Try fuzzy matching if enabled
105        if self.config.enable_fuzzy_alignment {
106            if let Some((start, end, status)) = self.find_fuzzy_match(&extraction_text, &search_text) {
107                extraction.alignment_status = Some(status);
108                return Ok(Some(CharInterval::new(
109                    Some(start + char_offset),
110                    Some(end + char_offset),
111                )));
112            }
113        }
114
115        // No alignment found
116        extraction.alignment_status = None;
117        Ok(None)
118    }
119
120    /// Find exact matches in the source text
121    fn find_exact_match(&self, extraction_text: &str, source_text: &str) -> Option<(usize, usize, AlignmentStatus)> {
122        // Try to find the exact extraction text
123        if let Some(start) = source_text.find(extraction_text) {
124            let end = start + extraction_text.len();
125            return Some((start, end, AlignmentStatus::MatchExact));
126        }
127
128        // Try to find the extraction text as a substring (MATCH_LESSER)
129        if self.config.accept_match_lesser {
130            // Look for words from the extraction text
131            let extraction_words: Vec<&str> = extraction_text.split_whitespace().collect();
132            if extraction_words.len() > 1 {
133                // Try to find the first and last words
134                if let (Some(first_word), Some(last_word)) = (extraction_words.first(), extraction_words.last()) {
135                    if let Some(first_start) = source_text.find(first_word) {
136                        if let Some(last_start) = source_text[first_start..].find(last_word) {
137                            let last_absolute_start = first_start + last_start;
138                            let last_end = last_absolute_start + last_word.len();
139                            
140                            // Check if this span is reasonable (not too long)
141                            if last_end - first_start < extraction_text.len() * 2 {
142                                return Some((first_start, last_end, AlignmentStatus::MatchLesser));
143                            }
144                        }
145                    }
146                }
147            }
148        }
149
150        None
151    }
152
153    /// Find fuzzy matches using sliding window approach
154    fn find_fuzzy_match(&self, extraction_text: &str, source_text: &str) -> Option<(usize, usize, AlignmentStatus)> {
155        let extraction_words: Vec<&str> = extraction_text.split_whitespace().collect();
156        let source_words: Vec<&str> = source_text.split_whitespace().collect();
157
158        if extraction_words.is_empty() || source_words.is_empty() {
159            return None;
160        }
161
162        let mut best_match: Option<(usize, usize, f32)> = None;
163        
164        // Try different window sizes, starting with a reasonable size
165        let max_window = min(source_words.len(), self.config.max_search_window);
166        let min_window = extraction_words.len();
167        
168        for window_size in min_window..=max_window {
169            for start_idx in 0..=source_words.len().saturating_sub(window_size) {
170                let end_idx = start_idx + window_size;
171                let window = &source_words[start_idx..end_idx];
172
173                let similarity = self.calculate_word_similarity(&extraction_words, window);
174                
175                if similarity >= self.config.fuzzy_alignment_threshold {
176                    if let Some((_, _, current_best)) = best_match {
177                        if similarity > current_best {
178                            best_match = Some((start_idx, end_idx, similarity));
179                        }
180                    } else {
181                        best_match = Some((start_idx, end_idx, similarity));
182                    }
183                }
184            }
185            
186            // If we found a good match with a smaller window, prefer it
187            if best_match.is_some() {
188                break;
189            }
190        }
191
192        // Convert word positions back to character positions
193        if let Some((start_word_idx, end_word_idx, _)) = best_match {
194            let char_start = if start_word_idx == 0 {
195                0
196            } else {
197                source_words[..start_word_idx].join(" ").len() + 1
198            };
199
200            let char_end = if end_word_idx >= source_words.len() {
201                source_text.len()
202            } else {
203                source_words[..end_word_idx].join(" ").len()
204            };
205
206            return Some((char_start, char_end, AlignmentStatus::MatchFuzzy));
207        }
208
209        None
210    }
211
212    /// Calculate similarity between two word sequences using coverage-based similarity
213    fn calculate_word_similarity(&self, words1: &[&str], words2: &[&str]) -> f32 {
214        if words1.is_empty() && words2.is_empty() {
215            return 1.0;
216        }
217        if words1.is_empty() || words2.is_empty() {
218            return 0.0;
219        }
220
221        // Convert to lowercase for case-insensitive comparison
222        let normalized_words1: Vec<String> = words1.iter().map(|w| w.to_lowercase()).collect();
223        let normalized_words2: Vec<String> = words2.iter().map(|w| w.to_lowercase()).collect();
224
225        // Count how many words from extraction are found in the source window
226        let mut found_count = 0;
227        for word1 in &normalized_words1 {
228            if normalized_words2.contains(word1) {
229                found_count += 1;
230            }
231        }
232
233        // Calculate coverage: what percentage of extraction words are found
234        found_count as f32 / normalized_words1.len() as f32
235    }
236
237    /// Align extractions for chunked text processing
238    pub fn align_chunk_extractions(
239        &self,
240        extractions: &mut [Extraction],
241        chunk_text: &str,
242        chunk_char_offset: usize,
243    ) -> LangExtractResult<usize> {
244        self.align_extractions(extractions, chunk_text, chunk_char_offset)
245    }
246
247    /// Get alignment statistics
248    pub fn get_alignment_stats(&self, extractions: &[Extraction]) -> AlignmentStats {
249        let total = extractions.len();
250        let mut exact = 0;
251        let mut fuzzy = 0;
252        let mut lesser = 0;
253        let mut greater = 0;
254        let mut unaligned = 0;
255
256        for extraction in extractions {
257            match extraction.alignment_status {
258                Some(AlignmentStatus::MatchExact) => exact += 1,
259                Some(AlignmentStatus::MatchFuzzy) => fuzzy += 1,
260                Some(AlignmentStatus::MatchLesser) => lesser += 1,
261                Some(AlignmentStatus::MatchGreater) => greater += 1,
262                None => unaligned += 1,
263            }
264        }
265
266        AlignmentStats {
267            total,
268            exact,
269            fuzzy,
270            lesser,
271            greater,
272            unaligned,
273        }
274    }
275}
276
277impl Default for TextAligner {
278    fn default() -> Self {
279        Self::new()
280    }
281}
282
283/// Statistics about alignment results
284#[derive(Debug, Clone)]
285pub struct AlignmentStats {
286    pub total: usize,
287    pub exact: usize,
288    pub fuzzy: usize,
289    pub lesser: usize,
290    pub greater: usize,
291    pub unaligned: usize,
292}
293
294impl AlignmentStats {
295    /// Get the alignment success rate (0.0 to 1.0)
296    pub fn success_rate(&self) -> f32 {
297        if self.total == 0 {
298            1.0
299        } else {
300            (self.total - self.unaligned) as f32 / self.total as f32
301        }
302    }
303
304    /// Get the exact match rate (0.0 to 1.0)
305    pub fn exact_match_rate(&self) -> f32 {
306        if self.total == 0 {
307            0.0
308        } else {
309            self.exact as f32 / self.total as f32
310        }
311    }
312}
313
314#[cfg(test)]
315mod tests {
316    use super::*;
317
318    #[test]
319    fn test_exact_alignment() {
320        let aligner = TextAligner::new();
321        let mut extraction = Extraction::new("person".to_string(), "John Doe".to_string());
322        let source_text = "Hello, John Doe is a software engineer.";
323
324        let result = aligner.align_single_extraction(&mut extraction, source_text, 0).unwrap();
325
326        assert!(result.is_some());
327        let interval = result.unwrap();
328        assert_eq!(interval.start_pos, Some(7));
329        assert_eq!(interval.end_pos, Some(15));
330        assert_eq!(extraction.alignment_status, Some(AlignmentStatus::MatchExact));
331    }
332
333    #[test]
334    fn test_case_insensitive_alignment() {
335        let aligner = TextAligner::new();
336        let mut extraction = Extraction::new("person".to_string(), "JOHN DOE".to_string());
337        let source_text = "Hello, john doe is a software engineer.";
338
339        let result = aligner.align_single_extraction(&mut extraction, source_text, 0).unwrap();
340
341        assert!(result.is_some());
342        let interval = result.unwrap();
343        assert_eq!(interval.start_pos, Some(7));
344        assert_eq!(interval.end_pos, Some(15));
345        assert_eq!(extraction.alignment_status, Some(AlignmentStatus::MatchExact));
346    }
347
348    #[test]
349    fn test_fuzzy_alignment() {
350        let aligner = TextAligner::new();
351        let mut extraction = Extraction::new("person".to_string(), "John Smith".to_string());
352        let source_text = "Hello, John is a software engineer named Smith.";
353
354        let result = aligner.align_single_extraction(&mut extraction, source_text, 0).unwrap();
355
356        assert!(result.is_some());
357        assert_eq!(extraction.alignment_status, Some(AlignmentStatus::MatchFuzzy));
358    }
359
360    #[test]
361    fn test_no_alignment() {
362        let aligner = TextAligner::new();
363        let mut extraction = Extraction::new("person".to_string(), "Jane Doe".to_string());
364        let source_text = "Hello, John Smith is a software engineer.";
365
366        let result = aligner.align_single_extraction(&mut extraction, source_text, 0).unwrap();
367
368        assert!(result.is_none());
369        assert_eq!(extraction.alignment_status, None);
370    }
371
372    #[test]
373    fn test_chunk_offset() {
374        let aligner = TextAligner::new();
375        let mut extraction = Extraction::new("person".to_string(), "John Doe".to_string());
376        let chunk_text = "John Doe is here.";
377        let chunk_offset = 100;
378
379        let result = aligner.align_single_extraction(&mut extraction, chunk_text, chunk_offset).unwrap();
380
381        assert!(result.is_some());
382        let interval = result.unwrap();
383        assert_eq!(interval.start_pos, Some(100)); // 0 + 100
384        assert_eq!(interval.end_pos, Some(108));   // 8 + 100
385    }
386
387    #[test]
388    fn test_alignment_stats() {
389        let aligner = TextAligner::new();
390        let extractions = vec![
391            Extraction {
392                extraction_class: "test".to_string(),
393                extraction_text: "test".to_string(),
394                alignment_status: Some(AlignmentStatus::MatchExact),
395                ..Default::default()
396            },
397            Extraction {
398                extraction_class: "test".to_string(),
399                extraction_text: "test".to_string(),
400                alignment_status: Some(AlignmentStatus::MatchFuzzy),
401                ..Default::default()
402            },
403            Extraction {
404                extraction_class: "test".to_string(),
405                extraction_text: "test".to_string(),
406                alignment_status: None,
407                ..Default::default()
408            },
409        ];
410
411        let stats = aligner.get_alignment_stats(&extractions);
412        assert_eq!(stats.total, 3);
413        assert_eq!(stats.exact, 1);
414        assert_eq!(stats.fuzzy, 1);
415        assert_eq!(stats.unaligned, 1);
416        assert_eq!(stats.success_rate(), 2.0 / 3.0);
417        assert_eq!(stats.exact_match_rate(), 1.0 / 3.0);
418    }
419}