Skip to main content

oxidize_pdf/text/
validation.rs

1//! Text validation and search utilities for OCR results
2//!
3//! This module provides functionality for validating and searching through
4//! OCR-extracted text to find key elements like dates, contract terms, etc.
5
6use regex::Regex;
7use std::collections::HashMap;
8
9/// Results from searching and validating OCR text
10#[derive(Debug, Clone)]
11pub struct TextValidationResult {
12    /// Whether the target string was found
13    pub found: bool,
14    /// All matches found
15    pub matches: Vec<TextMatch>,
16    /// Confidence score of the overall validation
17    pub confidence: f64,
18    /// Additional metadata extracted
19    pub metadata: HashMap<String, String>,
20}
21
22/// A specific match found in the text
23#[derive(Debug, Clone)]
24pub struct TextMatch {
25    /// The matched text
26    pub text: String,
27    /// Position in the original text
28    pub position: usize,
29    /// Length of the match
30    pub length: usize,
31    /// Confidence of this specific match
32    pub confidence: f64,
33    /// Type of match (date, name, etc.)
34    pub match_type: MatchType,
35}
36
/// Category assigned to a piece of matched text.
///
/// Every variant holds only fully comparable data, so the type implements
/// `Eq` and `Hash` in addition to `PartialEq`; this lets it be used as a
/// `HashMap`/`HashSet` key, for example to group matches by category.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MatchType {
    /// A calendar date.
    Date,
    /// An agreement or contract identifier.
    ContractNumber,
    /// The name of a party, such as an organization.
    PartyName,
    /// An amount of money.
    MonetaryAmount,
    /// A place or location.
    Location,
    /// A caller-defined category identified by its label.
    Custom(String),
}
47
48/// Text validator for OCR results
49pub struct TextValidator {
50    /// Date patterns to search for
51    date_patterns: Vec<Regex>,
52    /// Contract-specific patterns
53    contract_patterns: Vec<Regex>,
54    /// Custom patterns (reserved for future use)
55    #[allow(dead_code)]
56    custom_patterns: HashMap<String, Regex>,
57}
58
59impl TextValidator {
60    /// Create a new text validator with default patterns
61    pub fn new() -> Self {
62        let mut validator = Self {
63            date_patterns: Vec::new(),
64            contract_patterns: Vec::new(),
65            custom_patterns: HashMap::new(),
66        };
67
68        validator.init_default_patterns();
69        validator
70    }
71
72    /// Initialize default patterns for common contract elements
73    fn init_default_patterns(&mut self) {
74        // Date patterns - various formats
75        let date_patterns = vec![
76            // "30 September 2016", "September 30, 2016", etc.
77            r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b",
78            // "September 30, 2016"
79            r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b",
80            // "30/09/2016", "09/30/2016"
81            r"\b\d{1,2}[\/\-]\d{1,2}[\/\-]\d{4}\b",
82            // "2016-09-30"
83            r"\b\d{4}[\/\-]\d{1,2}[\/\-]\d{1,2}\b",
84        ];
85
86        for pattern in date_patterns {
87            if let Ok(regex) = Regex::new(&format!("(?i){}", pattern)) {
88                self.date_patterns.push(regex);
89            }
90        }
91
92        // Contract-specific patterns
93        let contract_patterns = vec![
94            // Agreement numbers, contract numbers
95            r"\b(?:Agreement|Contract)\s+(?:No\.?|Number)?\s*:?\s*([A-Z0-9\-\/]+)",
96            // Party names (organizations ending with common suffixes)
97            r"\b([A-Z][A-Za-z\s&,\.]+(?:LLC|Ltd|Corp|Corporation|Inc|Company|Co\.)\b)",
98            // Monetary amounts
99            r"\$\s*[\d,]+(?:\.\d{2})?(?:\s*(?:million|thousand|M|K))?",
100        ];
101
102        for pattern in contract_patterns {
103            if let Ok(regex) = Regex::new(&format!("(?i){}", pattern)) {
104                self.contract_patterns.push(regex);
105            }
106        }
107    }
108
109    /// Search for a specific target string in the text
110    pub fn search_for_target(&self, text: &str, target: &str) -> TextValidationResult {
111        let target_lower = target.to_lowercase();
112        let text_lower = text.to_lowercase();
113
114        let mut matches = Vec::new();
115        let mut position = 0;
116
117        // Find all occurrences of the target string
118        while let Some(found_pos) = text_lower[position..].find(&target_lower) {
119            let actual_pos = position + found_pos;
120            let actual_text = &text[actual_pos..actual_pos + target.len()];
121
122            matches.push(TextMatch {
123                text: actual_text.to_string(),
124                position: actual_pos,
125                length: target.len(),
126                confidence: calculate_string_similarity(
127                    &target_lower,
128                    &text_lower[actual_pos..actual_pos + target.len()],
129                ),
130                match_type: MatchType::Custom("target_search".to_string()),
131            });
132
133            position = actual_pos + 1;
134        }
135
136        TextValidationResult {
137            found: !matches.is_empty(),
138            confidence: if matches.is_empty() {
139                0.0
140            } else {
141                matches.iter().map(|m| m.confidence).sum::<f64>() / matches.len() as f64
142            },
143            matches,
144            metadata: HashMap::new(),
145        }
146    }
147
148    /// Perform comprehensive validation of OCR text
149    pub fn validate_contract_text(&self, text: &str) -> TextValidationResult {
150        let mut all_matches = Vec::new();
151        let mut metadata = HashMap::new();
152
153        // Search for dates
154        for pattern in &self.date_patterns {
155            for mat in pattern.find_iter(text) {
156                all_matches.push(TextMatch {
157                    text: mat.as_str().to_string(),
158                    position: mat.start(),
159                    length: mat.len(),
160                    confidence: 0.9, // High confidence for regex matches
161                    match_type: MatchType::Date,
162                });
163            }
164        }
165
166        // Search for contract elements
167        for pattern in &self.contract_patterns {
168            for mat in pattern.find_iter(text) {
169                let match_text = mat.as_str().to_string();
170                let match_type = if match_text.contains("$") {
171                    MatchType::MonetaryAmount
172                } else if match_text.to_lowercase().contains("agreement")
173                    || match_text.to_lowercase().contains("contract")
174                {
175                    MatchType::ContractNumber
176                } else {
177                    MatchType::PartyName
178                };
179
180                all_matches.push(TextMatch {
181                    text: match_text,
182                    position: mat.start(),
183                    length: mat.len(),
184                    confidence: 0.8,
185                    match_type,
186                });
187            }
188        }
189
190        // Calculate overall confidence
191        let confidence = if all_matches.is_empty() {
192            0.0
193        } else {
194            all_matches.iter().map(|m| m.confidence).sum::<f64>() / all_matches.len() as f64
195        };
196
197        // Add metadata
198        metadata.insert("total_matches".to_string(), all_matches.len().to_string());
199        metadata.insert("text_length".to_string(), text.len().to_string());
200
201        let date_matches = all_matches
202            .iter()
203            .filter(|m| m.match_type == MatchType::Date)
204            .count();
205        metadata.insert("date_matches".to_string(), date_matches.to_string());
206
207        TextValidationResult {
208            found: !all_matches.is_empty(),
209            confidence,
210            matches: all_matches,
211            metadata,
212        }
213    }
214
215    /// Extract key information from contract text
216    pub fn extract_key_info(&self, text: &str) -> HashMap<String, Vec<String>> {
217        let mut extracted = HashMap::new();
218
219        // Extract dates
220        let mut dates = Vec::new();
221        for pattern in &self.date_patterns {
222            for mat in pattern.find_iter(text) {
223                dates.push(mat.as_str().to_string());
224            }
225        }
226        if !dates.is_empty() {
227            extracted.insert("dates".to_string(), dates);
228        }
229
230        // Extract monetary amounts
231        if let Ok(money_regex) =
232            Regex::new(r"\$\s*[\d,]+(?:\.\d{2})?(?:\s*(?:million|thousand|M|K))?")
233        {
234            let mut amounts = Vec::new();
235            for mat in money_regex.find_iter(text) {
236                amounts.push(mat.as_str().to_string());
237            }
238            if !amounts.is_empty() {
239                extracted.insert("monetary_amounts".to_string(), amounts);
240            }
241        }
242
243        // Extract potential party names (capitalized words followed by organization suffixes)
244        if let Ok(org_regex) =
245            Regex::new(r"\b([A-Z][A-Za-z\s&,\.]+(?:LLC|Ltd|Corp|Corporation|Inc|Company|Co\.)\b)")
246        {
247            let mut organizations = Vec::new();
248            for mat in org_regex.find_iter(text) {
249                organizations.push(mat.as_str().to_string());
250            }
251            if !organizations.is_empty() {
252                extracted.insert("organizations".to_string(), organizations);
253            }
254        }
255
256        extracted
257    }
258}
259
260impl Default for TextValidator {
261    fn default() -> Self {
262        Self::new()
263    }
264}
265
/// Positional similarity of two strings, in the range `0.0..=1.0`.
///
/// Identical strings (including two empty ones) score `1.0`. If exactly one
/// side is empty the score is `0.0`. Otherwise the score is the number of
/// character positions at which both strings agree, divided by the character
/// count of the longer string.
fn calculate_string_similarity(s1: &str, s2: &str) -> f64 {
    if s1 == s2 {
        return 1.0;
    }

    let first_len = s1.chars().count();
    let second_len = s2.chars().count();
    if first_len == 0 || second_len == 0 {
        return 0.0;
    }

    // `zip` stops at the shorter string, so only shared positions are compared.
    let agreeing = s1
        .chars()
        .zip(s2.chars())
        .filter(|(left, right)| left == right)
        .count();

    agreeing as f64 / first_len.max(second_len) as f64
}
292
#[cfg(test)]
mod tests {
    use super::*;

    /// Both a day-first and a month-first date appear; at least one must be tagged as a date.
    #[test]
    fn test_date_validation() {
        let result = TextValidator::new().validate_contract_text(
            "This agreement was signed on 30 September 2016 and expires on December 31, 2020.",
        );

        assert!(result.found);
        assert!(result
            .matches
            .iter()
            .any(|m| m.match_type == MatchType::Date));
    }

    /// A literal target that occurs once is reported exactly once, verbatim.
    #[test]
    fn test_target_search() {
        let text = "The contract was executed on 30 September 2016 by both parties.";
        let result = TextValidator::new().search_for_target(text, "30 September 2016");

        assert!(result.found);
        assert_eq!(result.matches.len(), 1);
        assert_eq!(result.matches[0].text, "30 September 2016");
    }

    /// Dates, amounts and organizations are all picked up from one sentence.
    #[test]
    fn test_key_info_extraction() {
        let extracted = TextValidator::new().extract_key_info(
            "Agreement between ABC Corp and XYZ LLC for $1,000,000 signed on 30 September 2016.",
        );

        for key in &["dates", "monetary_amounts", "organizations"] {
            assert!(extracted.contains_key(*key));
        }
    }

    #[test]
    fn test_string_similarity_identical() {
        assert_eq!(calculate_string_similarity("hello", "hello"), 1.0);
    }

    #[test]
    fn test_string_similarity_empty() {
        assert_eq!(calculate_string_similarity("", "test"), 0.0);
        assert_eq!(calculate_string_similarity("test", ""), 0.0);
        // Two empty strings compare equal, so they count as identical.
        assert_eq!(calculate_string_similarity("", ""), 1.0);
    }

    #[test]
    fn test_string_similarity_partial() {
        let score = calculate_string_similarity("hello", "hella");
        assert!(score > 0.5);
        assert!(score < 1.0);
    }

    #[test]
    fn test_string_similarity_different_lengths() {
        // Only a partial overlap, measured against the longer string.
        assert!(calculate_string_similarity("hi", "hello") < 0.5);
    }

    #[test]
    fn test_target_search_not_found() {
        let result = TextValidator::new()
            .search_for_target("This text does not contain the target.", "nonexistent phrase");

        assert!(!result.found);
        assert!(result.matches.is_empty());
        assert_eq!(result.confidence, 0.0);
    }

    #[test]
    fn test_target_search_multiple_occurrences() {
        let text = "The date is 2016 and year 2016 was important. Also 2016.";
        let result = TextValidator::new().search_for_target(text, "2016");

        assert!(result.found);
        assert_eq!(result.matches.len(), 3);
    }

    #[test]
    fn test_target_search_case_insensitive() {
        let result = TextValidator::new().search_for_target("Hello WORLD and hello world", "hello");

        assert!(result.found);
        assert_eq!(result.matches.len(), 2);
    }

    #[test]
    fn test_validate_contract_no_matches() {
        let result = TextValidator::new()
            .validate_contract_text("just some random text without dates or amounts");

        assert!(!result.found);
        assert!(result.matches.is_empty());
        assert_eq!(result.confidence, 0.0);
        assert_eq!(
            result.metadata.get("total_matches").map(String::as_str),
            Some("0")
        );
    }

    #[test]
    fn test_match_type_variants() {
        // Each variant equals itself.
        assert_eq!(MatchType::Date, MatchType::Date);
        assert_eq!(MatchType::ContractNumber, MatchType::ContractNumber);
        assert_eq!(MatchType::PartyName, MatchType::PartyName);
        assert_eq!(MatchType::MonetaryAmount, MatchType::MonetaryAmount);
        assert_eq!(MatchType::Location, MatchType::Location);

        let custom = MatchType::Custom("test".to_string());
        assert_eq!(custom, MatchType::Custom("test".to_string()));

        // Distinct variants differ.
        assert_ne!(MatchType::Date, MatchType::ContractNumber);
    }

    #[test]
    fn test_text_validator_default() {
        // A default-constructed validator must come with its patterns initialized.
        let result = TextValidator::default().validate_contract_text("Signed on 01/01/2020");
        assert!(result.found);
    }

    #[test]
    fn test_monetary_amount_match_type() {
        let result = TextValidator::new()
            .validate_contract_text("The amount is $50,000.00 payable immediately.");

        assert!(result
            .matches
            .iter()
            .any(|m| m.match_type == MatchType::MonetaryAmount));
    }

    #[test]
    fn test_extract_key_info_no_matches() {
        let extracted =
            TextValidator::new().extract_key_info("Simple text with no special elements");

        for key in &["dates", "monetary_amounts", "organizations"] {
            assert!(!extracted.contains_key(*key));
        }
    }

    #[test]
    fn test_validation_metadata() {
        let text = "Agreement dated 30 September 2016 for $100,000";
        let result = TextValidator::new().validate_contract_text(text);

        for key in &["total_matches", "text_length", "date_matches"] {
            assert!(result.metadata.contains_key(*key));
        }

        let expected_length = text.len().to_string();
        assert_eq!(result.metadata.get("text_length"), Some(&expected_length));
    }
}