1use regex::Regex;
7use std::collections::HashMap;
8
9#[derive(Debug, Clone)]
11pub struct TextValidationResult {
12 pub found: bool,
14 pub matches: Vec<TextMatch>,
16 pub confidence: f64,
18 pub metadata: HashMap<String, String>,
20}
21
22#[derive(Debug, Clone)]
24pub struct TextMatch {
25 pub text: String,
27 pub position: usize,
29 pub length: usize,
31 pub confidence: f64,
33 pub match_type: MatchType,
35}
36
/// Category assigned to a located span of text.
///
/// Implements `Eq` and `Hash` so values can be used as set members or map keys,
/// for example to group matches by category.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MatchType {
    /// A calendar date.
    Date,
    /// A contract or agreement reference.
    ContractNumber,
    /// The name of a contracting party.
    PartyName,
    /// A sum of money.
    MonetaryAmount,
    /// A place.
    Location,
    /// Any other category, identified by a caller-chosen label.
    Custom(String),
}
47
48pub struct TextValidator {
50 date_patterns: Vec<Regex>,
52 contract_patterns: Vec<Regex>,
54 #[allow(dead_code)]
56 custom_patterns: HashMap<String, Regex>,
57}
58
59impl TextValidator {
60 pub fn new() -> Self {
62 let mut validator = Self {
63 date_patterns: Vec::new(),
64 contract_patterns: Vec::new(),
65 custom_patterns: HashMap::new(),
66 };
67
68 validator.init_default_patterns();
69 validator
70 }
71
72 fn init_default_patterns(&mut self) {
74 let date_patterns = vec![
76 r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b",
78 r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b",
80 r"\b\d{1,2}[\/\-]\d{1,2}[\/\-]\d{4}\b",
82 r"\b\d{4}[\/\-]\d{1,2}[\/\-]\d{1,2}\b",
84 ];
85
86 for pattern in date_patterns {
87 if let Ok(regex) = Regex::new(&format!("(?i){}", pattern)) {
88 self.date_patterns.push(regex);
89 }
90 }
91
92 let contract_patterns = vec![
94 r"\b(?:Agreement|Contract)\s+(?:No\.?|Number)?\s*:?\s*([A-Z0-9\-\/]+)",
96 r"\b([A-Z][A-Za-z\s&,\.]+(?:LLC|Ltd|Corp|Corporation|Inc|Company|Co\.)\b)",
98 r"\$\s*[\d,]+(?:\.\d{2})?(?:\s*(?:million|thousand|M|K))?",
100 ];
101
102 for pattern in contract_patterns {
103 if let Ok(regex) = Regex::new(&format!("(?i){}", pattern)) {
104 self.contract_patterns.push(regex);
105 }
106 }
107 }
108
109 pub fn search_for_target(&self, text: &str, target: &str) -> TextValidationResult {
111 let target_lower = target.to_lowercase();
112 let text_lower = text.to_lowercase();
113
114 let mut matches = Vec::new();
115 let mut position = 0;
116
117 while let Some(found_pos) = text_lower[position..].find(&target_lower) {
119 let actual_pos = position + found_pos;
120 let actual_text = &text[actual_pos..actual_pos + target.len()];
121
122 matches.push(TextMatch {
123 text: actual_text.to_string(),
124 position: actual_pos,
125 length: target.len(),
126 confidence: calculate_string_similarity(
127 &target_lower,
128 &text_lower[actual_pos..actual_pos + target.len()],
129 ),
130 match_type: MatchType::Custom("target_search".to_string()),
131 });
132
133 position = actual_pos + 1;
134 }
135
136 TextValidationResult {
137 found: !matches.is_empty(),
138 confidence: if matches.is_empty() {
139 0.0
140 } else {
141 matches.iter().map(|m| m.confidence).sum::<f64>() / matches.len() as f64
142 },
143 matches,
144 metadata: HashMap::new(),
145 }
146 }
147
148 pub fn validate_contract_text(&self, text: &str) -> TextValidationResult {
150 let mut all_matches = Vec::new();
151 let mut metadata = HashMap::new();
152
153 for pattern in &self.date_patterns {
155 for mat in pattern.find_iter(text) {
156 all_matches.push(TextMatch {
157 text: mat.as_str().to_string(),
158 position: mat.start(),
159 length: mat.len(),
160 confidence: 0.9, match_type: MatchType::Date,
162 });
163 }
164 }
165
166 for pattern in &self.contract_patterns {
168 for mat in pattern.find_iter(text) {
169 let match_text = mat.as_str().to_string();
170 let match_type = if match_text.contains("$") {
171 MatchType::MonetaryAmount
172 } else if match_text.to_lowercase().contains("agreement")
173 || match_text.to_lowercase().contains("contract")
174 {
175 MatchType::ContractNumber
176 } else {
177 MatchType::PartyName
178 };
179
180 all_matches.push(TextMatch {
181 text: match_text,
182 position: mat.start(),
183 length: mat.len(),
184 confidence: 0.8,
185 match_type,
186 });
187 }
188 }
189
190 let confidence = if all_matches.is_empty() {
192 0.0
193 } else {
194 all_matches.iter().map(|m| m.confidence).sum::<f64>() / all_matches.len() as f64
195 };
196
197 metadata.insert("total_matches".to_string(), all_matches.len().to_string());
199 metadata.insert("text_length".to_string(), text.len().to_string());
200
201 let date_matches = all_matches
202 .iter()
203 .filter(|m| m.match_type == MatchType::Date)
204 .count();
205 metadata.insert("date_matches".to_string(), date_matches.to_string());
206
207 TextValidationResult {
208 found: !all_matches.is_empty(),
209 confidence,
210 matches: all_matches,
211 metadata,
212 }
213 }
214
215 pub fn extract_key_info(&self, text: &str) -> HashMap<String, Vec<String>> {
217 let mut extracted = HashMap::new();
218
219 let mut dates = Vec::new();
221 for pattern in &self.date_patterns {
222 for mat in pattern.find_iter(text) {
223 dates.push(mat.as_str().to_string());
224 }
225 }
226 if !dates.is_empty() {
227 extracted.insert("dates".to_string(), dates);
228 }
229
230 if let Ok(money_regex) =
232 Regex::new(r"\$\s*[\d,]+(?:\.\d{2})?(?:\s*(?:million|thousand|M|K))?")
233 {
234 let mut amounts = Vec::new();
235 for mat in money_regex.find_iter(text) {
236 amounts.push(mat.as_str().to_string());
237 }
238 if !amounts.is_empty() {
239 extracted.insert("monetary_amounts".to_string(), amounts);
240 }
241 }
242
243 if let Ok(org_regex) =
245 Regex::new(r"\b([A-Z][A-Za-z\s&,\.]+(?:LLC|Ltd|Corp|Corporation|Inc|Company|Co\.)\b)")
246 {
247 let mut organizations = Vec::new();
248 for mat in org_regex.find_iter(text) {
249 organizations.push(mat.as_str().to_string());
250 }
251 if !organizations.is_empty() {
252 extracted.insert("organizations".to_string(), organizations);
253 }
254 }
255
256 extracted
257 }
258}
259
260impl Default for TextValidator {
261 fn default() -> Self {
262 Self::new()
263 }
264}
265
/// Scores how alike two strings are, from `0.0` to `1.0`.
///
/// Identical strings (including two empty strings) score `1.0`. Otherwise, if either
/// string is empty the score is `0.0`. In the remaining cases the strings are compared
/// character by character at the same index, and the score is the number of equal
/// positions divided by the character count of the longer string.
fn calculate_string_similarity(s1: &str, s2: &str) -> f64 {
    if s1 == s2 {
        return 1.0;
    }

    let len1 = s1.chars().count();
    let len2 = s2.chars().count();
    if len1 == 0 || len2 == 0 {
        return 0.0;
    }

    // `zip` stops at the end of the shorter string, so surplus characters of the longer
    // one count as mismatches through the denominator.
    let same_position = s1
        .chars()
        .zip(s2.chars())
        .filter(|(left, right)| left == right)
        .count();

    same_position as f64 / len1.max(len2) as f64
}
292
#[cfg(test)]
mod tests {
    use super::*;

    /// Counts the matches in `result` that carry the given category.
    fn count_of(result: &TextValidationResult, wanted: &MatchType) -> usize {
        result
            .matches
            .iter()
            .filter(|m| &m.match_type == wanted)
            .count()
    }

    /// Written-out dates in running text are reported as `Date` matches.
    #[test]
    fn test_date_validation() {
        let text =
            "This agreement was signed on 30 September 2016 and expires on December 31, 2020.";

        let result = TextValidator::new().validate_contract_text(text);

        assert!(result.found);
        assert!(count_of(&result, &MatchType::Date) > 0);
    }

    /// A literal target is found once and returned exactly as it appears in the text.
    #[test]
    fn test_target_search() {
        let text = "The contract was executed on 30 September 2016 by both parties.";

        let result = TextValidator::new().search_for_target(text, "30 September 2016");

        assert!(result.found);
        assert_eq!(result.matches.len(), 1);
        assert_eq!(result.matches[0].text, "30 September 2016");
    }

    /// Dates, amounts and organisations are all extracted from one sentence.
    #[test]
    fn test_key_info_extraction() {
        let text =
            "Agreement between ABC Corp and XYZ LLC for $1,000,000 signed on 30 September 2016.";

        let extracted = TextValidator::new().extract_key_info(text);

        for key in ["dates", "monetary_amounts", "organizations"].iter() {
            assert!(extracted.contains_key(*key));
        }
    }

    /// Identical strings score exactly 1.0.
    #[test]
    fn test_string_similarity_identical() {
        assert_eq!(calculate_string_similarity("hello", "hello"), 1.0);
    }

    /// One empty side scores 0.0; two empty strings are identical and score 1.0.
    #[test]
    fn test_string_similarity_empty() {
        assert_eq!(calculate_string_similarity("", "test"), 0.0);
        assert_eq!(calculate_string_similarity("test", ""), 0.0);
        assert_eq!(calculate_string_similarity("", ""), 1.0);
    }

    /// A single differing character gives a score strictly between 0.5 and 1.0.
    #[test]
    fn test_string_similarity_partial() {
        let score = calculate_string_similarity("hello", "hella");

        assert!(score > 0.5);
        assert!(score < 1.0);
    }

    /// A large length difference drags the score below 0.5.
    #[test]
    fn test_string_similarity_different_lengths() {
        let score = calculate_string_similarity("hi", "hello");

        assert!(score < 0.5);
    }

    /// A missing target yields an empty, zero-confidence result.
    #[test]
    fn test_target_search_not_found() {
        let text = "This text does not contain the target.";

        let result = TextValidator::new().search_for_target(text, "nonexistent phrase");

        assert!(!result.found);
        assert!(result.matches.is_empty());
        assert_eq!(result.confidence, 0.0);
    }

    /// Every occurrence of the target is reported.
    #[test]
    fn test_target_search_multiple_occurrences() {
        let text = "The date is 2016 and year 2016 was important. Also 2016.";

        let result = TextValidator::new().search_for_target(text, "2016");

        assert!(result.found);
        assert_eq!(result.matches.len(), 3);
    }

    /// Letter case is ignored when searching for the target.
    #[test]
    fn test_target_search_case_insensitive() {
        let text = "Hello WORLD and hello world";

        let result = TextValidator::new().search_for_target(text, "hello");

        assert!(result.found);
        assert_eq!(result.matches.len(), 2);
    }

    /// Text without any recognisable element produces an empty result and a zero count.
    #[test]
    fn test_validate_contract_no_matches() {
        let text = "just some random text without dates or amounts";

        let result = TextValidator::new().validate_contract_text(text);

        assert!(!result.found);
        assert!(result.matches.is_empty());
        assert_eq!(result.confidence, 0.0);
        assert_eq!(result.metadata["total_matches"], "0");
    }

    /// Each variant equals itself, and distinct variants differ.
    #[test]
    fn test_match_type_variants() {
        assert_eq!(MatchType::Date, MatchType::Date);
        assert_eq!(MatchType::ContractNumber, MatchType::ContractNumber);
        assert_eq!(MatchType::PartyName, MatchType::PartyName);
        assert_eq!(MatchType::MonetaryAmount, MatchType::MonetaryAmount);
        assert_eq!(MatchType::Location, MatchType::Location);
        assert_eq!(
            MatchType::Custom("test".to_string()),
            MatchType::Custom("test".to_string())
        );
        assert_ne!(MatchType::Date, MatchType::ContractNumber);
    }

    /// `Default` builds a validator that already knows the numeric date formats.
    #[test]
    fn test_text_validator_default() {
        let result = TextValidator::default().validate_contract_text("Signed on 01/01/2020");

        assert!(result.found);
    }

    /// Dollar amounts are categorised as `MonetaryAmount`.
    #[test]
    fn test_monetary_amount_match_type() {
        let text = "The amount is $50,000.00 payable immediately.";

        let result = TextValidator::new().validate_contract_text(text);

        assert!(count_of(&result, &MatchType::MonetaryAmount) > 0);
    }

    /// Plain text yields no extracted keys at all.
    #[test]
    fn test_extract_key_info_no_matches() {
        let text = "Simple text with no special elements";

        let extracted = TextValidator::new().extract_key_info(text);

        for key in ["dates", "monetary_amounts", "organizations"].iter() {
            assert!(!extracted.contains_key(*key));
        }
    }

    /// Validation always records the three metadata keys, and `text_length` is in bytes.
    #[test]
    fn test_validation_metadata() {
        let text = "Agreement dated 30 September 2016 for $100,000";

        let result = TextValidator::new().validate_contract_text(text);

        for key in ["total_matches", "text_length", "date_matches"].iter() {
            assert!(result.metadata.contains_key(*key));
        }
        assert_eq!(result.metadata["text_length"], text.len().to_string());
    }
}