use regex::Regex;
use std::collections::HashMap;
/// Aggregate outcome of one search or validation pass over a piece of text.
#[derive(Debug, Clone)]
pub struct TextValidationResult {
/// `true` when at least one match was recorded in `matches`.
pub found: bool,
/// Every recorded match, in the order it was discovered.
pub matches: Vec<TextMatch>,
/// Arithmetic mean of the per-match confidences; `0.0` when `matches` is empty.
pub confidence: f64,
/// String key/value details about the run; may be empty.
pub metadata: HashMap<String, String>,
}
/// A single span of interest located inside the searched text.
#[derive(Debug, Clone)]
pub struct TextMatch {
/// The matched text, copied out of the searched input.
pub text: String,
/// Byte offset of the start of the match within the searched input.
pub position: usize,
/// Length of the match in bytes (not characters).
pub length: usize,
/// Score attached by whichever routine produced the match; the producers in
/// this file only emit values between `0.0` and `1.0`.
pub confidence: f64,
/// Category the match was classified into.
pub match_type: MatchType,
}
/// Category assigned to a [`TextMatch`].
///
/// Derives `Eq` and `Hash` in addition to `PartialEq` so values can be used as
/// `HashMap`/`HashSet` keys, e.g. to group or count matches by category.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum MatchType {
    /// A calendar date.
    Date,
    /// A contract or agreement reference.
    ContractNumber,
    /// The name of a party to the contract, such as an organisation.
    PartyName,
    /// A monetary amount.
    MonetaryAmount,
    /// A place or address.
    Location,
    /// Any other category, identified by a free-form label.
    Custom(String),
}
/// Regex-driven scanner that looks for dates, contract references,
/// organisation names and monetary amounts in free text.
pub struct TextValidator {
/// Compiled patterns used to recognise dates.
date_patterns: Vec<Regex>,
/// Compiled patterns used to recognise contract references, organisation
/// names and monetary amounts.
contract_patterns: Vec<Regex>,
// Only ever initialised to an empty map and never read by the code visible in
// this file, hence the lint allowance below.
// NOTE(review): presumably reserved for caller-registered patterns keyed by
// name — confirm before removing.
#[allow(dead_code)]
custom_patterns: HashMap<String, Regex>,
}
impl TextValidator {
pub fn new() -> Self {
let mut validator = Self {
date_patterns: Vec::new(),
contract_patterns: Vec::new(),
custom_patterns: HashMap::new(),
};
validator.init_default_patterns();
validator
}
fn init_default_patterns(&mut self) {
let date_patterns = vec![
r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b",
r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b",
r"\b\d{1,2}[\/\-]\d{1,2}[\/\-]\d{4}\b",
r"\b\d{4}[\/\-]\d{1,2}[\/\-]\d{1,2}\b",
];
for pattern in date_patterns {
if let Ok(regex) = Regex::new(&format!("(?i){}", pattern)) {
self.date_patterns.push(regex);
}
}
let contract_patterns = vec![
r"\b(?:Agreement|Contract)\s+(?:No\.?|Number)?\s*:?\s*([A-Z0-9\-\/]+)",
r"\b([A-Z][A-Za-z\s&,\.]+(?:LLC|Ltd|Corp|Corporation|Inc|Company|Co\.)\b)",
r"\$\s*[\d,]+(?:\.\d{2})?(?:\s*(?:million|thousand|M|K))?",
];
for pattern in contract_patterns {
if let Ok(regex) = Regex::new(&format!("(?i){}", pattern)) {
self.contract_patterns.push(regex);
}
}
}
pub fn search_for_target(&self, text: &str, target: &str) -> TextValidationResult {
let target_lower = target.to_lowercase();
let text_lower = text.to_lowercase();
let mut matches = Vec::new();
let mut position = 0;
while let Some(found_pos) = text_lower[position..].find(&target_lower) {
let actual_pos = position + found_pos;
let actual_text = &text[actual_pos..actual_pos + target.len()];
matches.push(TextMatch {
text: actual_text.to_string(),
position: actual_pos,
length: target.len(),
confidence: calculate_string_similarity(
&target_lower,
&text_lower[actual_pos..actual_pos + target.len()],
),
match_type: MatchType::Custom("target_search".to_string()),
});
position = actual_pos + 1;
}
TextValidationResult {
found: !matches.is_empty(),
confidence: if matches.is_empty() {
0.0
} else {
matches.iter().map(|m| m.confidence).sum::<f64>() / matches.len() as f64
},
matches,
metadata: HashMap::new(),
}
}
pub fn validate_contract_text(&self, text: &str) -> TextValidationResult {
let mut all_matches = Vec::new();
let mut metadata = HashMap::new();
for pattern in &self.date_patterns {
for mat in pattern.find_iter(text) {
all_matches.push(TextMatch {
text: mat.as_str().to_string(),
position: mat.start(),
length: mat.len(),
confidence: 0.9, match_type: MatchType::Date,
});
}
}
for pattern in &self.contract_patterns {
for mat in pattern.find_iter(text) {
let match_text = mat.as_str().to_string();
let match_type = if match_text.contains("$") {
MatchType::MonetaryAmount
} else if match_text.to_lowercase().contains("agreement")
|| match_text.to_lowercase().contains("contract")
{
MatchType::ContractNumber
} else {
MatchType::PartyName
};
all_matches.push(TextMatch {
text: match_text,
position: mat.start(),
length: mat.len(),
confidence: 0.8,
match_type,
});
}
}
let confidence = if all_matches.is_empty() {
0.0
} else {
all_matches.iter().map(|m| m.confidence).sum::<f64>() / all_matches.len() as f64
};
metadata.insert("total_matches".to_string(), all_matches.len().to_string());
metadata.insert("text_length".to_string(), text.len().to_string());
let date_matches = all_matches
.iter()
.filter(|m| m.match_type == MatchType::Date)
.count();
metadata.insert("date_matches".to_string(), date_matches.to_string());
TextValidationResult {
found: !all_matches.is_empty(),
confidence,
matches: all_matches,
metadata,
}
}
pub fn extract_key_info(&self, text: &str) -> HashMap<String, Vec<String>> {
let mut extracted = HashMap::new();
let mut dates = Vec::new();
for pattern in &self.date_patterns {
for mat in pattern.find_iter(text) {
dates.push(mat.as_str().to_string());
}
}
if !dates.is_empty() {
extracted.insert("dates".to_string(), dates);
}
if let Ok(money_regex) =
Regex::new(r"\$\s*[\d,]+(?:\.\d{2})?(?:\s*(?:million|thousand|M|K))?")
{
let mut amounts = Vec::new();
for mat in money_regex.find_iter(text) {
amounts.push(mat.as_str().to_string());
}
if !amounts.is_empty() {
extracted.insert("monetary_amounts".to_string(), amounts);
}
}
if let Ok(org_regex) =
Regex::new(r"\b([A-Z][A-Za-z\s&,\.]+(?:LLC|Ltd|Corp|Corporation|Inc|Company|Co\.)\b)")
{
let mut organizations = Vec::new();
for mat in org_regex.find_iter(text) {
organizations.push(mat.as_str().to_string());
}
if !organizations.is_empty() {
extracted.insert("organizations".to_string(), organizations);
}
}
extracted
}
}
impl Default for TextValidator {
fn default() -> Self {
Self::new()
}
}
/// Scores how alike two strings are on a scale from `0.0` to `1.0`.
///
/// Identical strings (including two empty strings) score `1.0`. If exactly one
/// string is empty the score is `0.0`. Otherwise the score is the number of
/// positions at which both strings hold the same character, divided by the
/// character count of the longer string. Positions are compared pairwise from
/// the start; no alignment or edit distance is attempted.
fn calculate_string_similarity(s1: &str, s2: &str) -> f64 {
    if s1 == s2 {
        return 1.0;
    }

    let (len_a, len_b) = (s1.chars().count(), s2.chars().count());
    if len_a == 0 || len_b == 0 {
        return 0.0;
    }

    // `zip` stops at the shorter string, so only shared positions are compared.
    let agreeing = s1
        .chars()
        .zip(s2.chars())
        .filter(|(left, right)| left == right)
        .count();

    agreeing as f64 / len_a.max(len_b) as f64
}
// Unit tests for the validator's public methods and the private similarity helper.
#[cfg(test)]
mod tests {
use super::*;
// Both a "day month year" and a "month day, year" date should be detected.
#[test]
fn test_date_validation() {
let validator = TextValidator::new();
let text =
"This agreement was signed on 30 September 2016 and expires on December 31, 2020.";
let result = validator.validate_contract_text(text);
assert!(result.found);
let date_matches: Vec<_> = result
.matches
.iter()
.filter(|m| m.match_type == MatchType::Date)
.collect();
assert!(!date_matches.is_empty());
}
// A literal target that occurs once is reported once, with its original text.
#[test]
fn test_target_search() {
let validator = TextValidator::new();
let text = "The contract was executed on 30 September 2016 by both parties.";
let result = validator.search_for_target(text, "30 September 2016");
assert!(result.found);
assert_eq!(result.matches.len(), 1);
assert_eq!(result.matches[0].text, "30 September 2016");
}
// Text containing a date, an amount and organisations yields all three keys.
#[test]
fn test_key_info_extraction() {
let validator = TextValidator::new();
let text =
"Agreement between ABC Corp and XYZ LLC for $1,000,000 signed on 30 September 2016.";
let extracted = validator.extract_key_info(text);
assert!(extracted.contains_key("dates"));
assert!(extracted.contains_key("monetary_amounts"));
assert!(extracted.contains_key("organizations"));
}
// Identical strings score exactly 1.0.
#[test]
fn test_string_similarity_identical() {
let similarity = calculate_string_similarity("hello", "hello");
assert_eq!(similarity, 1.0);
}
// One empty side scores 0.0; two empty strings are identical and score 1.0.
#[test]
fn test_string_similarity_empty() {
assert_eq!(calculate_string_similarity("", "test"), 0.0);
assert_eq!(calculate_string_similarity("test", ""), 0.0);
assert_eq!(calculate_string_similarity("", ""), 1.0);
}
// A single differing character gives a score strictly between 0.5 and 1.0.
#[test]
fn test_string_similarity_partial() {
let similarity = calculate_string_similarity("hello", "hella");
assert!(similarity > 0.5);
assert!(similarity < 1.0);
}
// The score is divided by the longer length, so a short prefix scores low.
#[test]
fn test_string_similarity_different_lengths() {
let similarity = calculate_string_similarity("hi", "hello");
assert!(similarity < 0.5); }
// A target that never occurs gives an empty, zero-confidence result.
#[test]
fn test_target_search_not_found() {
let validator = TextValidator::new();
let text = "This text does not contain the target.";
let result = validator.search_for_target(text, "nonexistent phrase");
assert!(!result.found);
assert!(result.matches.is_empty());
assert_eq!(result.confidence, 0.0);
}
// Every occurrence of the target is reported, not just the first.
#[test]
fn test_target_search_multiple_occurrences() {
let validator = TextValidator::new();
let text = "The date is 2016 and year 2016 was important. Also 2016.";
let result = validator.search_for_target(text, "2016");
assert!(result.found);
assert_eq!(result.matches.len(), 3);
}
// Matching ignores case: "Hello" and "hello" both match the target "hello".
#[test]
fn test_target_search_case_insensitive() {
let validator = TextValidator::new();
let text = "Hello WORLD and hello world";
let result = validator.search_for_target(text, "hello");
assert!(result.found);
assert_eq!(result.matches.len(), 2);
}
// Text with nothing recognisable gives no matches and a "0" total in metadata.
#[test]
fn test_validate_contract_no_matches() {
let validator = TextValidator::new();
let text = "just some random text without dates or amounts";
let result = validator.validate_contract_text(text);
assert!(!result.found);
assert!(result.matches.is_empty());
assert_eq!(result.confidence, 0.0);
assert_eq!(result.metadata.get("total_matches").unwrap(), "0");
}
// Equality on MatchType compares variants and, for Custom, the label.
#[test]
fn test_match_type_variants() {
assert_eq!(MatchType::Date, MatchType::Date);
assert_eq!(MatchType::ContractNumber, MatchType::ContractNumber);
assert_eq!(MatchType::PartyName, MatchType::PartyName);
assert_eq!(MatchType::MonetaryAmount, MatchType::MonetaryAmount);
assert_eq!(MatchType::Location, MatchType::Location);
assert_eq!(
MatchType::Custom("test".to_string()),
MatchType::Custom("test".to_string())
);
assert_ne!(MatchType::Date, MatchType::ContractNumber);
}
// A validator built through Default recognises a numeric date.
#[test]
fn test_text_validator_default() {
let validator = TextValidator::default();
let result = validator.validate_contract_text("Signed on 01/01/2020");
assert!(result.found);
}
// A dollar amount is classified as MonetaryAmount.
#[test]
fn test_monetary_amount_match_type() {
let validator = TextValidator::new();
let text = "The amount is $50,000.00 payable immediately.";
let result = validator.validate_contract_text(text);
let money_matches: Vec<_> = result
.matches
.iter()
.filter(|m| m.match_type == MatchType::MonetaryAmount)
.collect();
assert!(!money_matches.is_empty());
}
// Keys are omitted from the extraction map when nothing was found for them.
#[test]
fn test_extract_key_info_no_matches() {
let validator = TextValidator::new();
let text = "Simple text with no special elements";
let extracted = validator.extract_key_info(text);
assert!(!extracted.contains_key("dates"));
assert!(!extracted.contains_key("monetary_amounts"));
assert!(!extracted.contains_key("organizations"));
}
// Validation metadata always carries the three summary keys, and text_length
// is the byte length of the input.
#[test]
fn test_validation_metadata() {
let validator = TextValidator::new();
let text = "Agreement dated 30 September 2016 for $100,000";
let result = validator.validate_contract_text(text);
assert!(result.metadata.contains_key("total_matches"));
assert!(result.metadata.contains_key("text_length"));
assert!(result.metadata.contains_key("date_matches"));
assert_eq!(
result.metadata.get("text_length").unwrap(),
&text.len().to_string()
);
}
}