use crate::error::EvalResult;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
/// One enriched field value submitted for quality evaluation.
#[derive(Debug, Clone)]
pub struct EnrichedFieldData {
    /// Identifier of the field this value belongs to.
    ///
    /// NOTE(review): not read by the evaluator in this file; presumably kept
    /// for callers that group or report by field — confirm.
    pub field_name: String,
    /// The generated text itself; this is the value the evaluator inspects.
    pub text_value: String,
    /// Optional accompanying context.
    ///
    /// NOTE(review): not read by the evaluator in this file; its format is
    /// not established here.
    pub structured_context: Option<String>,
}
/// Pass/fail limits applied to the rates computed by the enrichment quality
/// evaluator.
///
/// Each value is compared against a rate expressed as a fraction
/// (count divided by count), not a percentage.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnrichmentThresholds {
    /// Lowest acceptable non-empty rate; an issue is recorded when the
    /// observed rate is strictly below this value.
    pub min_non_empty_rate: f64,
    /// Lowest acceptable unique-text rate; an issue is recorded when the
    /// observed rate is strictly below this value.
    pub min_unique_rate: f64,
    /// Highest acceptable suspicious-pattern rate; an issue is recorded when
    /// the observed rate is strictly above this value.
    pub max_suspicious_rate: f64,
}
impl Default for EnrichmentThresholds {
    /// Returns the stock limits: at least 95% non-empty, at least 80% unique,
    /// and at most 5% suspicious.
    fn default() -> Self {
        Self {
            min_non_empty_rate: 0.95,
            min_unique_rate: 0.80,
            max_suspicious_rate: 0.05,
        }
    }
}
/// Substrings that mark a text value as likely filler or test content.
///
/// Every entry is lowercase: the evaluator lowercases each text value before
/// checking whether it contains any of these, so matching is effectively
/// case-insensitive and not limited to whole words.
const SUSPICIOUS_PATTERNS: &[&str] = &[
    "lorem ipsum",
    "placeholder",
    "todo",
    "test data",
    "sample text",
    "n/a",
    "tbd",
    "xxx",
    "abc123",
    "asdf",
];
/// Metrics and verdict produced by evaluating a batch of enriched fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EnrichmentQualityEvaluation {
    /// `non_empty_count / total_fields`; reported as `1.0` for an empty batch.
    pub non_empty_rate: f64,
    /// Distinct text values divided by `non_empty_count`; reported as `1.0`
    /// when there are no non-empty fields.
    pub unique_text_rate: f64,
    /// `suspicious_count / non_empty_count`; reported as `0.0` when there are
    /// no non-empty fields.
    pub suspicious_pattern_rate: f64,
    /// Mean text length over the non-empty fields; `0.0` when there are none.
    pub avg_text_length: f64,
    /// Number of fields that were submitted for evaluation.
    pub total_fields: usize,
    /// Number of fields whose text is not blank after trimming whitespace.
    pub non_empty_count: usize,
    /// Number of non-empty fields containing at least one suspicious pattern.
    pub suspicious_count: usize,
    /// `true` exactly when `issues` is empty.
    pub passes: bool,
    /// One human-readable message per threshold that was violated.
    pub issues: Vec<String>,
}
/// Checks batches of enriched field text against a set of
/// [`EnrichmentThresholds`].
///
/// `Debug` and `Clone` are derived so the evaluator can be logged and copied
/// like the other public types in this module; its only state is the
/// thresholds, which already support both.
#[derive(Debug, Clone)]
pub struct EnrichmentQualityEvaluator {
    /// Limits every evaluation is judged against.
    thresholds: EnrichmentThresholds,
}
impl EnrichmentQualityEvaluator {
    /// Creates an evaluator that uses [`EnrichmentThresholds::default`].
    pub fn new() -> Self {
        Self {
            thresholds: EnrichmentThresholds::default(),
        }
    }

    /// Creates an evaluator that judges batches against `thresholds`.
    pub fn with_thresholds(thresholds: EnrichmentThresholds) -> Self {
        Self { thresholds }
    }

    /// Computes quality metrics for `fields` and compares them with this
    /// evaluator's thresholds.
    ///
    /// A field is *non-empty* when its `text_value` still has content after
    /// trimming whitespace. All other metrics are computed over the non-empty
    /// fields only:
    ///
    /// * uniqueness compares the raw (untrimmed) text values;
    /// * a field is *suspicious* when its lowercased text contains any entry
    ///   of `SUSPICIOUS_PATTERNS` as a substring;
    /// * average length is counted in `char`s (Unicode scalar values), so
    ///   multi-byte text is not over-reported relative to ASCII text.
    ///
    /// An empty `fields` slice yields a passing result with neutral rates,
    /// since there is nothing to judge.
    ///
    /// # Errors
    ///
    /// This implementation always returns `Ok`; the `EvalResult` return type
    /// is part of the method's signature and is kept for callers.
    pub fn evaluate(
        &self,
        fields: &[EnrichedFieldData],
    ) -> EvalResult<EnrichmentQualityEvaluation> {
        let total = fields.len();
        if total == 0 {
            // Vacuous pass: avoids dividing by zero and reports no issues.
            return Ok(EnrichmentQualityEvaluation {
                non_empty_rate: 1.0,
                unique_text_rate: 1.0,
                suspicious_pattern_rate: 0.0,
                avg_text_length: 0.0,
                total_fields: 0,
                non_empty_count: 0,
                suspicious_count: 0,
                passes: true,
                issues: Vec::new(),
            });
        }

        let non_empty: Vec<&EnrichedFieldData> = fields
            .iter()
            .filter(|f| !f.text_value.trim().is_empty())
            .collect();
        let non_empty_count = non_empty.len();
        let non_empty_rate = non_empty_count as f64 / total as f64;

        let unique_texts: HashSet<&str> =
            non_empty.iter().map(|f| f.text_value.as_str()).collect();
        // Every field may be blank even though `total > 0`, so each ratio
        // below still has to guard against a zero denominator.
        let unique_text_rate = if non_empty_count > 0 {
            unique_texts.len() as f64 / non_empty_count as f64
        } else {
            1.0
        };

        let suspicious_count = non_empty
            .iter()
            .filter(|f| {
                // Patterns are stored lowercase; lowercase the text once per
                // field so the substring match ignores case.
                let lower = f.text_value.to_lowercase();
                SUSPICIOUS_PATTERNS
                    .iter()
                    .any(|&pattern| lower.contains(pattern))
            })
            .count();
        let suspicious_pattern_rate = if non_empty_count > 0 {
            suspicious_count as f64 / non_empty_count as f64
        } else {
            0.0
        };

        // `str::len` is the UTF-8 byte length, which inflates the average for
        // non-ASCII text; count characters instead.
        let total_chars: usize = non_empty
            .iter()
            .map(|f| f.text_value.chars().count())
            .sum();
        let avg_text_length = if non_empty_count > 0 {
            total_chars as f64 / non_empty_count as f64
        } else {
            0.0
        };

        let mut issues = Vec::new();
        if non_empty_rate < self.thresholds.min_non_empty_rate {
            issues.push(format!(
                "Non-empty rate {:.3} < {:.3}",
                non_empty_rate, self.thresholds.min_non_empty_rate
            ));
        }
        if unique_text_rate < self.thresholds.min_unique_rate {
            issues.push(format!(
                "Unique text rate {:.3} < {:.3}",
                unique_text_rate, self.thresholds.min_unique_rate
            ));
        }
        if suspicious_pattern_rate > self.thresholds.max_suspicious_rate {
            issues.push(format!(
                "Suspicious pattern rate {:.3} > {:.3}",
                suspicious_pattern_rate, self.thresholds.max_suspicious_rate
            ));
        }

        let passes = issues.is_empty();
        Ok(EnrichmentQualityEvaluation {
            non_empty_rate,
            unique_text_rate,
            suspicious_pattern_rate,
            avg_text_length,
            total_fields: total,
            non_empty_count,
            suspicious_count,
            passes,
            issues,
        })
    }
}
impl Default for EnrichmentQualityEvaluator {
    /// Equivalent to [`EnrichmentQualityEvaluator::new`].
    fn default() -> Self {
        Self::new()
    }
}
#[cfg(test)]
// Unwrapping is acceptable in tests: a failed evaluation should abort the test.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Builds a field with the given name and text and no structured context.
    fn field(name: &str, text: &str) -> EnrichedFieldData {
        EnrichedFieldData {
            field_name: name.to_string(),
            text_value: text.to_string(),
            structured_context: None,
        }
    }

    /// Distinct, realistic descriptions should pass every threshold.
    #[test]
    fn test_good_enrichment() {
        let fields = [
            field("description", "Office supplies for Q1 2024 operations"),
            field("description", "IT equipment maintenance contract renewal"),
        ];

        let result = EnrichmentQualityEvaluator::new()
            .evaluate(&fields)
            .unwrap();

        assert!(result.passes);
        assert_eq!(result.non_empty_rate, 1.0);
        assert_eq!(result.unique_text_rate, 1.0);
    }

    /// Filler text is detected regardless of letter case and fails the batch.
    #[test]
    fn test_suspicious_patterns() {
        let fields = [
            field("desc", "Lorem ipsum dolor sit amet"),
            field("desc", "This is placeholder text for testing"),
        ];

        let result = EnrichmentQualityEvaluator::new()
            .evaluate(&fields)
            .unwrap();

        assert!(!result.passes);
        assert_eq!(result.suspicious_count, 2);
    }

    /// Ten copies of one text give a unique rate of 1/10, which fails.
    #[test]
    fn test_all_duplicate_text() {
        let fields: Vec<EnrichedFieldData> = vec![field("desc", "Same text everywhere"); 10];

        let result = EnrichmentQualityEvaluator::new()
            .evaluate(&fields)
            .unwrap();

        assert!(!result.passes);
        assert!((result.unique_text_rate - 0.1).abs() < 0.01);
    }

    /// An empty batch passes.
    #[test]
    fn test_empty() {
        let result = EnrichmentQualityEvaluator::new().evaluate(&[]).unwrap();

        assert!(result.passes);
    }
}