datasynth_eval/enrichment/
mod.rs1use crate::error::EvalResult;
7use serde::{Deserialize, Serialize};
8use std::collections::HashSet;
9
/// One enriched text field submitted for quality evaluation.
///
/// The evaluator in this module inspects only [`text_value`](Self::text_value);
/// the remaining members are carried alongside it for the caller's benefit.
#[derive(Debug, Clone)]
pub struct EnrichedFieldData {
    /// Name of the field the text belongs to.
    pub field_name: String,
    /// The enriched text itself.
    pub text_value: String,
    /// Optional structured context associated with the text, if any.
    pub structured_context: Option<String>,
}
20
21#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct EnrichmentThresholds {
24 pub min_non_empty_rate: f64,
26 pub min_unique_rate: f64,
28 pub max_suspicious_rate: f64,
30}
31
32impl Default for EnrichmentThresholds {
33 fn default() -> Self {
34 Self {
35 min_non_empty_rate: 0.95,
36 min_unique_rate: 0.80,
37 max_suspicious_rate: 0.05,
38 }
39 }
40}
41
/// Lowercase substrings that mark a text value as likely filler.
///
/// The evaluator lowercases each text before searching it for these entries,
/// so every entry here must itself be lowercase.
const SUSPICIOUS_PATTERNS: &[&str] = &[
    // Filler phrases.
    "lorem ipsum",
    "placeholder",
    "todo",
    "test data",
    "sample text",
    // Short "not filled in" markers and keyboard mashing.
    "n/a",
    "tbd",
    "xxx",
    "abc123",
    "asdf",
];
55
56#[derive(Debug, Clone, Serialize, Deserialize)]
58pub struct EnrichmentQualityEvaluation {
59 pub non_empty_rate: f64,
61 pub unique_text_rate: f64,
63 pub suspicious_pattern_rate: f64,
65 pub avg_text_length: f64,
67 pub total_fields: usize,
69 pub non_empty_count: usize,
71 pub suspicious_count: usize,
73 pub passes: bool,
75 pub issues: Vec<String>,
77}
78
79pub struct EnrichmentQualityEvaluator {
81 thresholds: EnrichmentThresholds,
82}
83
84impl EnrichmentQualityEvaluator {
85 pub fn new() -> Self {
87 Self {
88 thresholds: EnrichmentThresholds::default(),
89 }
90 }
91
92 pub fn with_thresholds(thresholds: EnrichmentThresholds) -> Self {
94 Self { thresholds }
95 }
96
97 pub fn evaluate(
99 &self,
100 fields: &[EnrichedFieldData],
101 ) -> EvalResult<EnrichmentQualityEvaluation> {
102 let mut issues = Vec::new();
103 let total = fields.len();
104
105 if total == 0 {
106 return Ok(EnrichmentQualityEvaluation {
107 non_empty_rate: 1.0,
108 unique_text_rate: 1.0,
109 suspicious_pattern_rate: 0.0,
110 avg_text_length: 0.0,
111 total_fields: 0,
112 non_empty_count: 0,
113 suspicious_count: 0,
114 passes: true,
115 issues: Vec::new(),
116 });
117 }
118
119 let non_empty: Vec<&EnrichedFieldData> = fields
121 .iter()
122 .filter(|f| !f.text_value.trim().is_empty())
123 .collect();
124 let non_empty_count = non_empty.len();
125 let non_empty_rate = non_empty_count as f64 / total as f64;
126
127 let unique_texts: HashSet<&str> = non_empty.iter().map(|f| f.text_value.as_str()).collect();
129 let unique_text_rate = if non_empty_count > 0 {
130 unique_texts.len() as f64 / non_empty_count as f64
131 } else {
132 1.0
133 };
134
135 let suspicious_count = non_empty
137 .iter()
138 .filter(|f| {
139 let lower = f.text_value.to_lowercase();
140 SUSPICIOUS_PATTERNS
141 .iter()
142 .any(|pattern| lower.contains(pattern))
143 })
144 .count();
145 let suspicious_pattern_rate = if non_empty_count > 0 {
146 suspicious_count as f64 / non_empty_count as f64
147 } else {
148 0.0
149 };
150
151 let total_length: usize = non_empty.iter().map(|f| f.text_value.len()).sum();
153 let avg_text_length = if non_empty_count > 0 {
154 total_length as f64 / non_empty_count as f64
155 } else {
156 0.0
157 };
158
159 if non_empty_rate < self.thresholds.min_non_empty_rate {
161 issues.push(format!(
162 "Non-empty rate {:.3} < {:.3}",
163 non_empty_rate, self.thresholds.min_non_empty_rate
164 ));
165 }
166 if unique_text_rate < self.thresholds.min_unique_rate {
167 issues.push(format!(
168 "Unique text rate {:.3} < {:.3}",
169 unique_text_rate, self.thresholds.min_unique_rate
170 ));
171 }
172 if suspicious_pattern_rate > self.thresholds.max_suspicious_rate {
173 issues.push(format!(
174 "Suspicious pattern rate {:.3} > {:.3}",
175 suspicious_pattern_rate, self.thresholds.max_suspicious_rate
176 ));
177 }
178
179 let passes = issues.is_empty();
180
181 Ok(EnrichmentQualityEvaluation {
182 non_empty_rate,
183 unique_text_rate,
184 suspicious_pattern_rate,
185 avg_text_length,
186 total_fields: total,
187 non_empty_count,
188 suspicious_count,
189 passes,
190 issues,
191 })
192 }
193}
194
195impl Default for EnrichmentQualityEvaluator {
196 fn default() -> Self {
197 Self::new()
198 }
199}
200
#[cfg(test)]
// Unwrapping is acceptable in tests: a failed unwrap is a failed test.
#[allow(clippy::unwrap_used)]
mod tests {
    use super::*;

    /// Builds a fixture field with the given name and text and no structured context.
    fn field(name: &str, text: &str) -> EnrichedFieldData {
        EnrichedFieldData {
            field_name: name.to_string(),
            text_value: text.to_string(),
            structured_context: None,
        }
    }

    /// Evaluates `fields` with a default-configured evaluator.
    fn evaluate_with_defaults(fields: &[EnrichedFieldData]) -> EnrichmentQualityEvaluation {
        EnrichmentQualityEvaluator::new().evaluate(fields).unwrap()
    }

    #[test]
    fn test_good_enrichment() {
        let fields = [
            field("description", "Office supplies for Q1 2024 operations"),
            field("description", "IT equipment maintenance contract renewal"),
        ];

        let result = evaluate_with_defaults(&fields);
        assert!(result.passes);
        assert_eq!(result.non_empty_rate, 1.0);
        assert_eq!(result.unique_text_rate, 1.0);
    }

    #[test]
    fn test_suspicious_patterns() {
        let fields = [
            field("desc", "Lorem ipsum dolor sit amet"),
            field("desc", "This is placeholder text for testing"),
        ];

        let result = evaluate_with_defaults(&fields);
        assert!(!result.passes);
        assert_eq!(result.suspicious_count, 2);
    }

    #[test]
    fn test_all_duplicate_text() {
        // Ten copies of the same field: one distinct text out of ten.
        let fields = vec![field("desc", "Same text everywhere"); 10];

        let result = evaluate_with_defaults(&fields);
        assert!(!result.passes);
        assert!((result.unique_text_rate - 0.1).abs() < 0.01);
    }

    #[test]
    fn test_empty() {
        let result = evaluate_with_defaults(&[]);
        assert!(result.passes);
    }
}