use std::collections::HashMap;
use std::collections::HashSet;
/// A single question/answer tile submitted for batch validation.
#[derive(Debug, Clone, PartialEq)]
pub struct Tile {
    /// Identifier of the tile. May be empty; `TileBatch::assign_ids` fills in empty IDs.
    pub id: String,
    /// Question text; its length is checked by `BatchValidator::validate`.
    pub question: String,
    /// Answer text; its length is checked by `BatchValidator::validate`.
    pub answer: String,
    /// Domain the tile belongs to. An empty domain is a validation error.
    pub domain: String,
    /// Confidence score, compared against the validator's minimum confidence.
    // NOTE(review): presumably in 0.0..=1.0, but no upper bound is enforced anywhere — confirm.
    pub confidence: f64,
    /// Free-form tags. An empty list only produces a validation warning.
    pub tags: Vec<String>,
}
23#[derive(Debug, Clone)]
25pub struct BatchResult {
26 pub tile_id: String,
27 pub status: TileStatus,
28 pub errors: Vec<String>,
29 pub warnings: Vec<String>,
30}
/// Verdict assigned to a tile by validation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TileStatus {
    /// The tile produced neither errors nor warnings.
    Accepted,
    /// The tile produced at least one error.
    Rejected,
    /// The tile produced no errors but at least one warning.
    AcceptedWithWarnings,
    /// The tile was not validated. `BatchValidator::validate` never returns this
    /// status itself; batch code counts it separately and treats it as not accepted.
    Skipped,
}
/// Aggregate counts for one validated batch.
///
/// The field descriptions below reflect how `TileBatch::validate_batch` fills the struct.
#[derive(Debug, Clone, PartialEq, Default)]
pub struct BatchSummary {
    /// Number of tiles in the batch.
    pub total: usize,
    /// Tiles that were accepted, including those accepted with warnings.
    pub accepted: usize,
    /// Tiles that were rejected.
    pub rejected: usize,
    /// Tiles that were skipped.
    pub skipped: usize,
    /// Tiles accepted with warnings (a count of tiles, not of warning messages).
    pub warnings: usize,
    /// Number of tiles per domain, counted over every tile regardless of status.
    pub domains: HashMap<String, usize>,
    /// One entry per rejected tile, formatted as `"<tile id>: <error>; <error>"`.
    pub errors: Vec<String>,
}
52impl BatchSummary {
53 pub fn pass_rate(&self) -> f64 {
54 if self.total == 0 { return 0.0; }
55 (self.accepted + self.accepted_with_warnings()) as f64 / self.total as f64
56 }
57 pub fn accepted_with_warnings(&self) -> usize { self.warnings }
58}
/// Thresholds applied to each tile by `BatchValidator::validate`.
///
/// Falling below a minimum is an error (the tile is rejected); exceeding a maximum
/// only produces a warning.
#[derive(Debug, Clone, PartialEq)]
pub struct BatchValidator {
    /// Tiles with a confidence below this value are rejected.
    min_confidence: f64,
    /// Questions shorter than this are rejected.
    min_question_len: usize,
    /// Answers shorter than this are rejected.
    min_answer_len: usize,
    /// Questions longer than this produce a warning.
    max_question_len: usize,
    /// Answers longer than this produce a warning.
    max_answer_len: usize,
}
69impl Default for BatchValidator {
70 fn default() -> Self {
71 BatchValidator {
72 min_confidence: 0.3,
73 min_question_len: 10,
74 min_answer_len: 10,
75 max_question_len: 50000,
76 max_answer_len: 100000,
77 }
78 }
79}
81impl BatchValidator {
82 pub fn validate(&self, tile: &Tile) -> BatchResult {
83 let mut errors = Vec::new();
84 let mut warnings = Vec::new();
85
86 if tile.confidence < self.min_confidence {
87 errors.push(format!("confidence {:.2} < {:.2}", tile.confidence, self.min_confidence));
88 }
89 if tile.question.len() < self.min_question_len {
90 errors.push(format!("question {} chars < {} min", tile.question.len(), self.min_question_len));
91 }
92 if tile.answer.len() < self.min_answer_len {
93 errors.push(format!("answer {} chars < {} min", tile.answer.len(), self.min_answer_len));
94 }
95 if tile.question.len() > self.max_question_len {
96 warnings.push(format!("question {} chars > {} max (truncatable)", tile.question.len(), self.max_question_len));
97 }
98 if tile.answer.len() > self.max_answer_len {
99 warnings.push(format!("answer {} chars > {} max (truncatable)", tile.answer.len(), self.max_answer_len));
100 }
101 if tile.domain.is_empty() {
102 errors.push("empty domain".into());
103 }
104 if tile.tags.is_empty() {
105 warnings.push("no tags".into());
106 }
107
108 let status = if !errors.is_empty() { TileStatus::Rejected }
109 else if !warnings.is_empty() { TileStatus::AcceptedWithWarnings }
110 else { TileStatus::Accepted };
111
112 BatchResult { tile_id: tile.id.clone(), status, errors, warnings }
113 }
114}
/// Namespace for batch-level operations on slices of tiles.
///
/// The type carries no state; all of its operations are associated functions.
#[derive(Debug, Clone, Copy, Default)]
pub struct TileBatch;
119impl TileBatch {
120 pub fn validate_batch(tiles: &[Tile], validator: &BatchValidator) -> (Vec<BatchResult>, BatchSummary) {
122 let mut results = Vec::new();
123 let mut domains: HashMap<String, usize> = HashMap::new();
124 let mut accepted = 0usize;
125 let mut rejected = 0usize;
126 let mut skipped = 0usize;
127 let mut warnings = 0usize;
128 let mut all_errors = Vec::new();
129
130 for tile in tiles {
131 *domains.entry(tile.domain.clone()).or_insert(0) += 1;
132 let result = validator.validate(tile);
133 match result.status {
134 TileStatus::Accepted => accepted += 1,
135 TileStatus::AcceptedWithWarnings => { accepted += 1; warnings += 1; }
136 TileStatus::Rejected => rejected += 1,
137 TileStatus::Skipped => skipped += 1,
138 }
139 if result.status == TileStatus::Rejected {
140 all_errors.push(format!("{}: {}", tile.id, result.errors.join("; ")));
141 }
142 results.push(result);
143 }
144
145 let summary = BatchSummary {
146 total: tiles.len(), accepted, rejected, skipped, warnings, domains, errors: all_errors,
147 };
148
149 (results, summary)
150 }
151
152 pub fn filter_by_domain<'a>(tiles: &'a [Tile], domain: &str) -> Vec<&'a Tile> {
154 tiles.iter().filter(|t| t.domain == domain).collect()
155 }
156
157 pub fn partition<'a>(results: &[BatchResult], tiles: &'a [Tile]) -> (Vec<&'a Tile>, Vec<&'a Tile>) {
159 let mut accepted = Vec::new();
160 let mut rejected = Vec::new();
161 for (r, t) in results.iter().zip(tiles.iter()) {
162 match r.status {
163 TileStatus::Accepted | TileStatus::AcceptedWithWarnings => accepted.push(t),
164 TileStatus::Rejected | TileStatus::Skipped => rejected.push(t),
165 }
166 }
167 (accepted, rejected)
168 }
169
170 pub fn dedup(tiles: &[Tile]) -> Vec<&Tile> {
172 let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new();
173 tiles.iter().filter(|t| {
174 let key = (t.question.clone(), t.answer.clone());
175 seen.insert(key)
176 }).collect()
177 }
178
179 pub fn assign_ids(tiles: &mut [Tile], prefix: &str) {
181 for (i, tile) in tiles.iter_mut().enumerate() {
182 if tile.id.is_empty() {
183 tile.id = format!("{}-{}", prefix, i + 1);
184 }
185 }
186 }
187
188 pub fn quick_stats(tiles: &[Tile]) -> BatchStats {
190 if tiles.is_empty() {
191 return BatchStats::default();
192 }
193 let total = tiles.len();
194 let avg_confidence: f64 = tiles.iter().map(|t| t.confidence).sum::<f64>() / total as f64;
195 let min_conf = tiles.iter().map(|t| t.confidence).fold(f64::MAX, f64::min);
196 let max_conf = tiles.iter().map(|t| t.confidence).fold(f64::MIN, f64::max);
197 let avg_q_len: f64 = tiles.iter().map(|t| t.question.len() as f64).sum::<f64>() / total as f64;
198 let avg_a_len: f64 = tiles.iter().map(|t| t.answer.len() as f64).sum::<f64>() / total as f64;
199 let mut domains: HashMap<String, usize> = HashMap::new();
200 for t in tiles { *domains.entry(t.domain.clone()).or_insert(0) += 1; }
201 BatchStats { total, avg_confidence, min_confidence: min_conf, max_confidence: max_conf, avg_question_len: avg_q_len, avg_answer_len: avg_a_len, domain_count: domains.len(), domains }
202 }
203}
/// Descriptive statistics over a batch of tiles, as produced by `TileBatch::quick_stats`.
///
/// The `Default` value (all zeros, no domains) is what an empty batch yields.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct BatchStats {
    /// Number of tiles in the batch.
    pub total: usize,
    /// Arithmetic mean of the tiles' confidence values.
    pub avg_confidence: f64,
    /// Smallest confidence value in the batch.
    pub min_confidence: f64,
    /// Largest confidence value in the batch.
    pub max_confidence: f64,
    /// Mean question length, in UTF-8 bytes.
    pub avg_question_len: f64,
    /// Mean answer length, in UTF-8 bytes.
    pub avg_answer_len: f64,
    /// Number of distinct domains; equals `domains.len()`.
    pub domain_count: usize,
    /// Number of tiles per domain.
    pub domains: HashMap<String, usize>,
}
/// Test helper: builds a [`Tile`] from borrowed strings.
///
/// Compiled only for test builds because nothing outside the test module calls it;
/// without the gate it triggers a dead-code warning in normal builds.
#[cfg(test)]
fn make_tile(id: &str, q: &str, a: &str, domain: &str, conf: f64, tags: Vec<&str>) -> Tile {
    Tile {
        id: id.into(),
        question: q.into(),
        answer: a.into(),
        domain: domain.into(),
        confidence: conf,
        tags: tags.into_iter().map(String::from).collect(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // --- BatchValidator::validate -------------------------------------------------

    #[test]
    fn test_validate_good_tile() {
        let v = BatchValidator::default();
        let t = make_tile(
            "t1",
            "What is PLATO?",
            "Training pipeline for agents.",
            "plato",
            0.9,
            vec!["training", "pipeline"],
        );
        let r = v.validate(&t);
        assert_eq!(r.status, TileStatus::Accepted);
        assert!(r.errors.is_empty());
    }

    #[test]
    fn test_validate_low_confidence() {
        let v = BatchValidator::default();
        let t = make_tile("t2", "What is PLATO?", "Training pipeline.", "plato", 0.1, vec![]);
        let r = v.validate(&t);
        assert_eq!(r.status, TileStatus::Rejected);
        assert!(r.errors.iter().any(|e| e.contains("confidence")));
    }

    #[test]
    fn test_validate_short_content() {
        let v = BatchValidator::default();
        let t = make_tile("t3", "Short", "Short", "x", 0.9, vec![]);
        let r = v.validate(&t);
        assert_eq!(r.status, TileStatus::Rejected);
    }

    #[test]
    fn test_validate_empty_domain() {
        let v = BatchValidator::default();
        let t = make_tile("t4", "What is flux?", "Bytecode runtime for agents.", "", 0.9, vec![]);
        let r = v.validate(&t);
        assert!(r.errors.iter().any(|e| e.contains("domain")));
    }

    #[test]
    fn test_validate_warnings() {
        let v = BatchValidator::default();
        let t = make_tile(
            "t5",
            "What is constraint theory?",
            "Geometric snapping for deterministic computation across all machines.",
            "ct",
            0.9,
            vec![],
        );
        let r = v.validate(&t);
        assert_eq!(r.status, TileStatus::AcceptedWithWarnings);
        assert!(r.warnings.iter().any(|w| w.contains("tags")));
    }

    // --- TileBatch ----------------------------------------------------------------

    #[test]
    fn test_batch_validate_100() {
        // 90 well-formed tiles followed by 10 that fail several checks at once.
        let tiles: Vec<Tile> = (0..100)
            .map(|i| {
                if i < 90 {
                    make_tile(
                        &format!("g{}", i),
                        &format!("Question number {} about PLATO and tiles", i),
                        &format!("Answer number {} describing the training pipeline", i),
                        "plato",
                        0.9,
                        vec!["test"],
                    )
                } else {
                    make_tile(&format!("b{}", i), "Short", "Short", "x", 0.1, vec![])
                }
            })
            .collect();
        let (results, summary) = TileBatch::validate_batch(&tiles, &BatchValidator::default());
        assert_eq!(results.len(), 100);
        assert_eq!(summary.total, 100);
        assert_eq!(summary.accepted, 90);
        assert_eq!(summary.rejected, 10);
        assert!(summary.pass_rate() > 0.89);
        assert_eq!(summary.domains.get("plato").copied().unwrap_or(0), 90);
    }

    #[test]
    fn test_filter_by_domain() {
        let tiles = vec![
            make_tile("t1", "Q1 about PLATO?", "A1 about PLATO.", "plato", 0.9, vec![]),
            make_tile("t2", "Q2 about flux?", "A2 about flux.", "flux", 0.9, vec![]),
            make_tile("t3", "Q3 about PLATO?", "A3 about PLATO.", "plato", 0.8, vec![]),
        ];
        let plato = TileBatch::filter_by_domain(&tiles, "plato");
        assert_eq!(plato.len(), 2);
    }

    #[test]
    fn test_partition() {
        let tiles = vec![
            make_tile(
                "g1",
                "Good question about tiles",
                "Good answer about tiles",
                "plato",
                0.9,
                vec!["test"],
            ),
            make_tile("b1", "Short", "Short", "x", 0.1, vec![]),
        ];
        let (results, _) = TileBatch::validate_batch(&tiles, &BatchValidator::default());
        let (accepted, rejected) = TileBatch::partition(&results, &tiles);
        assert_eq!(accepted.len(), 1);
        assert_eq!(rejected.len(), 1);
    }

    #[test]
    fn test_dedup() {
        let tiles = vec![
            make_tile("t1", "Q?", "A.", "x", 0.9, vec![]),
            make_tile("t2", "Q?", "A.", "x", 0.9, vec![]),
            make_tile("t3", "Different Q", "Different A", "x", 0.9, vec![]),
        ];
        let deduped = TileBatch::dedup(&tiles);
        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn test_assign_ids() {
        let mut tiles = vec![
            Tile {
                id: String::new(),
                question: "Q".into(),
                answer: "A".into(),
                domain: "x".into(),
                confidence: 0.9,
                tags: vec![],
            },
            Tile {
                id: "existing".into(),
                question: "Q".into(),
                answer: "A".into(),
                domain: "x".into(),
                confidence: 0.9,
                tags: vec![],
            },
        ];
        TileBatch::assign_ids(&mut tiles, "batch");
        assert_eq!(tiles[0].id, "batch-1");
        assert_eq!(tiles[1].id, "existing");
    }

    #[test]
    fn test_quick_stats() {
        let tiles = vec![
            make_tile("t1", "Q1 about PLATO?", "A1 about PLATO.", "plato", 0.9, vec![]),
            make_tile("t2", "Q2 about flux?", "A2 about flux.", "flux", 0.8, vec![]),
            make_tile("t3", "Q3 about tiles?", "A3 about tiles.", "plato", 0.7, vec![]),
        ];
        let stats = TileBatch::quick_stats(&tiles);
        assert_eq!(stats.total, 3);
        assert_eq!(stats.domain_count, 2);
        assert!((stats.avg_confidence - 0.8).abs() < 0.01);
    }

    #[test]
    fn test_quick_stats_empty() {
        let stats = TileBatch::quick_stats(&[]);
        assert_eq!(stats.total, 0);
    }

    #[test]
    fn test_summary_errors_list() {
        let tiles = vec![
            make_tile("bad1", "Q?", "A.", "", 0.1, vec![]),
            make_tile("bad2", "Tiny", "Tiny", "x", 0.1, vec![]),
        ];
        let (_, summary) = TileBatch::validate_batch(&tiles, &BatchValidator::default());
        assert_eq!(summary.rejected, 2);
        assert_eq!(summary.errors.len(), 2);
    }
}