Skip to main content

plato_tile_batch/
lib.rs

//! plato-tile-batch — Batch Tile Processing
//!
//! Import, export, transform, and validate tiles in bulk.
//! Designed for Oracle1's 2,000+ tile exports and JC1's living knowledge dumps.
//!
//! ## Why
//! Single-tile operations are for interactive use. The fleet needs to process
//! thousands of tiles at once — from exports, from training runs, from audits.
9
10use std::collections::HashMap;
11
/// A minimal tile for batch processing.
///
/// A tile pairs a question with its answer and carries the metadata that the
/// batch validator inspects: a domain label, a confidence score and tags.
#[derive(Debug, Clone)]
pub struct Tile {
    /// Identifier of the tile. May be empty; `TileBatch::assign_ids` fills
    /// in empty identifiers.
    pub id: String,
    /// Question text.
    pub question: String,
    /// Answer text.
    pub answer: String,
    /// Domain label used for grouping and filtering; an empty domain is a
    /// validation error.
    pub domain: String,
    /// Confidence score compared against the validator's minimum.
    pub confidence: f64,
    /// Free-form tags; an empty list only produces a validation warning.
    pub tags: Vec<String>,
}
22
23/// Result of processing a single tile in a batch.
24#[derive(Debug, Clone)]
25pub struct BatchResult {
26    pub tile_id: String,
27    pub status: TileStatus,
28    pub errors: Vec<String>,
29    pub warnings: Vec<String>,
30}
31
/// Verdict assigned to a tile by batch validation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TileStatus {
    /// No errors and no warnings.
    Accepted,
    /// At least one error.
    Rejected,
    /// No errors, but at least one warning.
    AcceptedWithWarnings,
    /// The tile was not validated. `BatchValidator::validate` never returns
    /// this status; it is counted separately in the batch summary.
    Skipped,
}
39
/// Result of an entire batch operation.
#[derive(Debug, Clone)]
pub struct BatchSummary {
    /// Number of tiles in the batch.
    pub total: usize,
    /// Tiles that passed validation. As filled in by
    /// `TileBatch::validate_batch`, this already includes tiles that were
    /// accepted with warnings.
    pub accepted: usize,
    /// Tiles that failed validation.
    pub rejected: usize,
    /// Tiles whose status was `TileStatus::Skipped`.
    pub skipped: usize,
    /// Tiles accepted with at least one warning (a subset of `accepted`).
    pub warnings: usize,
    /// Number of tiles per domain, counted over every tile in the batch.
    pub domains: HashMap<String, usize>,
    /// One `"<tile id>: <errors joined by "; ">"` entry per rejected tile.
    pub errors: Vec<String>,
}

impl BatchSummary {
    /// Fraction of the batch that was accepted (with or without warnings).
    ///
    /// Returns `0.0` for an empty batch.
    pub fn pass_rate(&self) -> f64 {
        if self.total == 0 {
            return 0.0;
        }
        // `accepted` already contains the warned tiles; adding `warnings` on
        // top would count them twice and could push the rate above 1.0.
        self.accepted as f64 / self.total as f64
    }

    /// Number of tiles that were accepted with at least one warning.
    pub fn accepted_with_warnings(&self) -> usize {
        self.warnings
    }
}
59
60/// Validation rules for batch processing.
61pub struct BatchValidator {
62    min_confidence: f64,
63    min_question_len: usize,
64    min_answer_len: usize,
65    max_question_len: usize,
66    max_answer_len: usize,
67}
68
69impl Default for BatchValidator {
70    fn default() -> Self {
71        BatchValidator {
72            min_confidence: 0.3,
73            min_question_len: 10,
74            min_answer_len: 10,
75            max_question_len: 50000,
76            max_answer_len: 100000,
77        }
78    }
79}
80
81impl BatchValidator {
82    pub fn validate(&self, tile: &Tile) -> BatchResult {
83        let mut errors = Vec::new();
84        let mut warnings = Vec::new();
85
86        if tile.confidence < self.min_confidence {
87            errors.push(format!("confidence {:.2} < {:.2}", tile.confidence, self.min_confidence));
88        }
89        if tile.question.len() < self.min_question_len {
90            errors.push(format!("question {} chars < {} min", tile.question.len(), self.min_question_len));
91        }
92        if tile.answer.len() < self.min_answer_len {
93            errors.push(format!("answer {} chars < {} min", tile.answer.len(), self.min_answer_len));
94        }
95        if tile.question.len() > self.max_question_len {
96            warnings.push(format!("question {} chars > {} max (truncatable)", tile.question.len(), self.max_question_len));
97        }
98        if tile.answer.len() > self.max_answer_len {
99            warnings.push(format!("answer {} chars > {} max (truncatable)", tile.answer.len(), self.max_answer_len));
100        }
101        if tile.domain.is_empty() {
102            errors.push("empty domain".into());
103        }
104        if tile.tags.is_empty() {
105            warnings.push("no tags".into());
106        }
107
108        let status = if !errors.is_empty() { TileStatus::Rejected }
109        else if !warnings.is_empty() { TileStatus::AcceptedWithWarnings }
110        else { TileStatus::Accepted };
111
112        BatchResult { tile_id: tile.id.clone(), status, errors, warnings }
113    }
114}
115
116/// Batch processor.
117pub struct TileBatch;
118
119impl TileBatch {
120    /// Process a batch of tiles through validation.
121    pub fn validate_batch(tiles: &[Tile], validator: &BatchValidator) -> (Vec<BatchResult>, BatchSummary) {
122        let mut results = Vec::new();
123        let mut domains: HashMap<String, usize> = HashMap::new();
124        let mut accepted = 0usize;
125        let mut rejected = 0usize;
126        let mut skipped = 0usize;
127        let mut warnings = 0usize;
128        let mut all_errors = Vec::new();
129
130        for tile in tiles {
131            *domains.entry(tile.domain.clone()).or_insert(0) += 1;
132            let result = validator.validate(tile);
133            match result.status {
134                TileStatus::Accepted => accepted += 1,
135                TileStatus::AcceptedWithWarnings => { accepted += 1; warnings += 1; }
136                TileStatus::Rejected => rejected += 1,
137                TileStatus::Skipped => skipped += 1,
138            }
139            if result.status == TileStatus::Rejected {
140                all_errors.push(format!("{}: {}", tile.id, result.errors.join("; ")));
141            }
142            results.push(result);
143        }
144
145        let summary = BatchSummary {
146            total: tiles.len(), accepted, rejected, skipped, warnings, domains, errors: all_errors,
147        };
148
149        (results, summary)
150    }
151
152    /// Filter tiles by domain.
153    pub fn filter_by_domain<'a>(tiles: &'a [Tile], domain: &str) -> Vec<&'a Tile> {
154        tiles.iter().filter(|t| t.domain == domain).collect()
155    }
156
157    /// Partition tiles into accepted and rejected.
158    pub fn partition<'a>(results: &[BatchResult], tiles: &'a [Tile]) -> (Vec<&'a Tile>, Vec<&'a Tile>) {
159        let mut accepted = Vec::new();
160        let mut rejected = Vec::new();
161        for (r, t) in results.iter().zip(tiles.iter()) {
162            match r.status {
163                TileStatus::Accepted | TileStatus::AcceptedWithWarnings => accepted.push(t),
164                TileStatus::Rejected | TileStatus::Skipped => rejected.push(t),
165            }
166        }
167        (accepted, rejected)
168    }
169
170    /// Deduplicate tiles by question+answer exact match.
171    pub fn dedup(tiles: &[Tile]) -> Vec<&Tile> {
172        let mut seen: std::collections::HashSet<(String, String)> = std::collections::HashSet::new();
173        tiles.iter().filter(|t| {
174            let key = (t.question.clone(), t.answer.clone());
175            seen.insert(key)
176        }).collect()
177    }
178
179    /// Assign sequential IDs to tiles that have empty IDs.
180    pub fn assign_ids(tiles: &mut [Tile], prefix: &str) {
181        for (i, tile) in tiles.iter_mut().enumerate() {
182            if tile.id.is_empty() {
183                tile.id = format!("{}-{}", prefix, i + 1);
184            }
185        }
186    }
187
188    /// Compute batch statistics without full validation.
189    pub fn quick_stats(tiles: &[Tile]) -> BatchStats {
190        if tiles.is_empty() {
191            return BatchStats::default();
192        }
193        let total = tiles.len();
194        let avg_confidence: f64 = tiles.iter().map(|t| t.confidence).sum::<f64>() / total as f64;
195        let min_conf = tiles.iter().map(|t| t.confidence).fold(f64::MAX, f64::min);
196        let max_conf = tiles.iter().map(|t| t.confidence).fold(f64::MIN, f64::max);
197        let avg_q_len: f64 = tiles.iter().map(|t| t.question.len() as f64).sum::<f64>() / total as f64;
198        let avg_a_len: f64 = tiles.iter().map(|t| t.answer.len() as f64).sum::<f64>() / total as f64;
199        let mut domains: HashMap<String, usize> = HashMap::new();
200        for t in tiles { *domains.entry(t.domain.clone()).or_insert(0) += 1; }
201        BatchStats { total, avg_confidence, min_confidence: min_conf, max_confidence: max_conf, avg_question_len: avg_q_len, avg_answer_len: avg_a_len, domain_count: domains.len(), domains }
202    }
203}
204
/// Aggregate statistics over a batch, as computed by `TileBatch::quick_stats`.
///
/// The `Default` value (all zeros, no domains) is what an empty batch yields.
#[derive(Debug, Clone, Default)]
pub struct BatchStats {
    /// Number of tiles in the batch.
    pub total: usize,
    /// Arithmetic mean of the tiles' confidence values.
    pub avg_confidence: f64,
    /// Smallest confidence in the batch.
    pub min_confidence: f64,
    /// Largest confidence in the batch.
    pub max_confidence: f64,
    /// Mean question length, measured with `str::len`.
    pub avg_question_len: f64,
    /// Mean answer length, measured with `str::len`.
    pub avg_answer_len: f64,
    /// Number of distinct domains (equals `domains.len()`).
    pub domain_count: usize,
    /// Number of tiles per domain.
    pub domains: HashMap<String, usize>,
}
216
/// Test helper: build a [`Tile`] from borrowed string pieces.
///
/// Only the unit tests call this, so it is compiled for test builds only;
/// otherwise it would be dead code in the library.
#[cfg(test)]
fn make_tile(id: &str, q: &str, a: &str, domain: &str, conf: f64, tags: Vec<&str>) -> Tile {
    Tile {
        id: id.into(),
        question: q.into(),
        answer: a.into(),
        domain: domain.into(),
        confidence: conf,
        tags: tags.into_iter().map(String::from).collect(),
    }
}
220
/// Unit tests for validation, batch processing and the batch utilities.
#[cfg(test)]
mod tests {
    use super::*;

    // A tile that satisfies every rule is accepted without errors.
    #[test]
    fn test_validate_good_tile() {
        let v = BatchValidator::default();
        let t = make_tile("t1", "What is PLATO?", "Training pipeline for agents.", "plato", 0.9, vec!["training", "pipeline"]);
        let r = v.validate(&t);
        assert_eq!(r.status, TileStatus::Accepted);
        assert!(r.errors.is_empty());
    }

    // Confidence below the default minimum rejects the tile.
    #[test]
    fn test_validate_low_confidence() {
        let v = BatchValidator::default();
        let t = make_tile("t2", "What is PLATO?", "Training pipeline.", "plato", 0.1, vec![]);
        let r = v.validate(&t);
        assert_eq!(r.status, TileStatus::Rejected);
        assert!(r.errors.iter().any(|e| e.contains("confidence")));
    }

    // Question and answer shorter than the minimum reject the tile.
    #[test]
    fn test_validate_short_content() {
        let v = BatchValidator::default();
        let t = make_tile("t3", "Short", "Short", "x", 0.9, vec![]);
        let r = v.validate(&t);
        assert_eq!(r.status, TileStatus::Rejected);
    }

    // An empty domain is reported as an error.
    #[test]
    fn test_validate_empty_domain() {
        let v = BatchValidator::default();
        let t = make_tile("t4", "What is flux?", "Bytecode runtime for agents.", "", 0.9, vec![]);
        let r = v.validate(&t);
        assert!(r.errors.iter().any(|e| e.contains("domain")));
    }

    // Missing tags only warn; the tile is still accepted.
    #[test]
    fn test_validate_warnings() {
        let v = BatchValidator::default();
        let t = make_tile("t5", "What is constraint theory?", "Geometric snapping for deterministic computation across all machines.", "ct", 0.9, vec![]);
        let r = v.validate(&t);
        assert_eq!(r.status, TileStatus::AcceptedWithWarnings);
        assert!(r.warnings.iter().any(|w| w.contains("tags")));
    }

    // 90 good tiles and 10 bad ones produce matching summary counts.
    #[test]
    fn test_batch_validate_100() {
        let tiles: Vec<Tile> = (0..100).map(|i| {
            if i < 90 { make_tile(&format!("g{}", i), &format!("Question number {} about PLATO and tiles", i), &format!("Answer number {} describing the training pipeline", i), "plato", 0.9, vec!["test"]) }
            else { make_tile(&format!("b{}", i), "Short", "Short", "x", 0.1, vec![]) }
        }).collect();
        // Only the summary is inspected here; the per-tile results are unused.
        let (_results, summary) = TileBatch::validate_batch(&tiles, &BatchValidator::default());
        assert_eq!(summary.total, 100);
        assert_eq!(summary.accepted, 90);
        assert_eq!(summary.rejected, 10);
        assert!(summary.pass_rate() > 0.89);
        assert_eq!(summary.domains.get("plato").copied().unwrap_or(0), 90);
    }

    #[test]
    fn test_filter_by_domain() {
        let tiles = vec![
            make_tile("t1", "Q1 about PLATO?", "A1 about PLATO.", "plato", 0.9, vec![]),
            make_tile("t2", "Q2 about flux?", "A2 about flux.", "flux", 0.9, vec![]),
            make_tile("t3", "Q3 about PLATO?", "A3 about PLATO.", "plato", 0.8, vec![]),
        ];
        let plato = TileBatch::filter_by_domain(&tiles, "plato");
        assert_eq!(plato.len(), 2);
    }

    #[test]
    fn test_partition() {
        let tiles = vec![
            make_tile("g1", "Good question about tiles", "Good answer about tiles", "plato", 0.9, vec!["test"]),
            make_tile("b1", "Short", "Short", "x", 0.1, vec![]),
        ];
        let (results, _) = TileBatch::validate_batch(&tiles, &BatchValidator::default());
        let (accepted, rejected) = TileBatch::partition(&results, &tiles);
        assert_eq!(accepted.len(), 1);
        assert_eq!(rejected.len(), 1);
    }

    #[test]
    fn test_dedup() {
        let tiles = vec![
            make_tile("t1", "Q?", "A.", "x", 0.9, vec![]),
            make_tile("t2", "Q?", "A.", "x", 0.9, vec![]),
            make_tile("t3", "Different Q", "Different A", "x", 0.9, vec![]),
        ];
        let deduped = TileBatch::dedup(&tiles);
        assert_eq!(deduped.len(), 2);
    }

    // Empty IDs are filled in; existing IDs are left alone.
    #[test]
    fn test_assign_ids() {
        let mut tiles = vec![
            Tile { id: String::new(), question: "Q".into(), answer: "A".into(), domain: "x".into(), confidence: 0.9, tags: vec![] },
            Tile { id: "existing".into(), question: "Q".into(), answer: "A".into(), domain: "x".into(), confidence: 0.9, tags: vec![] },
        ];
        TileBatch::assign_ids(&mut tiles, "batch");
        assert_eq!(tiles[0].id, "batch-1");
        assert_eq!(tiles[1].id, "existing");
    }

    #[test]
    fn test_quick_stats() {
        let tiles = vec![
            make_tile("t1", "Q1 about PLATO?", "A1 about PLATO.", "plato", 0.9, vec![]),
            make_tile("t2", "Q2 about flux?", "A2 about flux.", "flux", 0.8, vec![]),
            make_tile("t3", "Q3 about tiles?", "A3 about tiles.", "plato", 0.7, vec![]),
        ];
        let stats = TileBatch::quick_stats(&tiles);
        assert_eq!(stats.total, 3);
        assert_eq!(stats.domain_count, 2);
        assert!((stats.avg_confidence - 0.8).abs() < 0.01);
    }

    #[test]
    fn test_quick_stats_empty() {
        let stats = TileBatch::quick_stats(&[]);
        assert_eq!(stats.total, 0);
    }

    // Every rejected tile contributes exactly one line to the summary errors.
    #[test]
    fn test_summary_errors_list() {
        let tiles = vec![
            make_tile("bad1", "Q?", "A.", "", 0.1, vec![]),
            make_tile("bad2", "Tiny", "Tiny", "x", 0.1, vec![]),
        ];
        let (_, summary) = TileBatch::validate_batch(&tiles, &BatchValidator::default());
        assert_eq!(summary.rejected, 2);
        assert_eq!(summary.errors.len(), 2);
    }
}