aprender/synthetic/
mod.rs

1//! Synthetic Data Generation for AutoML.
2//!
3//! This module provides automatic synthetic data generation capabilities
4//! to improve model performance in low-resource domains. Generated data
5//! is validated, quality-scored, and integrated into the AutoML optimization loop.
6//!
7//! # Quick Start
8//!
9//! ```
10//! use aprender::synthetic::{SyntheticConfig, GenerationStrategy};
11//!
12//! // Configure synthetic data generation
13//! let config = SyntheticConfig::default()
14//!     .with_augmentation_ratio(0.5)
15//!     .with_quality_threshold(0.7);
16//!
17//! assert_eq!(config.augmentation_ratio, 0.5);
18//! assert_eq!(config.quality_threshold, 0.7);
19//! ```
20//!
21//! # Design Principles
22//!
23//! - **Quality-First**: All generated samples validated before inclusion
24//! - **Diversity-Aware**: Monitors for mode collapse and distribution shift
25//! - **AutoML Integration**: Augmentation parameters jointly optimized with model hyperparameters
26//!
27//! # References
28//!
29//! - Cubuk et al. (2019). AutoAugment: Learning Augmentation Strategies. CVPR.
30//! - Wei & Zou (2019). EDA: Easy Data Augmentation. EMNLP.
31//! - Ratner et al. (2017). Snorkel: Weak Supervision. VLDB.
32
33pub mod andon;
34pub mod cache;
35pub mod code_eda;
36pub mod code_features;
37mod config;
38mod diversity;
39pub mod eda;
40pub mod mixup;
41mod params;
42mod quality;
43pub mod shell;
44mod strategy;
45pub mod template;
46mod validator;
47pub mod weak_supervision;
48
49pub use andon::{AndonConfig, AndonEvent, AndonHandler, AndonSeverity, DefaultAndon, TestAndon};
50pub use config::SyntheticConfig;
51pub use diversity::{DiversityMonitor, DiversityScore};
52pub use params::SyntheticParam;
53pub use quality::QualityDegradationDetector;
54pub use strategy::GenerationStrategy;
55pub use validator::{SyntheticValidator, ValidationResult};
56
57use crate::error::Result;
58
59/// Trait for synthetic data generators.
60///
61/// Implement this trait for domain-specific data generation (e.g., shell commands,
62/// code translation pairs). The generator produces synthetic samples from seed data,
63/// with quality and diversity scoring for filtering.
64///
65/// # Type Parameters
66///
67/// - `Input`: The type of seed samples used for generation
68/// - `Output`: The type of generated synthetic samples
69///
70/// # Example
71///
72/// ```
73/// use aprender::synthetic::{SyntheticGenerator, SyntheticConfig};
74/// use aprender::error::Result;
75///
76/// struct TextGenerator;
77///
78/// impl SyntheticGenerator for TextGenerator {
79///     type Input = String;
80///     type Output = String;
81///
82///     fn generate(&self, seeds: &[Self::Input], config: &SyntheticConfig)
83///         -> Result<Vec<Self::Output>>
84///     {
85///         let target = (seeds.len() as f32 * config.augmentation_ratio) as usize;
86///         let synthetic: Vec<_> = seeds.iter()
87///             .take(target)
88///             .map(|s| format!("{s} [synthetic]"))
89///             .collect();
90///         Ok(synthetic)
91///     }
92///
93///     fn quality_score(&self, _generated: &Self::Output, _seed: &Self::Input) -> f32 {
94///         0.85
95///     }
96///
97///     fn diversity_score(&self, batch: &[Self::Output]) -> f32 {
98///         if batch.is_empty() { 0.0 } else { 1.0 / batch.len() as f32 }
99///     }
100/// }
101/// ```
102pub trait SyntheticGenerator {
103    /// Type of seed samples used for generation.
104    type Input;
105    /// Type of generated synthetic samples.
106    type Output;
107
108    /// Generate synthetic examples from seed data.
109    ///
110    /// # Arguments
111    ///
112    /// * `seeds` - Original samples to use as generation seeds
113    /// * `config` - Configuration controlling generation behavior
114    ///
115    /// # Returns
116    ///
117    /// Vector of generated synthetic samples, filtered by quality threshold.
118    fn generate(
119        &self,
120        seeds: &[Self::Input],
121        config: &SyntheticConfig,
122    ) -> Result<Vec<Self::Output>>;
123
124    /// Estimate quality of a generated sample relative to its seed.
125    ///
126    /// Returns a score in [0.0, 1.0] where higher is better quality.
127    /// Samples below `config.quality_threshold` are rejected.
128    fn quality_score(&self, generated: &Self::Output, seed: &Self::Input) -> f32;
129
130    /// Measure diversity of a batch of generated samples.
131    ///
132    /// Returns a score in [0.0, 1.0] where higher indicates more diverse samples.
133    /// Low diversity suggests mode collapse in generation.
134    fn diversity_score(&self, batch: &[Self::Output]) -> f32;
135}
136
137/// Callback trait for monitoring synthetic data generation.
138///
139/// Implement this to receive notifications during generation for logging,
140/// metrics collection, or early termination.
141pub trait SyntheticCallback: Send + Sync {
142    /// Called after each batch of synthetic samples is generated.
143    fn on_batch_generated(&mut self, count: usize, config: &SyntheticConfig);
144
145    /// Called when quality falls below threshold.
146    fn on_quality_below_threshold(&mut self, actual: f32, threshold: f32);
147
148    /// Called when diversity metrics indicate potential collapse.
149    fn on_diversity_collapse(&mut self, score: &DiversityScore);
150}
151
152/// Check Andon conditions and trigger events if thresholds exceeded.
153///
154/// Returns an error with Andon halt if critical conditions detected.
155///
156/// # Arguments
157///
158/// * `accepted` - Number of accepted samples
159/// * `total` - Total samples generated
160/// * `diversity` - Current diversity score
161/// * `config` - Synthetic configuration with Andon settings
162/// * `andon` - Optional Andon handler for event notification
163///
164/// # Returns
165///
166/// Ok(()) if generation should continue, Err if Andon halt triggered.
167pub fn check_andon<A: AndonHandler>(
168    accepted: usize,
169    total: usize,
170    diversity: f32,
171    config: &SyntheticConfig,
172    andon: Option<&A>,
173) -> Result<()> {
174    if !config.andon.enabled || total == 0 {
175        return Ok(());
176    }
177
178    let rejection_rate = 1.0 - (accepted as f32 / total as f32);
179
180    // Check rejection rate
181    if config.andon.exceeds_rejection_threshold(rejection_rate) {
182        let event = AndonEvent::HighRejectionRate {
183            rate: rejection_rate,
184            threshold: config.andon.rejection_threshold,
185        };
186
187        if let Some(handler) = andon {
188            handler.on_event(&event);
189            if handler.should_halt(&event) {
190                return Err(crate::error::AprenderError::Other(format!(
191                    "ANDON HALT: Rejection rate {:.1}% exceeds threshold {:.1}%",
192                    rejection_rate * 100.0,
193                    config.andon.rejection_threshold * 100.0
194                )));
195            }
196        }
197    }
198
199    // Check diversity collapse
200    if config.andon.has_diversity_collapse(diversity) {
201        let event = AndonEvent::DiversityCollapse {
202            score: diversity,
203            minimum: config.andon.diversity_minimum,
204        };
205
206        if let Some(handler) = andon {
207            handler.on_event(&event);
208            // Diversity collapse is warning, not halt
209        }
210    }
211
212    Ok(())
213}
214
215/// Generate synthetic data in batches to manage memory.
216///
217/// # Arguments
218///
219/// * `generator` - The synthetic data generator to use
220/// * `seeds` - Original samples to use as generation seeds
221/// * `config` - Configuration controlling generation behavior
222/// * `batch_size` - Number of seeds to process per batch
223///
224/// # Example
225///
226/// ```
227/// use aprender::synthetic::{generate_batched, SyntheticGenerator, SyntheticConfig};
228/// use aprender::error::Result;
229///
230/// struct SimpleGenerator;
231///
232/// impl SyntheticGenerator for SimpleGenerator {
233///     type Input = i32;
234///     type Output = i32;
235///
236///     fn generate(&self, seeds: &[i32], config: &SyntheticConfig) -> Result<Vec<i32>> {
237///         Ok(seeds.iter().map(|x| x * 2).collect())
238///     }
239///
240///     fn quality_score(&self, _: &i32, _: &i32) -> f32 { 1.0 }
241///     fn diversity_score(&self, _: &[i32]) -> f32 { 1.0 }
242/// }
243///
244/// let gen = SimpleGenerator;
245/// let seeds = vec![1, 2, 3, 4, 5];
246/// let config = SyntheticConfig::default();
247/// let result = generate_batched(&gen, &seeds, &config, 2).expect("generation should succeed");
248/// assert_eq!(result, vec![2, 4, 6, 8, 10]);
249/// ```
250pub fn generate_batched<G>(
251    generator: &G,
252    seeds: &[G::Input],
253    config: &SyntheticConfig,
254    batch_size: usize,
255) -> Result<Vec<G::Output>>
256where
257    G: SyntheticGenerator,
258{
259    let mut all_synthetic = Vec::new();
260
261    for chunk in seeds.chunks(batch_size.max(1)) {
262        let batch = generator.generate(chunk, config)?;
263        all_synthetic.extend(batch);
264    }
265
266    Ok(all_synthetic)
267}
268
269/// Streaming iterator for memory-constrained synthetic generation.
270///
271/// Generates synthetic data on-demand rather than all at once,
272/// reducing peak memory usage for large datasets.
273#[derive(Debug)]
274pub struct SyntheticStream<'a, G: SyntheticGenerator + std::fmt::Debug> {
275    generator: &'a G,
276    seeds: &'a [G::Input],
277    config: &'a SyntheticConfig,
278    current_idx: usize,
279    batch_size: usize,
280}
281
282impl<'a, G: SyntheticGenerator + std::fmt::Debug> SyntheticStream<'a, G> {
283    /// Create a new streaming generator.
284    ///
285    /// # Arguments
286    ///
287    /// * `generator` - The synthetic data generator to use
288    /// * `seeds` - Original samples to use as generation seeds
289    /// * `config` - Configuration controlling generation behavior
290    /// * `batch_size` - Number of seeds to process per iteration
291    #[must_use]
292    pub fn new(
293        generator: &'a G,
294        seeds: &'a [G::Input],
295        config: &'a SyntheticConfig,
296        batch_size: usize,
297    ) -> Self {
298        Self {
299            generator,
300            seeds,
301            config,
302            current_idx: 0,
303            batch_size: batch_size.max(1),
304        }
305    }
306
307    /// Check if there are more batches to generate.
308    #[must_use]
309    pub fn has_next(&self) -> bool {
310        self.current_idx < self.seeds.len()
311    }
312
313    /// Get the number of seeds remaining to process.
314    #[must_use]
315    pub fn remaining(&self) -> usize {
316        self.seeds.len().saturating_sub(self.current_idx)
317    }
318}
319
320impl<G: SyntheticGenerator + std::fmt::Debug> Iterator for SyntheticStream<'_, G> {
321    type Item = Result<Vec<G::Output>>;
322
323    fn next(&mut self) -> Option<Self::Item> {
324        if self.current_idx >= self.seeds.len() {
325            return None;
326        }
327        let end = (self.current_idx + self.batch_size).min(self.seeds.len());
328        let chunk = &self.seeds[self.current_idx..end];
329        self.current_idx = end;
330        Some(self.generator.generate(chunk, self.config))
331    }
332}
333
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal generator for the tests below: every seed is doubled.
    #[derive(Debug)]
    struct DoubleGenerator;

    impl SyntheticGenerator for DoubleGenerator {
        type Input = i32;
        type Output = i32;

        fn generate(&self, seeds: &[i32], _config: &SyntheticConfig) -> Result<Vec<i32>> {
            let doubled = seeds.iter().map(|seed| seed * 2).collect();
            Ok(doubled)
        }

        /// Perfect score when `generated` is exactly twice the seed, zero otherwise.
        fn quality_score(&self, generated: &i32, seed: &i32) -> f32 {
            let expected = seed * 2;
            if *generated == expected {
                1.0
            } else {
                0.0
            }
        }

        /// Share of distinct values in the batch; an empty batch scores zero.
        fn diversity_score(&self, batch: &[i32]) -> f32 {
            use std::collections::HashSet;

            if batch.is_empty() {
                return 0.0;
            }
            let distinct = batch.iter().collect::<HashSet<_>>().len();
            distinct as f32 / batch.len() as f32
        }
    }

    // ============================================================================
    // Generator trait
    // ============================================================================

    #[test]
    fn test_synthetic_generator_trait() {
        let generator = DoubleGenerator;
        let config = SyntheticConfig::default();
        let seeds = vec![1, 2, 3];

        let produced = generator
            .generate(&seeds, &config)
            .expect("generation failed");
        assert_eq!(produced, vec![2, 4, 6]);
    }

    #[test]
    fn test_quality_score() {
        let generator = DoubleGenerator;

        let matching = generator.quality_score(&4, &2);
        let mismatching = generator.quality_score(&5, &2);

        assert!((matching - 1.0).abs() < f32::EPSILON);
        assert!((mismatching - 0.0).abs() < f32::EPSILON);
    }

    #[test]
    fn test_diversity_score() {
        let generator = DoubleGenerator;

        let all_distinct = generator.diversity_score(&[1, 2, 3]);
        let all_equal = generator.diversity_score(&[1, 1, 1]);
        let empty = generator.diversity_score(&[]);

        assert!((all_distinct - 1.0).abs() < f32::EPSILON);
        assert!((all_equal - (1.0 / 3.0)).abs() < f32::EPSILON);
        assert!((empty - 0.0).abs() < f32::EPSILON);
    }

    // ============================================================================
    // Batched generation
    // ============================================================================

    #[test]
    fn test_generate_batched() {
        let generator = DoubleGenerator;
        let config = SyntheticConfig::default();
        let seeds = vec![1, 2, 3, 4, 5];

        let produced =
            generate_batched(&generator, &seeds, &config, 2).expect("batched generation failed");
        assert_eq!(produced, vec![2, 4, 6, 8, 10]);
    }

    #[test]
    fn test_generate_batched_single_batch() {
        let generator = DoubleGenerator;
        let config = SyntheticConfig::default();
        let seeds = vec![1, 2, 3];

        // Batch size larger than the seed count: everything fits in one batch.
        let produced =
            generate_batched(&generator, &seeds, &config, 100).expect("batched generation failed");
        assert_eq!(produced, vec![2, 4, 6]);
    }

    #[test]
    fn test_generate_batched_empty() {
        let generator = DoubleGenerator;
        let config = SyntheticConfig::default();
        let seeds: Vec<i32> = Vec::new();

        let produced =
            generate_batched(&generator, &seeds, &config, 2).expect("batched generation failed");
        assert!(produced.is_empty());
    }

    #[test]
    fn test_batch_size_zero_becomes_one() {
        let generator = DoubleGenerator;
        let config = SyntheticConfig::default();
        let seeds = vec![1, 2, 3];

        // A zero batch size must behave like a batch size of one.
        let produced =
            generate_batched(&generator, &seeds, &config, 0).expect("generation failed");
        assert_eq!(produced, vec![2, 4, 6]);
    }

    // ============================================================================
    // Streaming generation
    // ============================================================================

    #[test]
    fn test_synthetic_stream_basic() {
        let generator = DoubleGenerator;
        let config = SyntheticConfig::default();
        let seeds = vec![1, 2, 3, 4, 5];

        let batches: Vec<Vec<i32>> = SyntheticStream::new(&generator, &seeds, &config, 2)
            .map(|batch| batch.expect("generation failed"))
            .collect();

        // Seeds are windowed as [1,2], [3,4], [5].
        assert_eq!(batches.len(), 3);
        assert_eq!(batches[0], vec![2, 4]);
        assert_eq!(batches[1], vec![6, 8]);
        assert_eq!(batches[2], vec![10]);
    }

    #[test]
    fn test_synthetic_stream_has_next() {
        let generator = DoubleGenerator;
        let config = SyntheticConfig::default();
        let seeds = vec![1, 2];

        let mut stream = SyntheticStream::new(&generator, &seeds, &config, 1);

        // Nothing consumed yet.
        assert!(stream.has_next());
        assert_eq!(stream.remaining(), 2);

        // One seed consumed.
        stream.next();
        assert!(stream.has_next());
        assert_eq!(stream.remaining(), 1);

        // Both seeds consumed.
        stream.next();
        assert!(!stream.has_next());
        assert_eq!(stream.remaining(), 0);
    }

    #[test]
    fn test_synthetic_stream_empty() {
        let generator = DoubleGenerator;
        let config = SyntheticConfig::default();
        let seeds: Vec<i32> = Vec::new();

        let mut stream = SyntheticStream::new(&generator, &seeds, &config, 2);
        assert!(!stream.has_next());
        assert!(stream.next().is_none());
    }

    // ============================================================================
    // EXTREME TDD: Andon Integration Tests
    // ============================================================================

    #[test]
    fn test_check_andon_disabled() {
        let config = SyntheticConfig::default().with_andon_enabled(false);
        let handler = TestAndon::new();

        // Even a 100% rejection rate must be ignored while Andon is disabled.
        let outcome = check_andon::<TestAndon>(0, 100, 0.5, &config, Some(&handler));
        assert!(outcome.is_ok());
        assert!(handler.events().is_empty());
    }

    #[test]
    fn test_check_andon_empty_total() {
        let config = SyntheticConfig::default();
        let handler = TestAndon::new();

        // With no samples generated there is nothing to evaluate.
        let outcome = check_andon::<TestAndon>(0, 0, 0.5, &config, Some(&handler));
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_check_andon_high_rejection_halts() {
        let config = SyntheticConfig::default().with_andon_rejection_threshold(0.90);
        let handler = TestAndon::new();

        // 5 accepted out of 100 is a 95% rejection rate, above the 90% threshold.
        let outcome = check_andon::<TestAndon>(5, 100, 0.5, &config, Some(&handler));
        assert!(outcome.is_err());
        assert!(handler.was_halted());
        assert_eq!(handler.count_high_rejection(), 1);
    }

    #[test]
    fn test_check_andon_acceptable_rejection() {
        let config = SyntheticConfig::default().with_andon_rejection_threshold(0.90);
        let handler = TestAndon::new();

        // 20 accepted out of 100 is an 80% rejection rate, under the threshold.
        let outcome = check_andon::<TestAndon>(20, 100, 0.5, &config, Some(&handler));
        assert!(outcome.is_ok());
        assert!(!handler.was_halted());
    }

    #[test]
    fn test_check_andon_diversity_collapse_warns() {
        let andon_config = AndonConfig::new().with_diversity_minimum(0.2);
        let config = SyntheticConfig::default().with_andon(andon_config);
        let handler = TestAndon::new();

        // Acceptance is fine, but diversity 0.1 is under the 0.2 minimum.
        let outcome = check_andon::<TestAndon>(80, 100, 0.1, &config, Some(&handler));

        // Diversity collapse only warns: one event recorded, no halt.
        assert!(outcome.is_ok());
        assert!(!handler.was_halted());
        assert_eq!(handler.events().len(), 1);
    }

    #[test]
    fn test_check_andon_no_handler() {
        let config = SyntheticConfig::default().with_andon_rejection_threshold(0.90);

        // The rejection rate is high, but without a handler nothing can request a halt.
        let outcome = check_andon::<DefaultAndon>(5, 100, 0.5, &config, None);
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_check_andon_multiple_conditions() {
        // NOTE(review): `with_andon` runs after `with_andon_rejection_threshold`.
        // If it replaces the whole Andon config, the 0.90 set here is discarded
        // and the test relies on the `AndonConfig::new()` default - confirm.
        let config = SyntheticConfig::default()
            .with_andon_rejection_threshold(0.90)
            .with_andon(AndonConfig::new().with_diversity_minimum(0.2));
        let handler = TestAndon::new();

        // High rejection (97%) and low diversity (0.05) at the same time.
        let outcome = check_andon::<TestAndon>(3, 100, 0.05, &config, Some(&handler));

        // The rejection check runs first and halts.
        assert!(outcome.is_err());
        assert!(handler.was_halted());
    }
}