Skip to main content

aprender/synthetic/
mod.rs

1//! Synthetic Data Generation for `AutoML`.
2//!
3//! This module provides automatic synthetic data generation capabilities
4//! to improve model performance in low-resource domains. Generated data
5//! is validated, quality-scored, and integrated into the `AutoML` optimization loop.
6//!
7//! # Quick Start
8//!
9//! ```
10//! use aprender::synthetic::{SyntheticConfig, GenerationStrategy};
11//!
12//! // Configure synthetic data generation
13//! let config = SyntheticConfig::default()
14//!     .with_augmentation_ratio(0.5)
15//!     .with_quality_threshold(0.7);
16//!
17//! assert_eq!(config.augmentation_ratio, 0.5);
18//! assert_eq!(config.quality_threshold, 0.7);
19//! ```
20//!
21//! # Design Principles
22//!
23//! - **Quality-First**: All generated samples validated before inclusion
24//! - **Diversity-Aware**: Monitors for mode collapse and distribution shift
25//! - **`AutoML` Integration**: Augmentation parameters jointly optimized with model hyperparameters
26//!
27//! # References
28//!
29//! - Cubuk et al. (2019). `AutoAugment`: Learning Augmentation Strategies. CVPR.
30//! - Wei & Zou (2019). EDA: Easy Data Augmentation. EMNLP.
31//! - Ratner et al. (2017). Snorkel: Weak Supervision. VLDB.
32
33pub mod andon;
34pub mod cache;
35pub mod code_eda;
36pub mod code_features;
37mod config;
38mod diversity;
39pub mod eda;
40pub mod mixup;
41mod params;
42mod quality;
43pub mod shell;
44mod strategy;
45pub mod template;
46mod validator;
47pub mod weak_supervision;
48
49pub use andon::{AndonConfig, AndonEvent, AndonHandler, AndonSeverity, DefaultAndon, TestAndon};
50pub use config::SyntheticConfig;
51pub use diversity::{DiversityMonitor, DiversityScore};
52pub use params::SyntheticParam;
53pub use quality::QualityDegradationDetector;
54pub use strategy::GenerationStrategy;
55pub use validator::{SyntheticValidator, ValidationResult};
56
57use crate::error::Result;
58
/// Trait for synthetic data generators.
///
/// Implement this trait for domain-specific data generation (e.g., shell commands,
/// code translation pairs). The generator produces synthetic samples from seed data,
/// and exposes quality and diversity scores that can be used for filtering.
///
/// # Associated Types
///
/// - `Input`: The type of seed samples used for generation
/// - `Output`: The type of generated synthetic samples
///
/// # Examples
///
/// ```
/// use aprender::synthetic::{SyntheticGenerator, SyntheticConfig};
/// use aprender::error::Result;
///
/// struct TextGenerator;
///
/// impl SyntheticGenerator for TextGenerator {
///     type Input = String;
///     type Output = String;
///
///     fn generate(&self, seeds: &[Self::Input], config: &SyntheticConfig)
///         -> Result<Vec<Self::Output>>
///     {
///         let target = (seeds.len() as f32 * config.augmentation_ratio) as usize;
///         let synthetic: Vec<_> = seeds.iter()
///             .take(target)
///             .map(|s| format!("{s} [synthetic]"))
///             .collect();
///         Ok(synthetic)
///     }
///
///     fn quality_score(&self, _generated: &Self::Output, _seed: &Self::Input) -> f32 {
///         0.85
///     }
///
///     fn diversity_score(&self, batch: &[Self::Output]) -> f32 {
///         if batch.is_empty() { 0.0 } else { 1.0 / batch.len() as f32 }
///     }
/// }
/// ```
pub trait SyntheticGenerator {
    /// Type of seed samples used for generation.
    type Input;
    /// Type of generated synthetic samples.
    type Output;

    /// Generate synthetic examples from seed data.
    ///
    /// # Arguments
    ///
    /// * `seeds` - Original samples to use as generation seeds
    /// * `config` - Configuration controlling generation behavior
    ///
    /// # Returns
    ///
    /// Vector of generated synthetic samples, filtered by quality threshold.
    ///
    /// # Errors
    ///
    /// Implementations return `Err` when generation fails; the exact failure
    /// conditions are defined by each implementor.
    fn generate(
        &self,
        seeds: &[Self::Input],
        config: &SyntheticConfig,
    ) -> Result<Vec<Self::Output>>;

    /// Estimate quality of a generated sample relative to its seed.
    ///
    /// Returns a score in [0.0, 1.0] where higher is better quality.
    /// Samples below `config.quality_threshold` are rejected.
    fn quality_score(&self, generated: &Self::Output, seed: &Self::Input) -> f32;

    /// Measure diversity of a batch of generated samples.
    ///
    /// Returns a score in [0.0, 1.0] where higher indicates more diverse samples.
    /// Low diversity suggests mode collapse in generation.
    fn diversity_score(&self, batch: &[Self::Output]) -> f32;
}
136
/// Callback trait for monitoring synthetic data generation.
///
/// Implement this to receive notifications during generation for logging,
/// metrics collection, or early termination.
///
/// Implementors must be `Send + Sync`. Every hook takes `&mut self`, so a
/// callback can accumulate state (counters, last-seen scores) across calls.
///
/// NOTE(review): all hooks return `()`, so no early-termination signal is
/// visible in this trait itself; presumably it is carried in the implementor's
/// own state. Confirm the intended mechanism with the callers.
pub trait SyntheticCallback: Send + Sync {
    /// Called after each batch of synthetic samples is generated.
    ///
    /// `count` is the number of samples reported for the batch and `config`
    /// is the configuration in effect for the run.
    fn on_batch_generated(&mut self, count: usize, config: &SyntheticConfig);

    /// Called when quality falls below threshold.
    ///
    /// `actual` is the observed quality value and `threshold` the limit it
    /// was compared against.
    fn on_quality_below_threshold(&mut self, actual: f32, threshold: f32);

    /// Called when diversity metrics indicate potential collapse.
    fn on_diversity_collapse(&mut self, score: &DiversityScore);
}
151
152/// Check Andon conditions and trigger events if thresholds exceeded.
153///
154/// Returns an error with Andon halt if critical conditions detected.
155///
156/// # Arguments
157///
158/// * `accepted` - Number of accepted samples
159/// * `total` - Total samples generated
160/// * `diversity` - Current diversity score
161/// * `config` - Synthetic configuration with Andon settings
162/// * `andon` - Optional Andon handler for event notification
163///
164/// # Returns
165///
166/// Ok(()) if generation should continue, Err if Andon halt triggered.
167pub fn check_andon<A: AndonHandler>(
168    accepted: usize,
169    total: usize,
170    diversity: f32,
171    config: &SyntheticConfig,
172    andon: Option<&A>,
173) -> Result<()> {
174    if !config.andon.enabled || total == 0 {
175        return Ok(());
176    }
177
178    let rejection_rate = 1.0 - (accepted as f32 / total as f32);
179    check_rejection_rate(rejection_rate, config, andon)?;
180    check_diversity(diversity, config, andon);
181
182    Ok(())
183}
184
185/// Check rejection rate and trigger Andon event if threshold exceeded.
186fn check_rejection_rate<A: AndonHandler>(
187    rejection_rate: f32,
188    config: &SyntheticConfig,
189    andon: Option<&A>,
190) -> Result<()> {
191    if !config.andon.exceeds_rejection_threshold(rejection_rate) {
192        return Ok(());
193    }
194
195    let event = AndonEvent::HighRejectionRate {
196        rate: rejection_rate,
197        threshold: config.andon.rejection_threshold,
198    };
199
200    if let Some(handler) = andon {
201        handler.on_event(&event);
202        if handler.should_halt(&event) {
203            return Err(crate::error::AprenderError::Other(format!(
204                "ANDON HALT: Rejection rate {:.1}% exceeds threshold {:.1}%",
205                rejection_rate * 100.0,
206                config.andon.rejection_threshold * 100.0
207            )));
208        }
209    }
210    Ok(())
211}
212
213/// Check diversity and trigger Andon event if collapsed.
214fn check_diversity<A: AndonHandler>(diversity: f32, config: &SyntheticConfig, andon: Option<&A>) {
215    if !config.andon.has_diversity_collapse(diversity) {
216        return;
217    }
218
219    let event = AndonEvent::DiversityCollapse {
220        score: diversity,
221        minimum: config.andon.diversity_minimum,
222    };
223
224    if let Some(handler) = andon {
225        handler.on_event(&event);
226        // Diversity collapse is warning, not halt
227    }
228}
229
230/// Generate synthetic data in batches to manage memory.
231///
232/// # Arguments
233///
234/// * `generator` - The synthetic data generator to use
235/// * `seeds` - Original samples to use as generation seeds
236/// * `config` - Configuration controlling generation behavior
237/// * `batch_size` - Number of seeds to process per batch
238///
239/// # Example
240///
241/// ```
242/// use aprender::synthetic::{generate_batched, SyntheticGenerator, SyntheticConfig};
243/// use aprender::error::Result;
244///
245/// struct SimpleGenerator;
246///
247/// impl SyntheticGenerator for SimpleGenerator {
248///     type Input = i32;
249///     type Output = i32;
250///
251///     fn generate(&self, seeds: &[i32], config: &SyntheticConfig) -> Result<Vec<i32>> {
252///         Ok(seeds.iter().map(|x| x * 2).collect())
253///     }
254///
255///     fn quality_score(&self, _: &i32, _: &i32) -> f32 { 1.0 }
256///     fn diversity_score(&self, _: &[i32]) -> f32 { 1.0 }
257/// }
258///
259/// let gen = SimpleGenerator;
260/// let seeds = vec![1, 2, 3, 4, 5];
261/// let config = SyntheticConfig::default();
262/// let result = generate_batched(&gen, &seeds, &config, 2).expect("generation should succeed");
263/// assert_eq!(result, vec![2, 4, 6, 8, 10]);
264/// ```
265pub fn generate_batched<G>(
266    generator: &G,
267    seeds: &[G::Input],
268    config: &SyntheticConfig,
269    batch_size: usize,
270) -> Result<Vec<G::Output>>
271where
272    G: SyntheticGenerator,
273{
274    let mut all_synthetic = Vec::new();
275
276    for chunk in seeds.chunks(batch_size.max(1)) {
277        let batch = generator.generate(chunk, config)?;
278        all_synthetic.extend(batch);
279    }
280
281    Ok(all_synthetic)
282}
283
284/// Streaming iterator for memory-constrained synthetic generation.
285///
286/// Generates synthetic data on-demand rather than all at once,
287/// reducing peak memory usage for large datasets.
288#[derive(Debug)]
289pub struct SyntheticStream<'a, G: SyntheticGenerator + std::fmt::Debug> {
290    generator: &'a G,
291    seeds: &'a [G::Input],
292    config: &'a SyntheticConfig,
293    current_idx: usize,
294    batch_size: usize,
295}
296
297impl<'a, G: SyntheticGenerator + std::fmt::Debug> SyntheticStream<'a, G> {
298    /// Create a new streaming generator.
299    ///
300    /// # Arguments
301    ///
302    /// * `generator` - The synthetic data generator to use
303    /// * `seeds` - Original samples to use as generation seeds
304    /// * `config` - Configuration controlling generation behavior
305    /// * `batch_size` - Number of seeds to process per iteration
306    #[must_use]
307    pub fn new(
308        generator: &'a G,
309        seeds: &'a [G::Input],
310        config: &'a SyntheticConfig,
311        batch_size: usize,
312    ) -> Self {
313        Self {
314            generator,
315            seeds,
316            config,
317            current_idx: 0,
318            batch_size: batch_size.max(1),
319        }
320    }
321
322    /// Check if there are more batches to generate.
323    #[must_use]
324    pub fn has_next(&self) -> bool {
325        self.current_idx < self.seeds.len()
326    }
327
328    /// Get the number of seeds remaining to process.
329    #[must_use]
330    pub fn remaining(&self) -> usize {
331        self.seeds.len().saturating_sub(self.current_idx)
332    }
333}
334
335impl<G: SyntheticGenerator + std::fmt::Debug> Iterator for SyntheticStream<'_, G> {
336    type Item = Result<Vec<G::Output>>;
337
338    fn next(&mut self) -> Option<Self::Item> {
339        if self.current_idx >= self.seeds.len() {
340            return None;
341        }
342        let end = (self.current_idx + self.batch_size).min(self.seeds.len());
343        let chunk = &self.seeds[self.current_idx..end];
344        self.current_idx = end;
345        Some(self.generator.generate(chunk, self.config))
346    }
347}
348
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;

    /// Test-only generator that doubles every seed.
    #[derive(Debug)]
    struct DoubleGenerator;

    impl SyntheticGenerator for DoubleGenerator {
        type Input = i32;
        type Output = i32;

        fn generate(&self, seeds: &[i32], _config: &SyntheticConfig) -> Result<Vec<i32>> {
            let doubled = seeds.iter().map(|seed| seed * 2).collect();
            Ok(doubled)
        }

        fn quality_score(&self, generated: &i32, seed: &i32) -> f32 {
            let is_exact_double = *generated == seed * 2;
            if is_exact_double {
                1.0
            } else {
                0.0
            }
        }

        fn diversity_score(&self, batch: &[i32]) -> f32 {
            if batch.is_empty() {
                return 0.0;
            }
            // Fraction of distinct values in the batch.
            let distinct = batch.iter().collect::<HashSet<_>>().len();
            distinct as f32 / batch.len() as f32
        }
    }

    /// True when two scores differ by less than `f32::EPSILON`.
    fn approx_eq(actual: f32, expected: f32) -> bool {
        (actual - expected).abs() < f32::EPSILON
    }

    #[test]
    fn test_synthetic_generator_trait() {
        let config = SyntheticConfig::default();
        let seeds = [1, 2, 3];

        let doubled = DoubleGenerator
            .generate(&seeds, &config)
            .expect("generation failed");
        assert_eq!(doubled, vec![2, 4, 6]);
    }

    #[test]
    fn test_quality_score() {
        let generator = DoubleGenerator;
        assert!(approx_eq(generator.quality_score(&4, &2), 1.0));
        assert!(approx_eq(generator.quality_score(&5, &2), 0.0));
    }

    #[test]
    fn test_diversity_score() {
        let generator = DoubleGenerator;
        assert!(approx_eq(generator.diversity_score(&[1, 2, 3]), 1.0));
        assert!(approx_eq(generator.diversity_score(&[1, 1, 1]), 1.0 / 3.0));
        assert!(approx_eq(generator.diversity_score(&[]), 0.0));
    }

    #[test]
    fn test_generate_batched() {
        let config = SyntheticConfig::default();
        let seeds = [1, 2, 3, 4, 5];

        let output = generate_batched(&DoubleGenerator, &seeds, &config, 2)
            .expect("batched generation failed");
        assert_eq!(output, vec![2, 4, 6, 8, 10]);
    }

    #[test]
    fn test_generate_batched_single_batch() {
        let config = SyntheticConfig::default();
        let seeds = [1, 2, 3];

        // A batch size larger than the input collapses to a single chunk.
        let output = generate_batched(&DoubleGenerator, &seeds, &config, 100)
            .expect("batched generation failed");
        assert_eq!(output, vec![2, 4, 6]);
    }

    #[test]
    fn test_generate_batched_empty() {
        let config = SyntheticConfig::default();
        let seeds: [i32; 0] = [];

        let output = generate_batched(&DoubleGenerator, &seeds, &config, 2)
            .expect("batched generation failed");
        assert!(output.is_empty());
    }

    #[test]
    fn test_synthetic_stream_basic() {
        let generator = DoubleGenerator;
        let config = SyntheticConfig::default();
        let seeds = [1, 2, 3, 4, 5];

        let batches: Vec<_> = SyntheticStream::new(&generator, &seeds, &config, 2)
            .map(|batch| batch.expect("generation failed"))
            .collect();

        // Seeds split as [1,2], [3,4], [5].
        assert_eq!(batches.len(), 3);
        assert_eq!(batches[0], vec![2, 4]);
        assert_eq!(batches[1], vec![6, 8]);
        assert_eq!(batches[2], vec![10]);
    }

    #[test]
    fn test_synthetic_stream_has_next() {
        let generator = DoubleGenerator;
        let config = SyntheticConfig::default();
        let seeds = [1, 2];

        let mut stream = SyntheticStream::new(&generator, &seeds, &config, 1);
        assert_eq!(stream.remaining(), 2);
        assert!(stream.has_next());

        let _ = stream.next();
        assert_eq!(stream.remaining(), 1);
        assert!(stream.has_next());

        let _ = stream.next();
        assert_eq!(stream.remaining(), 0);
        assert!(!stream.has_next());
    }

    #[test]
    fn test_synthetic_stream_empty() {
        let generator = DoubleGenerator;
        let config = SyntheticConfig::default();
        let seeds: [i32; 0] = [];

        let mut stream = SyntheticStream::new(&generator, &seeds, &config, 2);
        assert!(!stream.has_next());
        assert!(stream.next().is_none());
    }

    #[test]
    fn test_batch_size_zero_becomes_one() {
        let config = SyntheticConfig::default();
        let seeds = [1, 2, 3];

        // A zero batch size is clamped to one rather than panicking.
        let output =
            generate_batched(&DoubleGenerator, &seeds, &config, 0).expect("generation failed");
        assert_eq!(output, vec![2, 4, 6]);
    }

    // ------------------------------------------------------------------------
    // Andon integration tests
    // ------------------------------------------------------------------------

    #[test]
    fn test_check_andon_disabled() {
        let handler = TestAndon::new();
        let config = SyntheticConfig::default().with_andon_enabled(false);

        // Even a 100% rejection rate must be ignored while Andon is disabled.
        let outcome = check_andon(0, 100, 0.5, &config, Some(&handler));
        assert!(outcome.is_ok());
        assert!(handler.events().is_empty());
    }

    #[test]
    fn test_check_andon_empty_total() {
        let handler = TestAndon::new();
        let config = SyntheticConfig::default();

        // With no samples generated there is nothing to evaluate.
        let outcome = check_andon(0, 0, 0.5, &config, Some(&handler));
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_check_andon_high_rejection_halts() {
        let handler = TestAndon::new();
        let config = SyntheticConfig::default().with_andon_rejection_threshold(0.90);

        // 5 accepted out of 100 is a 95% rejection rate.
        let outcome = check_andon(5, 100, 0.5, &config, Some(&handler));
        assert!(outcome.is_err());
        assert!(handler.was_halted());
        assert_eq!(handler.count_high_rejection(), 1);
    }

    #[test]
    fn test_check_andon_acceptable_rejection() {
        let handler = TestAndon::new();
        let config = SyntheticConfig::default().with_andon_rejection_threshold(0.90);

        // 20 accepted out of 100 is an 80% rejection rate, under the threshold.
        let outcome = check_andon(20, 100, 0.5, &config, Some(&handler));
        assert!(outcome.is_ok());
        assert!(!handler.was_halted());
    }

    #[test]
    fn test_check_andon_diversity_collapse_warns() {
        let handler = TestAndon::new();
        let andon_config = AndonConfig::new().with_diversity_minimum(0.2);
        let config = SyntheticConfig::default().with_andon(andon_config);

        // Diversity 0.1 is under the 0.2 minimum while acceptance stays healthy.
        let outcome = check_andon(80, 100, 0.1, &config, Some(&handler));
        assert!(outcome.is_ok()); // Diversity collapse is warning, not halt
        assert!(!handler.was_halted());
        assert_eq!(handler.events().len(), 1);
    }

    #[test]
    fn test_check_andon_no_handler() {
        let config = SyntheticConfig::default().with_andon_rejection_threshold(0.90);

        // Without a handler nobody can request a halt, so high rejection is tolerated.
        let outcome = check_andon::<DefaultAndon>(5, 100, 0.5, &config, None);
        assert!(outcome.is_ok());
    }

    #[test]
    fn test_check_andon_multiple_conditions() {
        let handler = TestAndon::new();
        // NOTE(review): `with_andon` is applied after `with_andon_rejection_threshold`;
        // if it replaces the whole Andon config, the 0.90 threshold set just before
        // may be discarded in favour of the default. Confirm against `SyntheticConfig`.
        let config = SyntheticConfig::default()
            .with_andon_rejection_threshold(0.90)
            .with_andon(AndonConfig::new().with_diversity_minimum(0.2));

        // Rejection is too high and diversity is too low at the same time.
        let outcome = check_andon(3, 100, 0.05, &config, Some(&handler));
        assert!(outcome.is_err()); // Rejection halts first
        assert!(handler.was_halted());
    }
}