Skip to main content

verificar/ml/
codex_pipeline.rs

//! CODEX Integration - End-to-end pipeline
//!
//! Unified pipeline orchestrating all CODEX components:
//! Generator → Quality Gate → Bug Priority → Active Learning → Oracle → Rich Labels → Export
//!
//! # Pipeline Flow
//!
//! ```text
//! Generator ──► QualityGate ──► DefectPredictor ──► ActiveLearner
//!                   │                 │                   │
//!                   ▼                 ▼                   ▼
//!              (filtered)        (prioritized)       (sampled)
//!                                                        │
//!                                                        ▼
//!                                                     Oracle
//!                                                        │
//!                                                        ▼
//!                                               RichLabel + Export
//! ```
//!
//! # Reference
//! - VER-054: CODEX Integration - End-to-end pipeline
23
24use serde::{Deserialize, Serialize};
25
26use crate::generator::GeneratedCode;
27use crate::ml::{ActiveLearner, CommitFeatures, DefectPredictor, QualityGate, RichLabel};
28
29/// Pipeline configuration
30#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct PipelineConfig {
32    /// Quality gate threshold (0.0 to 1.0)
33    pub quality_threshold: f32,
34    /// Number of clusters for active learning
35    pub num_clusters: usize,
36    /// Batch size for oracle calls
37    pub batch_size: usize,
38    /// Maximum oracle calls per run
39    pub max_oracle_calls: usize,
40    /// Target oracle call reduction (e.g., 10 for 10x reduction)
41    pub target_reduction: f32,
42    /// Enable active learning exploration
43    pub enable_active_learning: bool,
44    /// Enable defect prediction prioritization
45    pub enable_defect_priority: bool,
46}
47
48impl Default for PipelineConfig {
49    fn default() -> Self {
50        Self {
51            quality_threshold: 0.5,
52            num_clusters: 5,
53            batch_size: 100,
54            max_oracle_calls: 1000,
55            target_reduction: 10.0,
56            enable_active_learning: true,
57            enable_defect_priority: true,
58        }
59    }
60}
61
62impl PipelineConfig {
63    /// Create strict config (high quality threshold)
64    #[must_use]
65    pub fn strict() -> Self {
66        Self {
67            quality_threshold: 0.7,
68            ..Default::default()
69        }
70    }
71
72    /// Create fast config (minimal filtering)
73    #[must_use]
74    pub fn fast() -> Self {
75        Self {
76            quality_threshold: 0.3,
77            enable_active_learning: false,
78            enable_defect_priority: false,
79            ..Default::default()
80        }
81    }
82
83    /// Validate configuration
84    #[must_use]
85    pub fn validate(&self) -> Vec<String> {
86        let mut errors = Vec::new();
87
88        if self.quality_threshold < 0.0 || self.quality_threshold > 1.0 {
89            errors.push("quality_threshold must be in [0.0, 1.0]".to_string());
90        }
91
92        if self.num_clusters == 0 {
93            errors.push("num_clusters must be > 0".to_string());
94        }
95
96        if self.batch_size == 0 {
97            errors.push("batch_size must be > 0".to_string());
98        }
99
100        if self.target_reduction <= 0.0 {
101            errors.push("target_reduction must be > 0".to_string());
102        }
103
104        errors
105    }
106}
107
108/// Data quality metrics
109#[derive(Debug, Clone, Default, Serialize, Deserialize)]
110pub struct DataQualityMetrics {
111    /// Novelty score (0-1): how different from existing samples
112    pub novelty: f32,
113    /// Diversity score (0-1): variety within dataset (silhouette)
114    pub diversity: f32,
115    /// Difficulty score (0-1): complexity of samples
116    pub difficulty: f32,
117    /// Coverage score (0-1): AST/feature space coverage
118    pub coverage: f32,
119    /// Bug revelation rate (fraction of samples revealing bugs)
120    pub bug_rate: f32,
121}
122
123impl DataQualityMetrics {
124    /// Overall quality score (weighted average)
125    #[must_use]
126    pub fn overall(&self) -> f32 {
127        let weights = [0.2, 0.25, 0.2, 0.2, 0.15]; // novelty, diversity, difficulty, coverage, bug_rate
128        let values = [
129            self.novelty,
130            self.diversity,
131            self.difficulty,
132            self.coverage,
133            self.bug_rate,
134        ];
135
136        let weighted_sum: f32 = values.iter().zip(&weights).map(|(v, w)| v * w).sum();
137        let total_weight: f32 = weights.iter().sum();
138
139        weighted_sum / total_weight
140    }
141
142    /// Check if quality meets targets
143    #[must_use]
144    pub fn meets_targets(&self) -> bool {
145        self.diversity >= 0.6 && self.bug_rate >= 0.15 && self.coverage >= 0.7
146    }
147}
148
/// Bookkeeping record for one pipeline stage.
#[derive(Debug, Clone)]
pub struct StageResult {
    /// Name identifying the stage (e.g. `"quality_gate"`)
    pub stage: String,
    /// How many items the stage received
    pub input_count: usize,
    /// How many items the stage produced
    pub output_count: usize,
    /// Wall-clock time spent in the stage, in milliseconds
    pub time_ms: u64,
}
161
162impl StageResult {
163    /// Reduction factor (input / output)
164    #[must_use]
165    pub fn reduction_factor(&self) -> f32 {
166        if self.output_count == 0 {
167            f32::INFINITY
168        } else {
169            self.input_count as f32 / self.output_count as f32
170        }
171    }
172
173    /// Pass-through rate (output / input)
174    #[must_use]
175    pub fn pass_rate(&self) -> f32 {
176        if self.input_count == 0 {
177            0.0
178        } else {
179            self.output_count as f32 / self.input_count as f32
180        }
181    }
182}
183
184/// Pipeline execution result
185#[derive(Debug, Clone, Default)]
186pub struct PipelineResult {
187    /// Results from each stage
188    pub stages: Vec<StageResult>,
189    /// Final labeled samples
190    pub labels: Vec<RichLabel>,
191    /// Data quality metrics
192    pub quality: DataQualityMetrics,
193    /// Total samples generated
194    pub total_generated: usize,
195    /// Total oracle calls made
196    pub oracle_calls: usize,
197    /// Oracle call reduction achieved
198    pub oracle_reduction: f32,
199}
200
201impl PipelineResult {
202    /// Get stage by name
203    #[must_use]
204    pub fn stage(&self, name: &str) -> Option<&StageResult> {
205        self.stages.iter().find(|s| s.stage == name)
206    }
207
208    /// Total pipeline time in milliseconds
209    #[must_use]
210    pub fn total_time_ms(&self) -> u64 {
211        self.stages.iter().map(|s| s.time_ms).sum()
212    }
213
214    /// Did pipeline meet oracle reduction target?
215    #[must_use]
216    pub fn met_oracle_target(&self, target: f32) -> bool {
217        self.oracle_reduction >= target
218    }
219}
220
221/// Sample prepared for oracle verification
222#[derive(Debug, Clone)]
223pub struct PreparedSample {
224    /// Generated code
225    pub code: GeneratedCode,
226    /// Quality score
227    pub quality_score: f32,
228    /// Defect probability
229    pub defect_probability: f32,
230    /// Cluster assignment
231    pub cluster: Option<usize>,
232    /// Priority rank
233    pub priority: usize,
234}
235
236/// CODEX pipeline orchestrator
237#[derive(Debug)]
238pub struct CodexPipeline {
239    /// Configuration
240    config: PipelineConfig,
241    /// Quality gate
242    quality_gate: QualityGate,
243    /// Defect predictor
244    defect_predictor: DefectPredictor,
245    /// Active learner
246    active_learner: ActiveLearner,
247    /// Pipeline statistics
248    stats: PipelineStats,
249}
250
/// Counters accumulated by a pipeline across runs; all start at zero.
#[derive(Debug, Clone, Default)]
pub struct PipelineStats {
    /// Number of completed runs
    pub runs: usize,
    /// Number of samples fed into the pipeline over all runs
    pub samples_processed: usize,
    /// Number of oracle calls over all runs
    pub oracle_calls: usize,
    /// Number of bugs reported through oracle feedback
    pub bugs_found: usize,
    /// Running average of the per-run oracle reduction
    pub avg_oracle_reduction: f32,
}
265
266impl Default for CodexPipeline {
267    fn default() -> Self {
268        Self::new(PipelineConfig::default())
269    }
270}
271
272impl CodexPipeline {
273    /// Create new pipeline with config
274    #[must_use]
275    pub fn new(config: PipelineConfig) -> Self {
276        Self {
277            quality_gate: QualityGate::new(config.quality_threshold),
278            defect_predictor: DefectPredictor::new(),
279            active_learner: ActiveLearner::new(config.num_clusters),
280            config,
281            stats: PipelineStats::default(),
282        }
283    }
284
285    /// Get configuration
286    #[must_use]
287    pub fn config(&self) -> &PipelineConfig {
288        &self.config
289    }
290
291    /// Get statistics
292    #[must_use]
293    pub fn stats(&self) -> &PipelineStats {
294        &self.stats
295    }
296
297    /// Stage 1: Filter by quality gate
298    pub fn filter_quality<'a>(
299        &mut self,
300        codes: &'a [GeneratedCode],
301    ) -> (Vec<&'a GeneratedCode>, StageResult) {
302        let start = std::time::Instant::now();
303        let input_count = codes.len();
304
305        let passed = self.quality_gate.filter_batch(codes);
306
307        let result = StageResult {
308            stage: "quality_gate".to_string(),
309            input_count,
310            output_count: passed.len(),
311            time_ms: start.elapsed().as_millis() as u64,
312        };
313
314        (passed, result)
315    }
316
317    /// Stage 2: Prioritize by defect likelihood
318    pub fn prioritize_defects<'a>(
319        &self,
320        codes: &'a [&GeneratedCode],
321    ) -> (Vec<&'a GeneratedCode>, StageResult) {
322        let start = std::time::Instant::now();
323        let input_count = codes.len();
324
325        if !self.config.enable_defect_priority {
326            return (
327                codes.to_vec(),
328                StageResult {
329                    stage: "defect_priority".to_string(),
330                    input_count,
331                    output_count: input_count,
332                    time_ms: start.elapsed().as_millis() as u64,
333                },
334            );
335        }
336
337        // Create feature/code pairs for prioritization
338        let pairs: Vec<(CommitFeatures, String)> = codes
339            .iter()
340            .map(|c| (CommitFeatures::default(), c.code.clone()))
341            .collect();
342
343        let order = self.defect_predictor.prioritize(&pairs);
344
345        // Take top batch_size samples
346        let output_count = order.len().min(self.config.batch_size);
347        let prioritized: Vec<&GeneratedCode> = order
348            .iter()
349            .take(output_count)
350            .filter_map(|&i| codes.get(i).copied())
351            .collect();
352
353        let result = StageResult {
354            stage: "defect_priority".to_string(),
355            input_count,
356            output_count: prioritized.len(),
357            time_ms: start.elapsed().as_millis() as u64,
358        };
359
360        (prioritized, result)
361    }
362
363    /// Stage 3: Sample via active learning
364    pub fn sample_active<'a>(
365        &mut self,
366        codes: &'a [&GeneratedCode],
367    ) -> (Vec<&'a GeneratedCode>, StageResult) {
368        let start = std::time::Instant::now();
369        let input_count = codes.len();
370
371        if !self.config.enable_active_learning || codes.is_empty() {
372            return (
373                codes.to_vec(),
374                StageResult {
375                    stage: "active_learning".to_string(),
376                    input_count,
377                    output_count: input_count,
378                    time_ms: start.elapsed().as_millis() as u64,
379                },
380            );
381        }
382
383        // Fit clusters
384        let code_strings: Vec<&str> = codes.iter().map(|c| c.code.as_str()).collect();
385        self.active_learner.fit(&code_strings);
386
387        // Select batch via Thompson Sampling
388        let batch_size = self.config.batch_size.min(codes.len());
389        let selected_indices = self.active_learner.select_batch(&code_strings, batch_size);
390
391        let selected: Vec<&GeneratedCode> = selected_indices
392            .iter()
393            .filter_map(|&i| codes.get(i).copied())
394            .collect();
395
396        let result = StageResult {
397            stage: "active_learning".to_string(),
398            input_count,
399            output_count: selected.len(),
400            time_ms: start.elapsed().as_millis() as u64,
401        };
402
403        (selected, result)
404    }
405
406    /// Prepare samples for oracle (all stages)
407    pub fn prepare(&mut self, codes: &[GeneratedCode]) -> (Vec<PreparedSample>, Vec<StageResult>) {
408        let mut stages = Vec::new();
409
410        // Stage 1: Quality Gate - clone to avoid borrow issues
411        let (quality_passed_refs, stage1) = self.filter_quality(codes);
412        let quality_passed: Vec<GeneratedCode> = quality_passed_refs.into_iter().cloned().collect();
413        stages.push(stage1);
414
415        if quality_passed.is_empty() {
416            return (vec![], stages);
417        }
418
419        // Stage 2: Defect Priority
420        let quality_refs: Vec<&GeneratedCode> = quality_passed.iter().collect();
421        let (prioritized_refs, stage2) = self.prioritize_defects(&quality_refs);
422        let prioritized: Vec<GeneratedCode> = prioritized_refs.into_iter().cloned().collect();
423        stages.push(stage2);
424
425        // Stage 3: Active Learning
426        let prioritized_refs: Vec<&GeneratedCode> = prioritized.iter().collect();
427        let (sampled_refs, stage3) = self.sample_active(&prioritized_refs);
428        let sampled: Vec<GeneratedCode> = sampled_refs.into_iter().cloned().collect();
429        stages.push(stage3);
430
431        // Create prepared samples - now we can use self freely
432        let prepared: Vec<PreparedSample> = sampled
433            .into_iter()
434            .enumerate()
435            .map(|(i, code)| {
436                let quality_score = self.quality_gate.score(
437                    &crate::ml::QualityFeatureExtractor::new().extract_from_generated(&code),
438                );
439
440                let defect_pred = self
441                    .defect_predictor
442                    .predict(&CommitFeatures::default(), &code.code);
443
444                let cluster = self.active_learner.get_cluster(&code.code);
445
446                PreparedSample {
447                    code,
448                    quality_score,
449                    defect_probability: defect_pred.base_probability,
450                    cluster,
451                    priority: i,
452                }
453            })
454            .collect();
455
456        (prepared, stages)
457    }
458
459    /// Update from oracle feedback
460    pub fn update_feedback(&mut self, code: &str, revealed_bug: bool) {
461        self.active_learner.update_feedback(code, revealed_bug);
462
463        if revealed_bug {
464            self.stats.bugs_found += 1;
465        }
466    }
467
468    /// Compute data quality metrics
469    #[must_use]
470    pub fn compute_quality(&self, labels: &[RichLabel]) -> DataQualityMetrics {
471        if labels.is_empty() {
472            return DataQualityMetrics::default();
473        }
474
475        // Bug rate
476        let bugs = labels.iter().filter(|l| !l.is_correct).count();
477        let bug_rate = bugs as f32 / labels.len() as f32;
478
479        // Diversity from active learner
480        let diversity = self.active_learner.silhouette_score().max(0.0);
481
482        // Difficulty based on error severity
483        let total_severity: f32 = labels
484            .iter()
485            .filter_map(|l| l.error_category)
486            .map(|c| c.severity())
487            .sum();
488        let difficulty = if bugs > 0 {
489            (total_severity / bugs as f32).min(1.0)
490        } else {
491            0.3
492        };
493
494        // Coverage estimate from soft labels
495        let avg_structural_sim: f32 = labels
496            .iter()
497            .map(|l| l.soft_labels.structural_similarity)
498            .sum::<f32>()
499            / labels.len() as f32;
500        let coverage = 1.0 - avg_structural_sim; // Less similarity = more coverage
501
502        // Novelty placeholder (would need historical comparison)
503        let novelty = 0.5;
504
505        DataQualityMetrics {
506            novelty,
507            diversity,
508            difficulty,
509            coverage,
510            bug_rate,
511        }
512    }
513
514    /// Run full pipeline (without actual oracle - for testing)
515    pub fn run_dry(&mut self, codes: &[GeneratedCode]) -> PipelineResult {
516        let total_generated = codes.len();
517
518        let (prepared, stages) = self.prepare(codes);
519
520        let oracle_calls = prepared.len();
521        let oracle_reduction = if oracle_calls > 0 {
522            total_generated as f32 / oracle_calls as f32
523        } else {
524            f32::INFINITY
525        };
526
527        // Update stats
528        self.stats.runs += 1;
529        self.stats.samples_processed += total_generated;
530        self.stats.oracle_calls += oracle_calls;
531
532        if self.stats.runs > 1 {
533            self.stats.avg_oracle_reduction =
534                (self.stats.avg_oracle_reduction * (self.stats.runs - 1) as f32 + oracle_reduction)
535                    / self.stats.runs as f32;
536        } else {
537            self.stats.avg_oracle_reduction = oracle_reduction;
538        }
539
540        PipelineResult {
541            stages,
542            labels: vec![], // No actual oracle calls in dry run
543            quality: DataQualityMetrics::default(),
544            total_generated,
545            oracle_calls,
546            oracle_reduction,
547        }
548    }
549
550    /// Reset pipeline state
551    pub fn reset(&mut self) {
552        self.quality_gate.reset_stats();
553        self.active_learner = ActiveLearner::new(self.config.num_clusters);
554    }
555}
556
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ml::ErrorCategory;
    use crate::Language;

    /// Four Python snippets of increasing structural depth.
    fn sample_codes() -> Vec<GeneratedCode> {
        let assignment = GeneratedCode {
            code: String::from("x = 1"),
            language: Language::Python,
            ast_depth: 1,
            features: vec![],
        };
        let function = GeneratedCode {
            code: String::from("def add(a, b):\n    return a + b"),
            language: Language::Python,
            ast_depth: 3,
            features: vec![String::from("function")],
        };
        let loop_with_branch = GeneratedCode {
            code: String::from("for i in range(10):\n    if i % 2 == 0:\n        print(i)"),
            language: Language::Python,
            ast_depth: 5,
            features: vec![String::from("loop"), String::from("conditional")],
        };
        let class = GeneratedCode {
            code: String::from("class Foo:\n    def __init__(self):\n        self.x = 0\n    def get(self):\n        return self.x"),
            language: Language::Python,
            ast_depth: 6,
            features: vec![String::from("class"), String::from("method")],
        };

        vec![assignment, function, loop_with_branch, class]
    }

    /// Shorthand for building a `StageResult` fixture.
    fn stage_fixture(
        name: &str,
        input_count: usize,
        output_count: usize,
        time_ms: u64,
    ) -> StageResult {
        StageResult {
            stage: name.to_string(),
            input_count,
            output_count,
            time_ms,
        }
    }

    // ========== PipelineConfig Tests ==========

    #[test]
    fn test_pipeline_config_default() {
        let PipelineConfig {
            quality_threshold,
            num_clusters,
            ..
        } = PipelineConfig::default();

        assert!((quality_threshold - 0.5).abs() < f32::EPSILON);
        assert_eq!(num_clusters, 5);
    }

    #[test]
    fn test_pipeline_config_strict() {
        let threshold = PipelineConfig::strict().quality_threshold;
        assert!((threshold - 0.7).abs() < f32::EPSILON);
    }

    #[test]
    fn test_pipeline_config_fast() {
        let fast = PipelineConfig::fast();
        assert!(!fast.enable_active_learning);
        assert!(!fast.enable_defect_priority);
    }

    #[test]
    fn test_pipeline_config_validate() {
        assert!(PipelineConfig::default().validate().is_empty());

        let broken = PipelineConfig {
            num_clusters: 0,
            quality_threshold: 1.5,
            ..Default::default()
        };
        assert!(!broken.validate().is_empty());
    }

    // ========== DataQualityMetrics Tests ==========

    #[test]
    fn test_data_quality_overall() {
        let score = DataQualityMetrics {
            novelty: 0.8,
            diversity: 0.7,
            difficulty: 0.6,
            coverage: 0.8,
            bug_rate: 0.2,
        }
        .overall();

        assert!(score > 0.0);
        assert!(score <= 1.0);
    }

    #[test]
    fn test_data_quality_meets_targets() {
        let on_target = DataQualityMetrics {
            diversity: 0.7,
            bug_rate: 0.2,
            coverage: 0.8,
            ..Default::default()
        };
        assert!(on_target.meets_targets());

        let all_zero = DataQualityMetrics::default();
        assert!(!all_zero.meets_targets());
    }

    // ========== StageResult Tests ==========

    #[test]
    fn test_stage_result_reduction() {
        let ten_to_one = stage_fixture("test", 100, 10, 50);

        assert!((ten_to_one.reduction_factor() - 10.0).abs() < 0.001);
        assert!((ten_to_one.pass_rate() - 0.1).abs() < 0.001);
    }

    #[test]
    fn test_stage_result_edge_cases() {
        let nothing_out = stage_fixture("test", 100, 0, 0);
        assert!(nothing_out.reduction_factor().is_infinite());

        let nothing_in = stage_fixture("test", 0, 0, 0);
        assert!(nothing_in.pass_rate().abs() < 0.001);
    }

    // ========== PipelineResult Tests ==========

    #[test]
    fn test_pipeline_result_stage_lookup() {
        let result = PipelineResult {
            stages: vec![
                stage_fixture("quality_gate", 100, 50, 10),
                stage_fixture("defect_priority", 50, 20, 5),
            ],
            ..Default::default()
        };

        assert!(result.stage("quality_gate").is_some());
        assert!(result.stage("nonexistent").is_none());
    }

    #[test]
    fn test_pipeline_result_total_time() {
        let result = PipelineResult {
            stages: vec![stage_fixture("a", 0, 0, 100), stage_fixture("b", 0, 0, 200)],
            ..Default::default()
        };

        assert_eq!(result.total_time_ms(), 300);
    }

    // ========== CodexPipeline Tests ==========

    #[test]
    fn test_codex_pipeline_new() {
        let fresh = CodexPipeline::default();
        assert_eq!(fresh.stats().runs, 0);
    }

    #[test]
    fn test_codex_pipeline_filter_quality() {
        let config = PipelineConfig {
            quality_threshold: 0.3,
            ..Default::default()
        };
        let mut pipeline = CodexPipeline::new(config);
        let codes = sample_codes();

        let (passed, record) = pipeline.filter_quality(&codes);

        assert!(passed.len() <= codes.len());
        assert_eq!(record.stage, "quality_gate");
        assert_eq!(record.input_count, codes.len());
    }

    #[test]
    fn test_codex_pipeline_prioritize_defects() {
        let codes = sample_codes();
        let refs: Vec<&GeneratedCode> = codes.iter().collect();
        let pipeline = CodexPipeline::default();

        let (prioritized, record) = pipeline.prioritize_defects(&refs);

        assert!(!prioritized.is_empty());
        assert_eq!(record.stage, "defect_priority");
    }

    #[test]
    fn test_codex_pipeline_sample_active() {
        let config = PipelineConfig {
            batch_size: 2,
            ..Default::default()
        };
        let mut pipeline = CodexPipeline::new(config);
        let codes = sample_codes();
        let refs: Vec<&GeneratedCode> = codes.iter().collect();

        let (sampled, record) = pipeline.sample_active(&refs);

        assert!(sampled.len() <= 2);
        assert_eq!(record.stage, "active_learning");
    }

    #[test]
    fn test_codex_pipeline_prepare() {
        let config = PipelineConfig {
            quality_threshold: 0.2,
            batch_size: 10,
            ..Default::default()
        };
        let mut pipeline = CodexPipeline::new(config);
        let codes = sample_codes();

        let (prepared, stages) = pipeline.prepare(&codes);

        assert!(!prepared.is_empty());
        assert_eq!(stages.len(), 3); // quality, defect, active
    }

    #[test]
    fn test_codex_pipeline_run_dry() {
        let config = PipelineConfig {
            quality_threshold: 0.2,
            ..Default::default()
        };
        let mut pipeline = CodexPipeline::new(config);
        let codes = sample_codes();

        let outcome = pipeline.run_dry(&codes);

        assert_eq!(outcome.total_generated, codes.len());
        assert!(outcome.oracle_calls <= codes.len());
        assert!(outcome.oracle_reduction >= 1.0);
    }

    #[test]
    fn test_codex_pipeline_update_feedback() {
        let mut pipeline = CodexPipeline::default();

        // The learner has to be fitted before it can take feedback
        let codes = sample_codes();
        let refs: Vec<&GeneratedCode> = codes.iter().collect();
        let _ = pipeline.sample_active(&refs);

        // A revealed bug bumps the counter...
        pipeline.update_feedback("def add(a, b): return a + b", true);
        assert_eq!(pipeline.stats().bugs_found, 1);

        // ...a clean result leaves it alone
        pipeline.update_feedback("x = 1", false);
        assert_eq!(pipeline.stats().bugs_found, 1);
    }

    #[test]
    fn test_codex_pipeline_compute_quality() {
        let correct = RichLabel::correct(crate::ml::SoftLabels::default());
        let incorrect = RichLabel::incorrect(
            ErrorCategory::TypeMismatch,
            "error".to_string(),
            crate::ml::SoftLabels::default(),
        );
        let labels = vec![correct, incorrect];

        let quality = CodexPipeline::default().compute_quality(&labels);

        assert!((quality.bug_rate - 0.5).abs() < 0.001);
    }

    #[test]
    fn test_codex_pipeline_reset() {
        let mut pipeline = CodexPipeline::default();
        let codes = sample_codes();
        let _ = pipeline.run_dry(&codes);

        pipeline.reset();

        // Stats should remain, but internal state reset
        assert_eq!(pipeline.stats().runs, 1);
    }

    // ========== Debug Tests ==========

    #[test]
    fn test_pipeline_config_debug() {
        let rendered = format!("{:?}", PipelineConfig::default());
        assert!(rendered.contains("PipelineConfig"));
    }

    #[test]
    fn test_data_quality_metrics_debug() {
        let rendered = format!("{:?}", DataQualityMetrics::default());
        assert!(rendered.contains("DataQualityMetrics"));
    }

    #[test]
    fn test_codex_pipeline_debug() {
        let rendered = format!("{:?}", CodexPipeline::default());
        assert!(rendered.contains("CodexPipeline"));
    }

    // ========== Serialization Tests ==========

    #[test]
    fn test_pipeline_config_serialize() {
        let original = PipelineConfig::default();

        let json = serde_json::to_string(&original).unwrap();
        let restored: PipelineConfig = serde_json::from_str(&json).unwrap();

        let drift = (original.quality_threshold - restored.quality_threshold).abs();
        assert!(drift < f32::EPSILON);
    }

    #[test]
    fn test_data_quality_metrics_serialize() {
        let original = DataQualityMetrics {
            novelty: 0.5,
            diversity: 0.6,
            difficulty: 0.7,
            coverage: 0.8,
            bug_rate: 0.15,
        };

        let json = serde_json::to_string(&original).unwrap();
        let restored: DataQualityMetrics = serde_json::from_str(&json).unwrap();

        assert!((original.diversity - restored.diversity).abs() < 0.001);
    }

    // ========== Integration Tests ==========

    #[test]
    fn test_full_pipeline_flow() {
        let config = PipelineConfig {
            quality_threshold: 0.2, // Low threshold to pass more
            batch_size: 10,
            ..Default::default()
        };
        let mut pipeline = CodexPipeline::new(config);

        // Run dry (no actual oracle) over the sample snippets
        let result = pipeline.run_dry(&sample_codes());

        // Every stage must have been recorded
        assert_eq!(result.stages.len(), 3);
        for name in ["quality_gate", "defect_priority", "active_learning"] {
            assert!(result.stage(name).is_some());
        }

        // Verify reduction
        assert!(result.oracle_reduction >= 1.0);
    }

    #[test]
    fn test_pipeline_oracle_reduction() {
        let config = PipelineConfig {
            quality_threshold: 0.6, // Higher threshold for more filtering
            batch_size: 2,          // Small batch
            ..Default::default()
        };
        let mut pipeline = CodexPipeline::new(config);

        // Generate many codes
        let codes: Vec<GeneratedCode> = (0..100)
            .map(|i| GeneratedCode {
                code: format!("x_{i} = {i}"),
                language: Language::Python,
                ast_depth: 1,
                features: vec![],
            })
            .collect();

        let result = pipeline.run_dry(&codes);

        // Should have significant reduction
        assert!(result.oracle_calls <= 20); // Much less than 100
    }
}
965
/// Property-based tests
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    proptest! {
        /// `validate` accepts a quality threshold exactly when it lies in [0.0, 1.0]
        #[test]
        fn prop_quality_threshold_valid(threshold in -0.5f32..1.5) {
            let config = PipelineConfig {
                quality_threshold: threshold,
                ..Default::default()
            };
            // Every other field is a valid default, so only the threshold
            // can produce an error.
            let in_range = (0.0..=1.0).contains(&threshold);
            prop_assert_eq!(config.validate().is_empty(), in_range);
        }

        /// Reduction is >= 1 when output does not exceed input (infinite for zero output)
        #[test]
        fn prop_oracle_reduction_bounded(total in 1usize..1000, calls in 0usize..1000) {
            // A stage never emits more than it was given.
            let calls = calls.min(total);
            let stage = StageResult {
                stage: "oracle".to_string(),
                input_count: total,
                output_count: calls,
                time_ms: 0,
            };

            let reduction = stage.reduction_factor();

            if calls == 0 {
                prop_assert!(reduction.is_infinite());
            } else {
                prop_assert!(reduction >= 1.0);
                prop_assert!(reduction <= total as f32);
            }
        }

        /// Overall quality score is bounded [0, 1]
        #[test]
        fn prop_quality_overall_bounded(
            novelty in 0.0f32..=1.0,
            diversity in 0.0f32..=1.0,
            difficulty in 0.0f32..=1.0,
            coverage in 0.0f32..=1.0,
            bug_rate in 0.0f32..=1.0,
        ) {
            let metrics = DataQualityMetrics {
                novelty,
                diversity,
                difficulty,
                coverage,
                bug_rate,
            };

            let overall = metrics.overall();
            prop_assert!(overall >= 0.0);
            prop_assert!(overall <= 1.0);
        }
    }
}