Skip to main content

verificar/data/
mod.rs

1//! Data pipeline for storing verified test cases
2//!
3//! This module handles the storage and retrieval of verified
4//! (source, target, correctness) tuples in Parquet format.
5//!
6//! # Features
7//!
8//! - Large-scale parallel generation with progress tracking
9//! - Automatic Parquet sharding for large datasets
10//! - Support for all sampling strategies
11
12#[cfg(feature = "parquet")]
13pub mod parquet;
14
15pub mod corpus;
16pub mod pipeline;
17
18pub use corpus::{CorpusFormat, CorpusManager, CorpusMetadata, TrainingCorpus};
19pub use pipeline::{DataPipeline, PipelineConfig, PipelineStats, PipelineStrategy};
20
21use serde::{Deserialize, Serialize};
22use uuid::Uuid;
23
24use crate::generator::GeneratedCode;
25use crate::mutator::MutationOperator;
26use crate::oracle::VerificationResult;
27use crate::Language;
28
29/// Verified transpilation tuple for ML training
30///
31/// Represents a successful transpilation with both source and target code.
32/// Used for LLM fine-tuning with entrenar.
33#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct VerifiedTuple {
35    /// Source language
36    pub source_language: Language,
37    /// Target language
38    pub target_language: Language,
39    /// Original source code
40    pub source_code: String,
41    /// Transpiled target code
42    pub target_code: String,
43    /// Whether the transpilation was verified correct (I/O equivalent)
44    pub is_correct: bool,
45    /// Execution time in milliseconds
46    pub execution_time_ms: u64,
47}
48
49impl VerifiedTuple {
50    /// Create from a test case (only if transpilation succeeded)
51    #[must_use]
52    pub fn from_test_case(test_case: &TestCase) -> Option<Self> {
53        let target_code = test_case.target_code.as_ref()?;
54        Some(Self {
55            source_language: test_case.source_language,
56            target_language: test_case.target_language,
57            source_code: test_case.source_code.clone(),
58            target_code: target_code.clone(),
59            is_correct: matches!(test_case.result, TestResult::Pass),
60            execution_time_ms: 0, // Not tracked in TestCase
61        })
62    }
63}
64
65/// Test case with full metadata
66///
67/// From spec Section 8.1: Generated test case schema.
68#[derive(Debug, Clone, Serialize, Deserialize)]
69pub struct TestCase {
70    /// Unique identifier
71    pub id: Uuid,
72
73    /// Source language
74    pub source_language: Language,
75
76    /// Source code
77    pub source_code: String,
78
79    /// Target language
80    pub target_language: Language,
81
82    /// Transpiled code (if successful)
83    pub target_code: Option<String>,
84
85    /// Verification result
86    pub result: TestResult,
87
88    /// Features for ML
89    pub features: CodeFeatures,
90
91    /// Generation metadata
92    pub metadata: GenerationMetadata,
93}
94
95/// Test result enum
96#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
97pub enum TestResult {
98    /// I/O equivalent
99    Pass,
100    /// Transpilation failed
101    TranspileError(String),
102    /// Output mismatch
103    OutputMismatch {
104        /// Expected output
105        expected: String,
106        /// Actual output
107        actual: String,
108    },
109    /// Timeout
110    Timeout {
111        /// Timeout limit in milliseconds
112        limit_ms: u64,
113    },
114    /// Runtime error
115    RuntimeError {
116        /// Phase where error occurred
117        phase: String,
118        /// Error message
119        error: String,
120    },
121}
122
123/// Features extracted from source code for ML
124#[derive(Debug, Clone, Default, Serialize, Deserialize)]
125pub struct CodeFeatures {
126    /// AST depth
127    pub ast_depth: u32,
128    /// Number of operators
129    pub num_operators: u32,
130    /// Number of control flow statements
131    pub num_control_flow: u32,
132    /// Cyclomatic complexity
133    pub cyclomatic_complexity: f32,
134    /// Number of type coercions
135    pub num_type_coercions: u32,
136    /// Uses edge values (0, -1, MAX_INT, etc.)
137    pub uses_edge_values: bool,
138}
139
140/// Metadata about how the test case was generated
141#[derive(Debug, Clone, Serialize, Deserialize)]
142pub struct GenerationMetadata {
143    /// Generation strategy used
144    pub strategy: String,
145    /// Mutation operators applied
146    pub mutation_operators: Vec<String>,
147    /// Timestamp
148    pub timestamp: String,
149    /// Transpiler version
150    pub transpiler_version: String,
151}
152
153impl TestCase {
154    /// Create a new test case from generation and verification results
155    #[must_use]
156    pub fn new(
157        generated: &GeneratedCode,
158        verification: &VerificationResult,
159        transpiler_version: &str,
160    ) -> Self {
161        Self {
162            id: Uuid::new_v4(),
163            source_language: generated.language,
164            source_code: generated.code.clone(),
165            target_language: verification.target_language,
166            target_code: Some(verification.target_code.clone()),
167            result: TestResult::from_verification(verification),
168            features: CodeFeatures {
169                ast_depth: generated.ast_depth as u32,
170                ..Default::default()
171            },
172            metadata: GenerationMetadata {
173                strategy: "unknown".to_string(),
174                mutation_operators: vec![],
175                timestamp: chrono_lite_timestamp(),
176                transpiler_version: transpiler_version.to_string(),
177            },
178        }
179    }
180}
181
182impl TestResult {
183    /// Convert from verification result
184    fn from_verification(verification: &VerificationResult) -> Self {
185        match &verification.verdict {
186            crate::oracle::Verdict::Pass => Self::Pass,
187            crate::oracle::Verdict::OutputMismatch { expected, actual } => Self::OutputMismatch {
188                expected: expected.clone(),
189                actual: actual.clone(),
190            },
191            crate::oracle::Verdict::Timeout { limit_ms, .. } => Self::Timeout {
192                limit_ms: *limit_ms,
193            },
194            crate::oracle::Verdict::RuntimeError { phase, error } => Self::RuntimeError {
195                phase: phase.to_string(),
196                error: error.clone(),
197            },
198        }
199    }
200}
201
/// Simple timestamp without chrono dependency.
///
/// Returns the current time as whole seconds since the Unix epoch, formatted
/// as a decimal string. If the system clock reports a time before the epoch,
/// the duration falls back to its default (zero), so the result is `"0"`
/// rather than an error.
fn chrono_lite_timestamp() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};

    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs()
        .to_string()
}
210
211/// Builder for test cases with mutations
212#[derive(Debug, Default)]
213pub struct TestCaseBuilder {
214    source_code: Option<String>,
215    source_language: Option<Language>,
216    mutation_operators: Vec<MutationOperator>,
217    strategy: Option<String>,
218}
219
220impl TestCaseBuilder {
221    /// Create a new builder
222    #[must_use]
223    pub fn new() -> Self {
224        Self::default()
225    }
226
227    /// Set the source code
228    #[must_use]
229    pub fn source_code(mut self, code: impl Into<String>) -> Self {
230        self.source_code = Some(code.into());
231        self
232    }
233
234    /// Set the source language
235    #[must_use]
236    pub fn source_language(mut self, language: Language) -> Self {
237        self.source_language = Some(language);
238        self
239    }
240
241    /// Add a mutation operator
242    #[must_use]
243    pub fn mutation_operator(mut self, operator: MutationOperator) -> Self {
244        self.mutation_operators.push(operator);
245        self
246    }
247
248    /// Set the generation strategy
249    #[must_use]
250    pub fn strategy(mut self, strategy: impl Into<String>) -> Self {
251        self.strategy = Some(strategy.into());
252        self
253    }
254}
255
#[cfg(test)]
mod tests {
    use super::*;
    use crate::oracle::{Phase, Verdict};

    /// Build a Python -> Rust verification result with the given code and verdict.
    fn make_verification(source: &str, target: &str, verdict: Verdict) -> VerificationResult {
        VerificationResult {
            source_code: source.to_string(),
            source_language: Language::Python,
            target_code: target.to_string(),
            target_language: Language::Rust,
            verdict,
            source_result: None,
            target_result: None,
        }
    }

    /// Build a Python snippet of AST depth 1 with no recorded features.
    fn make_generated(code: &str) -> GeneratedCode {
        GeneratedCode {
            code: code.to_string(),
            language: Language::Python,
            ast_depth: 1,
            features: vec![],
        }
    }

    /// Build a passing `x = 1` test case for the tests that only need "some" case.
    fn make_passing_test_case() -> TestCase {
        TestCase::new(
            &make_generated("x = 1"),
            &make_verification("x = 1", "let x = 1;", Verdict::Pass),
            "0.1.0",
        )
    }

    #[test]
    fn test_test_result_pass() {
        let result = TestResult::Pass;
        assert_eq!(result, TestResult::Pass);
    }

    #[test]
    fn test_test_result_transpile_error() {
        let result = TestResult::TranspileError("syntax error".to_string());
        assert!(matches!(result, TestResult::TranspileError(_)));
    }

    #[test]
    fn test_test_result_output_mismatch() {
        let result = TestResult::OutputMismatch {
            expected: "hello".to_string(),
            actual: "world".to_string(),
        };
        assert!(matches!(result, TestResult::OutputMismatch { .. }));
    }

    #[test]
    fn test_test_result_timeout() {
        let result = TestResult::Timeout { limit_ms: 5000 };
        if let TestResult::Timeout { limit_ms } = result {
            assert_eq!(limit_ms, 5000);
        } else {
            panic!("Expected Timeout");
        }
    }

    #[test]
    fn test_test_result_runtime_error() {
        let result = TestResult::RuntimeError {
            phase: "source".to_string(),
            error: "division by zero".to_string(),
        };
        assert!(matches!(result, TestResult::RuntimeError { .. }));
    }

    #[test]
    fn test_code_features_default() {
        let features = CodeFeatures::default();
        assert_eq!(features.ast_depth, 0);
        assert_eq!(features.num_operators, 0);
        assert_eq!(features.num_control_flow, 0);
        assert!((features.cyclomatic_complexity - 0.0).abs() < f32::EPSILON);
        assert_eq!(features.num_type_coercions, 0);
        assert!(!features.uses_edge_values);
    }

    #[test]
    fn test_code_features_custom() {
        let features = CodeFeatures {
            ast_depth: 5,
            num_operators: 10,
            num_control_flow: 3,
            cyclomatic_complexity: 4.5,
            num_type_coercions: 2,
            uses_edge_values: true,
        };
        assert_eq!(features.ast_depth, 5);
        assert!(features.uses_edge_values);
    }

    #[test]
    fn test_test_case_builder() {
        let builder = TestCaseBuilder::new()
            .source_code("x = 1")
            .source_language(Language::Python)
            .mutation_operator(MutationOperator::Aor)
            .strategy("exhaustive");

        assert_eq!(builder.source_code, Some("x = 1".to_string()));
        assert_eq!(builder.source_language, Some(Language::Python));
        assert_eq!(builder.mutation_operators.len(), 1);
        assert_eq!(builder.strategy, Some("exhaustive".to_string()));
    }

    #[test]
    fn test_test_case_builder_multiple_operators() {
        let builder = TestCaseBuilder::new()
            .mutation_operator(MutationOperator::Aor)
            .mutation_operator(MutationOperator::Ror)
            .mutation_operator(MutationOperator::Lor);

        assert_eq!(builder.mutation_operators.len(), 3);
    }

    #[test]
    fn test_chrono_lite_timestamp() {
        let ts = chrono_lite_timestamp();
        // Should be a numeric string
        assert!(!ts.is_empty());
        assert!(ts.parse::<u64>().is_ok());
    }

    #[test]
    fn test_generation_metadata_debug() {
        let metadata = GenerationMetadata {
            strategy: "exhaustive".to_string(),
            mutation_operators: vec!["AOR".to_string()],
            timestamp: "123456".to_string(),
            transpiler_version: "0.1.0".to_string(),
        };
        let debug = format!("{:?}", metadata);
        assert!(debug.contains("exhaustive"));
    }

    #[test]
    fn test_test_result_from_verdict_pass() {
        let verification = make_verification("print(1)", "fn main() {}", Verdict::Pass);
        let result = TestResult::from_verification(&verification);
        assert_eq!(result, TestResult::Pass);
    }

    #[test]
    fn test_test_result_from_verdict_mismatch() {
        let verification = make_verification(
            "print(1)",
            "fn main() {}",
            Verdict::OutputMismatch {
                expected: "1".to_string(),
                actual: "2".to_string(),
            },
        );
        let result = TestResult::from_verification(&verification);
        // Both payload strings must be carried over unchanged.
        assert_eq!(
            result,
            TestResult::OutputMismatch {
                expected: "1".to_string(),
                actual: "2".to_string(),
            }
        );
    }

    #[test]
    fn test_test_result_from_verdict_timeout() {
        let verification = make_verification(
            "while True: pass",
            "loop {}",
            Verdict::Timeout {
                phase: Phase::Source,
                limit_ms: 5000,
            },
        );
        let result = TestResult::from_verification(&verification);
        assert_eq!(result, TestResult::Timeout { limit_ms: 5000 });
    }

    #[test]
    fn test_test_result_from_verdict_runtime_error() {
        let verification = make_verification(
            "1/0",
            "panic!()",
            Verdict::RuntimeError {
                phase: Phase::Source,
                error: "division by zero".to_string(),
            },
        );
        let result = TestResult::from_verification(&verification);
        // The textual form of the phase is owned by the oracle module, so only
        // the error message is pinned here.
        match result {
            TestResult::RuntimeError { error, .. } => assert_eq!(error, "division by zero"),
            other => panic!("Expected RuntimeError, got {:?}", other),
        }
    }

    #[test]
    fn test_test_case_new() {
        let generated = GeneratedCode {
            features: vec!["print".to_string()],
            ..make_generated("print(1)")
        };
        let verification = make_verification(
            "print(1)",
            "fn main() { println!(\"1\"); }",
            Verdict::Pass,
        );

        let test_case = TestCase::new(&generated, &verification, "0.1.0");

        assert_eq!(test_case.source_language, Language::Python);
        assert_eq!(test_case.target_language, Language::Rust);
        assert_eq!(test_case.source_code, "print(1)");
        assert_eq!(
            test_case.target_code.as_deref(),
            Some("fn main() { println!(\"1\"); }")
        );
        assert_eq!(test_case.result, TestResult::Pass);
        assert_eq!(test_case.features.ast_depth, 1);
        assert_eq!(test_case.metadata.strategy, "unknown");
        assert!(test_case.metadata.mutation_operators.is_empty());
        assert_eq!(test_case.metadata.transpiler_version, "0.1.0");
    }

    #[test]
    fn test_test_case_debug() {
        let test_case = make_passing_test_case();
        let debug = format!("{:?}", test_case);
        assert!(debug.contains("TestCase"));
    }

    #[test]
    fn test_test_case_clone() {
        let test_case = make_passing_test_case();
        let cloned = test_case.clone();
        assert_eq!(cloned.source_code, test_case.source_code);
    }

    #[test]
    fn test_verified_tuple_from_passing_test_case() {
        let test_case = make_passing_test_case();

        let tuple = VerifiedTuple::from_test_case(&test_case).expect("target code is present");

        assert_eq!(tuple.source_language, Language::Python);
        assert_eq!(tuple.target_language, Language::Rust);
        assert_eq!(tuple.source_code, "x = 1");
        assert_eq!(tuple.target_code, "let x = 1;");
        assert!(tuple.is_correct);
        assert_eq!(tuple.execution_time_ms, 0);
    }

    #[test]
    fn test_verified_tuple_from_failing_test_case() {
        let test_case = TestCase::new(
            &make_generated("print(1)"),
            &make_verification(
                "print(1)",
                "fn main() {}",
                Verdict::OutputMismatch {
                    expected: "1".to_string(),
                    actual: "".to_string(),
                },
            ),
            "0.1.0",
        );

        let tuple = VerifiedTuple::from_test_case(&test_case).expect("target code is present");

        // A tuple is still produced, but it is flagged as incorrect.
        assert!(!tuple.is_correct);
    }

    #[test]
    fn test_verified_tuple_requires_target_code() {
        let mut test_case = make_passing_test_case();
        test_case.target_code = None;

        assert!(VerifiedTuple::from_test_case(&test_case).is_none());
    }
}
513        let test_case = TestCase::new(&generated, &verification, "0.1.0");
514        let cloned = test_case.clone();
515        assert_eq!(cloned.source_code, test_case.source_code);
516    }
517}