use crate::bootstrap::{get_bootstrap_patterns, BootstrapPattern};
use crate::error::OracleError;
use serde::{Deserialize, Serialize};
use std::path::Path;
/// One error→fix record of the exported dataset.
///
/// Serialized with serde; `export_jsonl` writes one JSON object of this shape
/// per line.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrainingExample {
/// Rust compiler error code (e.g., "E0308").
pub error_code: String,
/// Fix strategy category (e.g., "type_coercion").
pub decision: String,
/// Diff text of the fix; the ChatML and Alpaca exporters split it into
/// lines starting with `-` and lines starting with `+`.
pub fix_diff: String,
/// Human-readable explanation of the pattern.
pub description: String,
/// Pattern origin; `from_bootstrap` sets this to "bootstrap".
pub source: String,
/// Whether the fix is flagged as verified.
pub verified: bool,
/// Number of times the pattern succeeded.
pub success_count: u32,
/// Number of times the pattern failed.
pub failure_count: u32,
}
impl TrainingExample {
    /// Builds a training example from a bootstrap pattern.
    ///
    /// The four textual fields are copied from `pattern`. The result is tagged
    /// with the source `"bootstrap"`, marked as verified, and starts with both
    /// outcome counters at zero.
    pub fn from_bootstrap(pattern: &BootstrapPattern) -> Self {
        let error_code = pattern.error_code.to_string();
        let decision = pattern.decision.to_string();
        let fix_diff = pattern.fix_diff.to_string();
        let description = pattern.description.to_string();

        Self {
            error_code,
            decision,
            fix_diff,
            description,
            source: String::from("bootstrap"),
            verified: true,
            success_count: 0,
            failure_count: 0,
        }
    }
}
/// A single message of a ChatML-style conversation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatMLMessage {
/// Speaker of the message; `export_chatml` emits "user" and "assistant".
pub role: String,
/// Text of the message.
pub content: String,
}
/// A conversation as written by `export_chatml`: one JSON object per line.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatMLConversation {
/// Messages in order; `export_chatml` emits a user message followed by an
/// assistant message.
pub messages: Vec<ChatMLMessage>,
}
/// An instruction/input/output record as written by `export_alpaca`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AlpacaExample {
/// Task statement; `export_alpaca` builds it from the error code and
/// description.
pub instruction: String,
/// Input text; `export_alpaca` fills it from the `-` lines of the diff.
pub input: String,
/// Expected answer; `export_alpaca` fills it from the `+` lines of the diff.
pub output: String,
}
/// Names the output formats offered by `DatasetExporter`.
///
/// NOTE(review): nothing in this file dispatches on this enum; presumably
/// callers elsewhere map it to the `export_*` methods — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExportFormat {
/// JSON Lines of `TrainingExample` records (see `export_jsonl`).
Jsonl,
/// JSON Lines of `ChatMLConversation` records (see `export_chatml`).
ChatML,
/// JSON Lines of `AlpacaExample` records (see `export_alpaca`).
Alpaca,
/// Parquet file (see `export_parquet`; needs the `dataset` feature).
Parquet,
}
/// Holds a list of training examples and exports them in several formats.
pub struct DatasetExporter {
// Examples in insertion order; every export writes them in this order.
examples: Vec<TrainingExample>,
}
impl DatasetExporter {
pub fn new() -> Self {
let bootstrap_patterns = get_bootstrap_patterns();
let examples = bootstrap_patterns.iter().map(TrainingExample::from_bootstrap).collect();
Self { examples }
}
/// Creates an exporter that contains no examples.
pub fn empty() -> Self {
    let examples = Vec::new();
    Self { examples }
}
pub fn add_example(&mut self, example: TrainingExample) {
self.examples.push(example);
}
/// Returns the stored examples in insertion order.
pub fn examples(&self) -> &[TrainingExample] {
    self.examples.as_slice()
}
pub fn len(&self) -> usize {
self.examples.len()
}
pub fn is_empty(&self) -> bool {
self.examples.is_empty()
}
pub fn export_jsonl(&self, path: impl AsRef<Path>) -> Result<usize, OracleError> {
let path = path.as_ref();
let mut output = String::new();
for example in &self.examples {
let json = serde_json::to_string(example).map_err(|e| {
OracleError::ExportError(format!("Failed to serialize example: {}", e))
})?;
output.push_str(&json);
output.push('\n');
}
std::fs::write(path, &output).map_err(|e| {
OracleError::ExportError(format!("Failed to write JSONL to {}: {}", path.display(), e))
})?;
Ok(self.examples.len())
}
pub fn export_chatml(&self, path: impl AsRef<Path>) -> Result<usize, OracleError> {
let path = path.as_ref();
let mut output = String::new();
for example in &self.examples {
let conversation = ChatMLConversation {
messages: vec![
ChatMLMessage {
role: "user".to_string(),
content: format!(
"Fix the following Rust compilation error:\n\nError: {} - {}\n\nContext:\n{}",
example.error_code, example.description, example.fix_diff.lines().filter(|l| l.starts_with('-')).collect::<Vec<_>>().join("\n")
),
},
ChatMLMessage {
role: "assistant".to_string(),
content: format!(
"Apply this fix ({}):\n\n{}",
example.decision,
example.fix_diff.lines().filter(|l| l.starts_with('+')).collect::<Vec<_>>().join("\n")
),
},
],
};
let json = serde_json::to_string(&conversation).map_err(|e| {
OracleError::ExportError(format!("Failed to serialize ChatML: {}", e))
})?;
output.push_str(&json);
output.push('\n');
}
std::fs::write(path, &output).map_err(|e| {
OracleError::ExportError(format!("Failed to write ChatML to {}: {}", path.display(), e))
})?;
Ok(self.examples.len())
}
/// Writes every example to `path` as an Alpaca-style record
/// (`instruction` / `input` / `output`), one JSON object per line.
///
/// `input` is built from the diff lines that start with `-` and `output`
/// from the lines that start with `+`. Exactly one leading marker is
/// removed from each line and the remainder is trimmed, so content that
/// itself begins with `-` or `+` (for example `-1` or `-> T`) is kept
/// intact.
///
/// Returns the number of examples written.
///
/// # Errors
///
/// Returns `OracleError::ExportError` when a record cannot be serialized
/// or the file cannot be written.
pub fn export_alpaca(&self, path: impl AsRef<Path>) -> Result<usize, OracleError> {
    // Collects the lines of `diff` that start with `marker`, without that
    // single marker character and without surrounding whitespace.
    // `strip_prefix` removes only ONE marker; `trim_start_matches` would
    // also eat marker characters that belong to the code itself.
    fn diff_side(diff: &str, marker: char) -> String {
        diff.lines()
            .filter_map(|line| line.strip_prefix(marker))
            .map(str::trim)
            .collect::<Vec<_>>()
            .join("\n")
    }

    let path = path.as_ref();
    let mut output = String::new();
    for example in &self.examples {
        let alpaca = AlpacaExample {
            instruction: format!(
                "Fix the Rust compilation error {} ({}).",
                example.error_code, example.description
            ),
            input: diff_side(&example.fix_diff, '-'),
            output: diff_side(&example.fix_diff, '+'),
        };
        let json = serde_json::to_string(&alpaca).map_err(|e| {
            OracleError::ExportError(format!("Failed to serialize Alpaca: {}", e))
        })?;
        output.push_str(&json);
        output.push('\n');
    }
    std::fs::write(path, &output).map_err(|e| {
        OracleError::ExportError(format!(
            "Failed to write Alpaca to {}: {}",
            path.display(),
            e
        ))
    })?;
    Ok(self.examples.len())
}
/// Writes all examples to `path` as a Parquet file (only compiled with the
/// `dataset` feature).
///
/// Builds one Arrow column per `TrainingExample` field, packs the columns
/// into a single `RecordBatch`, wraps it in an `ArrowDataset` and calls
/// `to_parquet`. Returns the number of examples on success.
///
/// # Errors
///
/// Returns `OracleError::ExportError` when the record batch or dataset
/// cannot be created, or when writing the Parquet file fails.
#[cfg(feature = "dataset")]
pub fn export_parquet(&self, path: impl AsRef<Path>) -> Result<usize, OracleError> {
use alimentar::ArrowDataset;
use arrow::array::{BooleanArray, StringArray, UInt32Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use std::sync::Arc;
let path = path.as_ref();
// One column per field. Every value is wrapped in `Some`, so no column
// contains nulls.
let error_codes: StringArray =
self.examples.iter().map(|e| Some(e.error_code.as_str())).collect();
let decisions: StringArray =
self.examples.iter().map(|e| Some(e.decision.as_str())).collect();
let fix_diffs: StringArray =
self.examples.iter().map(|e| Some(e.fix_diff.as_str())).collect();
let descriptions: StringArray =
self.examples.iter().map(|e| Some(e.description.as_str())).collect();
let sources: StringArray = self.examples.iter().map(|e| Some(e.source.as_str())).collect();
let verified: BooleanArray = self.examples.iter().map(|e| Some(e.verified)).collect();
let success_counts: UInt32Array =
self.examples.iter().map(|e| Some(e.success_count)).collect();
let failure_counts: UInt32Array =
self.examples.iter().map(|e| Some(e.failure_count)).collect();
// Every field is declared with `false` as the third argument
// (presumably Arrow's `nullable` flag — confirm against the arrow docs).
let schema = Arc::new(Schema::new(vec![
Field::new("error_code", DataType::Utf8, false),
Field::new("decision", DataType::Utf8, false),
Field::new("fix_diff", DataType::Utf8, false),
Field::new("description", DataType::Utf8, false),
Field::new("source", DataType::Utf8, false),
Field::new("verified", DataType::Boolean, false),
Field::new("success_count", DataType::UInt32, false),
Field::new("failure_count", DataType::UInt32, false),
]));
// The columns below are listed in the same order as the schema fields.
let batch = RecordBatch::try_new(
schema,
vec![
Arc::new(error_codes),
Arc::new(decisions),
Arc::new(fix_diffs),
Arc::new(descriptions),
Arc::new(sources),
Arc::new(verified),
Arc::new(success_counts),
Arc::new(failure_counts),
],
)
.map_err(|e| OracleError::ExportError(format!("Failed to create RecordBatch: {}", e)))?;
let dataset = ArrowDataset::from_batch(batch).map_err(|e| {
OracleError::ExportError(format!("Failed to create Arrow dataset: {}", e))
})?;
dataset
.to_parquet(path)
.map_err(|e| OracleError::ExportError(format!("Failed to write Parquet: {}", e)))?;
Ok(self.examples.len())
}
/// Fallback used when the crate is built without the `dataset` feature.
///
/// # Errors
///
/// Always returns `OracleError::ExportError` explaining that Parquet export
/// needs the `dataset` feature; `_path` is ignored.
#[cfg(not(feature = "dataset"))]
pub fn export_parquet(&self, _path: impl AsRef<Path>) -> Result<usize, OracleError> {
    let message =
        String::from("Parquet export requires the 'dataset' feature. Build with --features dataset");
    Err(OracleError::ExportError(message))
}
/// Computes summary counts for the stored examples: the total, the number
/// flagged as verified, and per-value counts for error code, decision and
/// source.
pub fn stats(&self) -> DatasetStats {
    use std::collections::HashMap;

    let mut by_error_code: HashMap<String, usize> = HashMap::new();
    let mut by_decision: HashMap<String, usize> = HashMap::new();
    let mut by_source: HashMap<String, usize> = HashMap::new();

    for record in self.examples.iter() {
        *by_error_code.entry(record.error_code.clone()).or_insert(0) += 1;
        *by_decision.entry(record.decision.clone()).or_insert(0) += 1;
        *by_source.entry(record.source.clone()).or_insert(0) += 1;
    }

    let verified = self.examples.iter().filter(|record| record.verified).count();

    DatasetStats {
        total: self.examples.len(),
        verified,
        by_error_code,
        by_decision,
        by_source,
    }
}
}
impl Default for DatasetExporter {
fn default() -> Self {
Self::new()
}
}
/// Summary counts describing a dataset.
#[derive(Debug, Clone)]
pub struct DatasetStats {
    /// Total number of examples.
    pub total: usize,
    /// Number of examples flagged as verified.
    pub verified: usize,
    /// Number of examples per error code.
    pub by_error_code: std::collections::HashMap<String, usize>,
    /// Number of examples per decision.
    pub by_decision: std::collections::HashMap<String, usize>,
    /// Number of examples per source.
    pub by_source: std::collections::HashMap<String, usize>,
}

impl DatasetStats {
    /// Renders the statistics as a Markdown section.
    ///
    /// The output contains the totals, the verification rate (one decimal,
    /// `0.0%` for an empty dataset) and three tables. Error codes are listed
    /// alphabetically. Decisions and sources are listed by descending count,
    /// with equal counts ordered alphabetically so the output is identical
    /// from run to run regardless of `HashMap` iteration order.
    pub fn to_markdown(&self) -> String {
        use std::fmt::Write;

        let rate = if self.total > 0 {
            (self.verified as f64 / self.total as f64) * 100.0
        } else {
            0.0
        };

        let mut s = String::new();
        s.push_str("## Dataset Statistics\n\n");
        // Formatting into a `String` cannot fail, so the results are ignored.
        let _ = writeln!(s, "- **Total examples**: {}", self.total);
        let _ = writeln!(s, "- **Verified**: {}", self.verified);
        let _ = writeln!(s, "- **Verification rate**: {:.1}%\n", rate);

        s.push_str("### By Error Code\n\n");
        s.push_str("| Error Code | Count |\n");
        s.push_str("|------------|-------|\n");
        Self::push_rows(&mut s, &Self::rows_by_name(&self.by_error_code));

        s.push_str("\n### By Decision Type\n\n");
        s.push_str("| Decision | Count |\n");
        s.push_str("|----------|-------|\n");
        Self::push_rows(&mut s, &Self::rows_by_count_desc(&self.by_decision));

        s.push_str("\n### By Source\n\n");
        s.push_str("| Source | Count |\n");
        s.push_str("|--------|-------|\n");
        Self::push_rows(&mut s, &Self::rows_by_count_desc(&self.by_source));

        s
    }

    /// Returns the entries of `counts` as `(name, count)` pairs sorted by name.
    fn rows_by_name(counts: &std::collections::HashMap<String, usize>) -> Vec<(&str, usize)> {
        let mut rows: Vec<(&str, usize)> = counts
            .iter()
            .map(|(name, count)| (name.as_str(), *count))
            .collect();
        // Names are unique map keys, so tuple order equals name order.
        rows.sort_unstable();
        rows
    }

    /// Returns the entries of `counts` sorted by descending count, with ties
    /// broken by ascending name.
    fn rows_by_count_desc(
        counts: &std::collections::HashMap<String, usize>,
    ) -> Vec<(&str, usize)> {
        let mut rows: Vec<(&str, usize)> = counts
            .iter()
            .map(|(name, count)| (name.as_str(), *count))
            .collect();
        rows.sort_unstable_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));
        rows
    }

    /// Appends one Markdown table row per `(name, count)` pair to `out`.
    fn push_rows(out: &mut String, rows: &[(&str, usize)]) {
        use std::fmt::Write;

        for (name, count) in rows {
            let _ = writeln!(out, "| {} | {} |", name, count);
        }
    }
}
/// Generates the dataset card (YAML front matter plus Markdown README text)
/// for the oracle patterns dataset.
///
/// The Markdown produced by `stats.to_markdown()` is embedded after the field
/// table, and the `size_categories` front-matter entry is derived from
/// `stats.total` instead of always claiming fewer than 1K examples.
pub fn generate_dataset_card(stats: &DatasetStats) -> String {
    // Exclusive upper bounds and the size-category tag used below each bound.
    const SIZE_CATEGORIES: [(usize, &str); 6] = [
        (1_000, "n<1K"),
        (10_000, "1K<n<10K"),
        (100_000, "10K<n<100K"),
        (1_000_000, "100K<n<1M"),
        (10_000_000, "1M<n<10M"),
        (100_000_000, "10M<n<100M"),
    ];
    let size_category = SIZE_CATEGORIES
        .iter()
        .find(|(limit, _)| stats.total < *limit)
        .map(|(_, label)| *label)
        .unwrap_or("100M<n<1B");

    // Literal braces in the BibTeX entry are escaped as `{{` / `}}`.
    format!(
        r#"---
license: mit
task_categories:
- text2text-generation
language:
- en
tags:
- code
- rust
- c
- transpiler
- compiler-errors
- code-repair
size_categories:
- {size_category}
---
# Decy Oracle Patterns Dataset
Fix patterns for C→Rust transpilation errors, learned by the decy CITL (Compiler-in-the-Loop Training) oracle.
## Dataset Description
This dataset contains error→fix pairs for common Rust compilation errors that occur during C-to-Rust transpilation.
Each example maps a rustc error code to a fix pattern that resolves the error.
### Use Cases
- **Fine-tuning LLMs** for code repair tasks
- **Training code completion models** for Rust
- **Building retrieval-augmented generation (RAG)** systems for compiler error fixing
- **Studying common C→Rust migration patterns**
## Dataset Structure
### Fields
| Field | Type | Description |
|-------|------|-------------|
| `error_code` | string | Rust compiler error code (e.g., "E0308") |
| `decision` | string | Fix strategy category (e.g., "type_coercion") |
| `fix_diff` | string | Unified diff showing the fix |
| `description` | string | Human-readable explanation |
| `source` | string | Pattern origin (bootstrap, training, imported) |
| `verified` | bool | Whether fix was verified by rustc |
| `success_count` | int | Times pattern succeeded |
| `failure_count` | int | Times pattern failed |
{stats_markdown}
## Usage
### With HuggingFace Datasets
```python
from datasets import load_dataset
dataset = load_dataset("paiml/decy-oracle-patterns")
print(dataset["train"][0])
```
### With alimentar (Rust)
```rust
use alimentar::hf_hub::HfDataset;
let dataset = HfDataset::builder("paiml/decy-oracle-patterns")
.split("train")
.build()?
.download()?;
```
## Error Codes Covered
| Code | Description | Notes |
|------|-------------|-------|
| E0308 | Type mismatch | Common in C→Rust type conversions |
| E0133 | Unsafe block required | Raw pointer operations |
| E0382 | Use of moved value | Ownership violations |
| E0499 | Multiple mutable borrows | Borrow checker errors |
| E0506 | Cannot assign to borrowed | Mutation during borrow |
| E0515 | Cannot return reference to local | Lifetime errors |
| E0597 | Value does not live long enough | Lifetime errors |
## License
MIT License - see [LICENSE](LICENSE) for details.
## Citation
```bibtex
@software{{decy2025,
author = {{paiml}},
title = {{decy: C-to-Rust Transpiler with CITL Oracle}},
year = {{2025}},
publisher = {{GitHub}},
url = {{https://github.com/paiml/decy}}
}}
```
## Related Projects
- [decy](https://github.com/paiml/decy) - C→Rust transpiler
- [depyler](https://github.com/paiml/depyler) - Python→Rust transpiler
- [entrenar](https://github.com/paiml/entrenar) - CITL training framework
- [alimentar](https://github.com/paiml/alimentar) - Data loading library
"#,
        size_category = size_category,
        stats_markdown = stats.to_markdown()
    )
}
/// Unit tests for the dataset exporter, its statistics and the dataset card.
#[cfg(test)]
mod tests {
    use super::*;

    /// `new` loads the bootstrap patterns.
    #[test]
    fn test_dataset_exporter_new() {
        let exporter = DatasetExporter::new();
        assert!(!exporter.is_empty());
        assert!(exporter.len() >= 20);
    }

    /// `empty` starts with no examples.
    #[test]
    fn test_dataset_exporter_empty() {
        let exporter = DatasetExporter::empty();
        assert!(exporter.is_empty());
        assert_eq!(exporter.len(), 0);
    }

    /// Adding one example grows the dataset by one.
    #[test]
    fn test_add_example() {
        let mut exporter = DatasetExporter::empty();
        exporter.add_example(TrainingExample {
            error_code: "E0308".to_string(),
            decision: "test".to_string(),
            fix_diff: "- old\n+ new".to_string(),
            description: "Test pattern".to_string(),
            source: "test".to_string(),
            verified: true,
            success_count: 0,
            failure_count: 0,
        });
        assert_eq!(exporter.len(), 1);
    }

    /// JSONL export writes one parseable `TrainingExample` per line.
    #[test]
    fn test_export_jsonl() {
        let exporter = DatasetExporter::new();
        let temp_dir = tempfile::tempdir().unwrap();
        let path = temp_dir.path().join("patterns.jsonl");
        let count = exporter.export_jsonl(&path).unwrap();
        assert!(count > 0);
        assert!(path.exists());
        let content = std::fs::read_to_string(&path).unwrap();
        let lines: Vec<&str> = content.lines().collect();
        assert_eq!(lines.len(), count);
        for line in lines {
            let _: TrainingExample = serde_json::from_str(line).unwrap();
        }
    }

    /// ChatML export writes a user/assistant pair per line.
    #[test]
    fn test_export_chatml() {
        let exporter = DatasetExporter::new();
        let temp_dir = tempfile::tempdir().unwrap();
        let path = temp_dir.path().join("patterns_chatml.jsonl");
        let count = exporter.export_chatml(&path).unwrap();
        assert!(count > 0);
        assert!(path.exists());
        let content = std::fs::read_to_string(&path).unwrap();
        let lines: Vec<&str> = content.lines().collect();
        assert_eq!(lines.len(), count);
        for line in lines {
            let conv: ChatMLConversation = serde_json::from_str(line).unwrap();
            assert_eq!(conv.messages.len(), 2);
            assert_eq!(conv.messages[0].role, "user");
            assert_eq!(conv.messages[1].role, "assistant");
        }
    }

    /// Alpaca export writes one parseable record with an instruction per line.
    #[test]
    fn test_export_alpaca() {
        let exporter = DatasetExporter::new();
        let temp_dir = tempfile::tempdir().unwrap();
        let path = temp_dir.path().join("patterns_alpaca.jsonl");
        let count = exporter.export_alpaca(&path).unwrap();
        assert!(count > 0);
        assert!(path.exists());
        let content = std::fs::read_to_string(&path).unwrap();
        let lines: Vec<&str> = content.lines().collect();
        assert_eq!(lines.len(), count);
        for line in lines {
            let alpaca: AlpacaExample = serde_json::from_str(line).unwrap();
            assert!(!alpaca.instruction.is_empty());
        }
    }

    /// Statistics of the bootstrap dataset are populated.
    #[test]
    fn test_stats() {
        let exporter = DatasetExporter::new();
        let stats = exporter.stats();
        assert!(stats.total > 0);
        assert!(!stats.by_error_code.is_empty());
        assert!(!stats.by_decision.is_empty());
        assert!(stats.by_source.contains_key("bootstrap"));
    }

    /// The Markdown rendering contains the expected headings.
    #[test]
    fn test_stats_markdown() {
        let exporter = DatasetExporter::new();
        let stats = exporter.stats();
        let markdown = stats.to_markdown();
        assert!(markdown.contains("Dataset Statistics"));
        assert!(markdown.contains("Error Code"));
        assert!(markdown.contains("Decision"));
    }

    /// The dataset card contains the front matter and the embedded statistics.
    #[test]
    fn test_generate_dataset_card() {
        let exporter = DatasetExporter::new();
        let stats = exporter.stats();
        let card = generate_dataset_card(&stats);
        assert!(card.contains("license: mit"));
        assert!(card.contains("decy"));
        assert!(card.contains("paiml"));
        assert!(card.contains("Dataset Statistics"));
    }

    /// `Default` behaves like `new`.
    #[test]
    fn test_dataset_exporter_default() {
        let exporter = DatasetExporter::default();
        assert!(!exporter.is_empty());
    }

    /// Without the `dataset` feature, Parquet export fails with an explanatory
    /// error. The test is compiled only in that configuration, because the
    /// failure is not guaranteed when the real exporter is built in.
    #[cfg(not(feature = "dataset"))]
    #[test]
    fn test_export_parquet_no_feature() {
        let exporter = DatasetExporter::empty();
        let temp_dir = tempfile::tempdir().unwrap();
        let path = temp_dir.path().join("test.parquet");
        let result = exporter.export_parquet(&path);
        assert!(result.is_err());
        let err = format!("{}", result.unwrap_err());
        assert!(err.contains("'dataset' feature"));
    }

    /// Statistics of an empty dataset are all zero/empty.
    #[test]
    fn test_stats_empty_dataset() {
        let exporter = DatasetExporter::empty();
        let stats = exporter.stats();
        assert_eq!(stats.total, 0);
        assert_eq!(stats.verified, 0);
        assert!(stats.by_error_code.is_empty());
    }

    /// Rendering an empty dataset does not divide by zero.
    #[test]
    fn test_stats_markdown_empty() {
        let exporter = DatasetExporter::empty();
        let stats = exporter.stats();
        let md = stats.to_markdown();
        assert!(md.contains("Total examples"));
        assert!(md.contains("0"));
    }

    /// `ExportFormat` is `Copy`, comparable and debuggable.
    #[test]
    fn test_export_format_debug_clone() {
        let fmt = ExportFormat::Jsonl;
        let cloned = fmt;
        assert_eq!(fmt, cloned);
        let debug = format!("{:?}", fmt);
        assert!(debug.contains("Jsonl"));
    }

    /// Every `ExportFormat` variant has a debug name and variants differ.
    #[test]
    fn test_export_format_all_variants() {
        let formats = [
            ExportFormat::Jsonl,
            ExportFormat::ChatML,
            ExportFormat::Alpaca,
            ExportFormat::Parquet,
        ];
        for fmt in &formats {
            let debug = format!("{:?}", fmt);
            assert!(!debug.is_empty());
        }
        assert_ne!(ExportFormat::Jsonl, ExportFormat::ChatML);
        assert_ne!(ExportFormat::Alpaca, ExportFormat::Parquet);
    }

    /// Added examples are visible, unchanged, through `examples`.
    #[test]
    fn test_add_example_and_access() {
        let mut exporter = DatasetExporter::empty();
        let example = TrainingExample {
            error_code: "E0308".to_string(),
            decision: "type_coercion".to_string(),
            fix_diff: "- old\n+ new".to_string(),
            description: "Type mismatch fix".to_string(),
            source: "test".to_string(),
            verified: false,
            success_count: 5,
            failure_count: 2,
        };
        exporter.add_example(example);
        let examples = exporter.examples();
        assert_eq!(examples.len(), 1);
        assert_eq!(examples[0].success_count, 5);
        assert!(!examples[0].verified);
    }

    /// Unverified examples are counted in the total but not as verified.
    #[test]
    fn test_stats_unverified_examples() {
        let mut exporter = DatasetExporter::empty();
        exporter.add_example(TrainingExample {
            error_code: "E0308".to_string(),
            decision: "cast".to_string(),
            fix_diff: "- a\n+ b".to_string(),
            description: "test".to_string(),
            source: "training".to_string(),
            verified: false,
            success_count: 0,
            failure_count: 0,
        });
        let stats = exporter.stats();
        assert_eq!(stats.total, 1);
        assert_eq!(stats.verified, 0);
        assert!(stats.by_source.contains_key("training"));
    }

    /// `from_bootstrap` copies the pattern and tags it as verified bootstrap data.
    #[test]
    fn test_training_example_from_bootstrap() {
        use crate::bootstrap::get_bootstrap_patterns;
        let patterns = get_bootstrap_patterns();
        let pattern = &patterns[0];
        let example = TrainingExample::from_bootstrap(pattern);
        assert_eq!(example.error_code, pattern.error_code);
        assert_eq!(example.decision, pattern.decision);
        assert_eq!(example.source, "bootstrap");
        assert!(example.verified);
    }

    /// Writing JSONL into a missing directory reports a write error.
    #[test]
    fn test_export_jsonl_write_error() {
        let exporter = DatasetExporter::new();
        let result = exporter.export_jsonl("/nonexistent/dir/patterns.jsonl");
        assert!(result.is_err());
        let err = format!("{}", result.unwrap_err());
        assert!(err.contains("Failed to write JSONL"));
    }

    /// Writing ChatML into a missing directory reports a write error.
    #[test]
    fn test_export_chatml_write_error() {
        let exporter = DatasetExporter::new();
        let result = exporter.export_chatml("/nonexistent/dir/patterns.chatml");
        assert!(result.is_err());
        let err = format!("{}", result.unwrap_err());
        assert!(err.contains("Failed to write ChatML"));
    }

    /// Writing Alpaca into a missing directory reports a write error.
    #[test]
    fn test_export_alpaca_write_error() {
        let exporter = DatasetExporter::new();
        let result = exporter.export_alpaca("/nonexistent/dir/patterns.alpaca");
        assert!(result.is_err());
        let err = format!("{}", result.unwrap_err());
        assert!(err.contains("Failed to write Alpaca"));
    }

    /// One verified example out of two renders as a 50.0% verification rate.
    #[test]
    fn test_stats_verification_rate_nonzero() {
        let mut exporter = DatasetExporter::empty();
        exporter.add_example(TrainingExample {
            error_code: "E0308".to_string(),
            decision: "cast".to_string(),
            fix_diff: "- a\n+ b".to_string(),
            description: "test".to_string(),
            source: "test".to_string(),
            verified: true,
            success_count: 1,
            failure_count: 0,
        });
        exporter.add_example(TrainingExample {
            error_code: "E0133".to_string(),
            decision: "unsafe".to_string(),
            fix_diff: "- c\n+ d".to_string(),
            description: "test2".to_string(),
            source: "test".to_string(),
            verified: false,
            success_count: 0,
            failure_count: 1,
        });
        let stats = exporter.stats();
        let md = stats.to_markdown();
        assert!(md.contains("50.0%"));
        assert!(md.contains("E0308"));
        assert!(md.contains("E0133"));
        assert!(md.contains("cast"));
        assert!(md.contains("unsafe"));
    }
}