use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::path::Path;
/// A single training error record, serializable and deserializable via serde.
///
/// Records built with `TrainingError::new` get their `hash` and `timestamp`
/// filled in automatically; records obtained by deserialization carry
/// whatever values were stored.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct TrainingError {
/// Error code; the tests in this file use values such as `E0308`.
pub error_code: String,
/// Error message text.
pub message: String,
/// Context string supplied by the caller.
pub context: String,
/// File string supplied by the caller.
// NOTE(review): presumably the source file that produced the error — confirm.
pub file: String,
/// Digest of `error_code` and `message` (16 hex digits when produced by
/// `TrainingError::compute_hash`); `TrainingCorpus` uses it as the
/// deduplication key.
pub hash: String,
/// Creation time; `TrainingError::new` stores the current UTC time
/// formatted as RFC 3339.
pub timestamp: String,
/// Cycle number supplied by the caller.
// NOTE(review): presumably the training cycle in which the error was seen — confirm.
pub cycle: u32,
}
impl TrainingError {
    /// Builds a record from its parts, deriving `hash` and `timestamp`.
    ///
    /// `hash` is [`TrainingError::compute_hash`] applied to the converted
    /// `error_code` and `message`; `context`, `file` and `cycle` take no part
    /// in it. `timestamp` is the current UTC time formatted as RFC 3339.
    pub fn new(
        error_code: impl Into<String>,
        message: impl Into<String>,
        context: impl Into<String>,
        file: impl Into<String>,
        cycle: u32,
    ) -> Self {
        let code: String = error_code.into();
        let text: String = message.into();
        // The digest only needs borrows, so compute it before the owned
        // strings are moved into the struct.
        let digest = Self::compute_hash(&code, &text);
        let context = context.into();
        let file = file.into();
        let timestamp = chrono::Utc::now().to_rfc3339();
        Self {
            error_code: code,
            message: text,
            context,
            file,
            hash: digest,
            timestamp,
            cycle,
        }
    }

    /// Returns a 16-digit, zero-padded, lowercase hexadecimal digest of
    /// `error_code` followed by `message`.
    ///
    /// Equal inputs give equal digests within one build of the program.
    // NOTE(review): the standard library documents `DefaultHasher`'s algorithm
    // as unspecified across Rust releases, while this digest is stored in
    // serialized records — confirm that is acceptable before relying on
    // digests written by a different toolchain.
    pub fn compute_hash(error_code: &str, message: &str) -> String {
        use std::hash::{Hash, Hasher};

        let mut state = std::collections::hash_map::DefaultHasher::new();
        // Feed the parts in a fixed order: code first, then message.
        for part in [error_code, message] {
            part.hash(&mut state);
        }
        let digest = state.finish();
        format!("{:016x}", digest)
    }
}
/// A collection of [`TrainingError`] records deduplicated by their `hash`.
///
/// Both fields are private; records are added through `insert`, which
/// rejects a record whose hash has already been seen.
#[derive(Debug, Default)]
pub struct TrainingCorpus {
/// Accepted records, in the order they were inserted.
errors: Vec<TrainingError>,
/// Hashes of the records accepted so far, consulted for duplicate checks.
seen_hashes: HashSet<String>,
}
impl TrainingCorpus {
pub fn new() -> Self {
Self::default()
}
pub fn load(path: &Path) -> std::io::Result<Self> {
let mut corpus = Self::new();
if path.exists() {
let content = std::fs::read_to_string(path)?;
for line in content.lines() {
if let Ok(error) = serde_json::from_str::<TrainingError>(line) {
corpus.insert(error);
}
}
}
Ok(corpus)
}
pub fn save(&self, path: &Path) -> std::io::Result<()> {
use std::io::Write;
let mut file = std::fs::File::create(path)?;
for error in &self.errors {
writeln!(file, "{}", serde_json::to_string(error).unwrap())?;
}
Ok(())
}
pub fn insert(&mut self, error: TrainingError) -> bool {
if self.seen_hashes.contains(&error.hash) {
false
} else {
self.seen_hashes.insert(error.hash.clone());
self.errors.push(error);
true
}
}
pub fn len(&self) -> usize {
self.errors.len()
}
pub fn is_empty(&self) -> bool {
self.errors.is_empty()
}
pub fn errors(&self) -> &[TrainingError] {
&self.errors
}
pub fn merge(&mut self, other: TrainingCorpus) -> usize {
let before = self.len();
for error in other.errors {
self.insert(error);
}
self.len() - before
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    // Hashing the same code/message pair twice gives the same digest.
    #[test]
    fn test_training_error_hash_deterministic() {
        let first = TrainingError::compute_hash("E0308", "mismatched types");
        let second = TrainingError::compute_hash("E0308", "mismatched types");
        assert_eq!(first, second);
    }

    // Changing either the message or the code changes the digest.
    #[test]
    fn test_training_error_hash_different_for_different_inputs() {
        let baseline = TrainingError::compute_hash("E0308", "mismatched types");
        let other_message = TrainingError::compute_hash("E0308", "different message");
        let other_code = TrainingError::compute_hash("E0599", "mismatched types");
        assert_ne!(baseline, other_message);
        assert_ne!(baseline, other_code);
    }

    // `new` fills in a 16-character hash.
    #[test]
    fn test_training_error_new_generates_hash() {
        let error = TrainingError::new("E0308", "mismatched types", "ctx", "file.py", 0);
        assert!(!error.hash.is_empty());
        assert_eq!(error.hash.len(), 16);
    }

    // Records with the same code and message collapse to one entry even when
    // file and cycle differ.
    #[test]
    fn test_corpus_deduplicates_by_hash() {
        let mut corpus = TrainingCorpus::new();
        let original = TrainingError::new("E0308", "mismatched types", "ctx", "file1.py", 0);
        let duplicate = TrainingError::new("E0308", "mismatched types", "ctx", "file2.py", 1);
        let distinct = TrainingError::new("E0599", "method not found", "ctx", "file3.py", 0);

        assert!(corpus.insert(original));
        assert!(!corpus.insert(duplicate));
        assert!(corpus.insert(distinct));
        assert_eq!(corpus.len(), 2);
    }

    // Saving and reloading keeps every record.
    #[test]
    fn test_corpus_save_and_load_roundtrip() {
        let tmp = tempdir().unwrap();
        let corpus_path = tmp.path().join("corpus.jsonl");

        let mut written = TrainingCorpus::new();
        written.insert(TrainingError::new("E0308", "type error", "ctx", "a.py", 0));
        written.insert(TrainingError::new("E0599", "method error", "ctx", "b.py", 0));
        written.save(&corpus_path).unwrap();

        let reloaded = TrainingCorpus::load(&corpus_path).unwrap();
        assert_eq!(reloaded.len(), 2);
        for code in ["E0308", "E0599"] {
            assert!(reloaded.errors().iter().any(|e| e.error_code == code));
        }
    }

    // Three lines sharing one hash load as a single record.
    #[test]
    fn test_corpus_load_deduplicates_existing_file() {
        let tmp = tempdir().unwrap();
        let corpus_path = tmp.path().join("corpus.jsonl");
        let jsonl = r#"{"error_code":"E0308","message":"type error","context":"","file":"a.py","hash":"abc123","timestamp":"","cycle":0}
{"error_code":"E0308","message":"type error","context":"","file":"a.py","hash":"abc123","timestamp":"","cycle":1}
{"error_code":"E0308","message":"type error","context":"","file":"a.py","hash":"abc123","timestamp":"","cycle":2}
"#;
        std::fs::write(&corpus_path, jsonl).unwrap();

        let loaded = TrainingCorpus::load(&corpus_path).unwrap();
        assert_eq!(loaded.len(), 1);
    }

    // `merge` reports only the records that were new to the receiver.
    #[test]
    fn test_corpus_merge_returns_new_count() {
        let mut target = TrainingCorpus::new();
        target.insert(TrainingError::new("E0308", "error1", "", "", 0));
        target.insert(TrainingError::new("E0599", "error2", "", "", 0));

        let mut incoming = TrainingCorpus::new();
        // Same code and message as a record already in `target`.
        incoming.insert(TrainingError::new("E0599", "error2", "", "", 1));
        incoming.insert(TrainingError::new("E0277", "error3", "", "", 1));

        let added = target.merge(incoming);
        assert_eq!(added, 1);
        assert_eq!(target.len(), 3);
    }

    // An empty file loads as an empty corpus.
    #[test]
    fn test_corpus_empty_file_loads_ok() {
        let tmp = tempdir().unwrap();
        let empty_path = tmp.path().join("empty.jsonl");
        std::fs::write(&empty_path, "").unwrap();

        let loaded = TrainingCorpus::load(&empty_path).unwrap();
        assert!(loaded.is_empty());
    }

    // A path that does not exist loads as an empty corpus.
    #[test]
    fn test_corpus_nonexistent_file_loads_empty() {
        let tmp = tempdir().unwrap();
        let missing_path = tmp.path().join("nonexistent.jsonl");

        let loaded = TrainingCorpus::load(&missing_path).unwrap();
        assert!(loaded.is_empty());
    }

    // Unparsable lines are skipped; the valid ones around them still load.
    #[test]
    fn test_corpus_handles_malformed_json_lines() {
        let tmp = tempdir().unwrap();
        let corpus_path = tmp.path().join("corpus.jsonl");
        let jsonl = r#"{"error_code":"E0308","message":"valid","context":"","file":"","hash":"abc","timestamp":"","cycle":0}
not valid json
{"error_code":"E0599","message":"also valid","context":"","file":"","hash":"def","timestamp":"","cycle":0}
"#;
        std::fs::write(&corpus_path, jsonl).unwrap();

        let loaded = TrainingCorpus::load(&corpus_path).unwrap();
        assert_eq!(loaded.len(), 2);
    }
}