1use crate::categories::ErrorCategory;
4use crate::features::ErrorFeatures;
5use serde::{Deserialize, Serialize};
6use std::path::Path;
7
8#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct TrainingExample {
11 pub exit_code: i32,
13 pub stderr: String,
15 pub command: Option<String>,
17 pub category: ErrorCategory,
19}
20
21pub struct Corpus {
23 examples: Vec<TrainingExample>,
24}
25
26impl Default for Corpus {
27 fn default() -> Self {
28 Self::new()
29 }
30}
31
32impl Corpus {
33 #[must_use]
35 pub fn new() -> Self {
36 Self {
37 examples: Vec::new(),
38 }
39 }
40
41 #[must_use]
43 pub fn from_examples(examples: Vec<TrainingExample>) -> Self {
44 Self { examples }
45 }
46
47 pub fn load(path: &Path) -> anyhow::Result<Self> {
52 let content = std::fs::read_to_string(path)?;
53 let examples: Vec<TrainingExample> = serde_json::from_str(&content)?;
54 Ok(Self { examples })
55 }
56
57 pub fn save(&self, path: &Path) -> anyhow::Result<()> {
62 let content = serde_json::to_string_pretty(&self.examples)?;
63 std::fs::write(path, content)?;
64 Ok(())
65 }
66
67 pub fn add(&mut self, example: TrainingExample) {
69 self.examples.push(example);
70 }
71
72 #[must_use]
74 pub fn generate_synthetic(count: usize) -> Self {
75 let mut examples = Vec::with_capacity(count);
76 let mut rng_seed = 42u64;
77
78 let templates: &[(i32, &str, ErrorCategory)] = &[
80 (
82 1,
83 "bash: syntax error near unexpected token 'done'",
84 ErrorCategory::SyntaxUnexpectedToken,
85 ),
86 (
87 1,
88 "bash: unexpected EOF while looking for matching '\"'",
89 ErrorCategory::SyntaxQuoteMismatch,
90 ),
91 (
92 1,
93 "bash: syntax error: unexpected end of file",
94 ErrorCategory::SyntaxBracketMismatch,
95 ),
96 (
97 1,
98 "bash: syntax error near unexpected token ')'",
99 ErrorCategory::SyntaxBracketMismatch,
100 ),
101 (
102 2,
103 "bash: line 5: syntax error: operand expected",
104 ErrorCategory::SyntaxMissingOperand,
105 ),
106 (
108 127,
109 "bash: foobar: command not found",
110 ErrorCategory::CommandNotFound,
111 ),
112 (
113 127,
114 "zsh: command not found: nonexistent",
115 ErrorCategory::CommandNotFound,
116 ),
117 (
118 126,
119 "bash: ./script.sh: Permission denied",
120 ErrorCategory::CommandPermissionDenied,
121 ),
122 (
123 1,
124 "grep: invalid option -- 'z'",
125 ErrorCategory::CommandInvalidOption,
126 ),
127 (
128 1,
129 "ls: option requires an argument -- 'w'",
130 ErrorCategory::CommandMissingArgument,
131 ),
132 (
134 1,
135 "cat: /nonexistent: No such file or directory",
136 ErrorCategory::FileNotFound,
137 ),
138 (
139 1,
140 "rm: cannot remove '/root/secret': Permission denied",
141 ErrorCategory::FilePermissionDenied,
142 ),
143 (
144 1,
145 "cat: /tmp: Is a directory",
146 ErrorCategory::FileIsDirectory,
147 ),
148 (
149 1,
150 "cd: /etc/passwd: Not a directory",
151 ErrorCategory::FileNotDirectory,
152 ),
153 (
154 1,
155 "bash: cannot redirect: Too many open files",
156 ErrorCategory::FileTooManyOpen,
157 ),
158 (
160 1,
161 "bash: VAR: unbound variable",
162 ErrorCategory::VariableUnbound,
163 ),
164 (
165 1,
166 "bash: PATH: readonly variable",
167 ErrorCategory::VariableReadonly,
168 ),
169 (
170 1,
171 "bash: ${foo: bad substitution",
172 ErrorCategory::VariableBadSubstitution,
173 ),
174 (141, "", ErrorCategory::PipeBroken), (137, "Killed", ErrorCategory::ProcessSignaled), (
178 1,
179 "Command exited with status 1",
180 ErrorCategory::ProcessExitNonZero,
181 ),
182 (
183 124,
184 "timeout: the monitored command timed out",
185 ErrorCategory::ProcessTimeout,
186 ),
187 (
189 1,
190 "bash: /dev/full: No space left on device",
191 ErrorCategory::RedirectFailed,
192 ),
193 (
194 1,
195 "bash: warning: here-document delimited by end-of-file (wanted 'EOF')",
196 ErrorCategory::HereDocUnterminated,
197 ),
198 ];
199
200 for i in 0..count {
201 rng_seed = rng_seed
203 .wrapping_mul(6_364_136_223_846_793_005)
204 .wrapping_add(1);
205 let idx = (rng_seed as usize) % templates.len();
206 let (exit_code, stderr, category) =
208 templates
209 .get(idx)
210 .copied()
211 .unwrap_or((1, "unknown error", ErrorCategory::Unknown));
212
213 let varied_stderr = match i % 5 {
215 0 => format!("{stderr} (variant {i})"),
216 1 => format!("line {}: {stderr}", (rng_seed % 100) + 1),
217 2 => stderr.to_uppercase(),
218 3 => format!("{stderr}\nAdditional context line"),
219 _ => stderr.to_string(),
220 };
221
222 let command = if i % 3 == 0 {
223 Some(format!("test_command_{}", i % 10))
224 } else {
225 None
226 };
227
228 examples.push(TrainingExample {
229 exit_code,
230 stderr: varied_stderr,
231 command,
232 category,
233 });
234 }
235
236 Self { examples }
237 }
238
239 #[must_use]
241 pub fn to_training_data(&self) -> (Vec<Vec<f32>>, Vec<u8>) {
242 let mut x = Vec::with_capacity(self.examples.len());
243 let mut y = Vec::with_capacity(self.examples.len());
244
245 for example in &self.examples {
246 let features = ErrorFeatures::extract(
247 example.exit_code,
248 &example.stderr,
249 example.command.as_deref(),
250 );
251 x.push(features.features);
252 y.push(example.category.to_label_index() as u8);
253 }
254
255 (x, y)
256 }
257
258 #[must_use]
260 pub fn len(&self) -> usize {
261 self.examples.len()
262 }
263
264 #[must_use]
266 pub fn is_empty(&self) -> bool {
267 self.examples.is_empty()
268 }
269
270 #[must_use]
272 pub fn examples(&self) -> &[TrainingExample] {
273 &self.examples
274 }
275
276 #[must_use]
278 pub fn category_distribution(&self) -> std::collections::HashMap<ErrorCategory, usize> {
279 let mut dist = std::collections::HashMap::new();
280 for example in &self.examples {
281 *dist.entry(example.category).or_insert(0) += 1;
282 }
283 dist
284 }
285}
286
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that two example slices are equal field by field, reporting the
    /// first index at which they differ.
    fn assert_examples_match(expected: &[TrainingExample], actual: &[TrainingExample]) {
        assert_eq!(expected.len(), actual.len(), "example counts differ");
        for (index, (want, got)) in expected.iter().zip(actual).enumerate() {
            assert_eq!(want.exit_code, got.exit_code, "exit_code differs at {index}");
            assert_eq!(want.stderr, got.stderr, "stderr differs at {index}");
            assert_eq!(want.command, got.command, "command differs at {index}");
            assert_eq!(want.category, got.category, "category differs at {index}");
        }
    }

    #[test]
    fn test_generate_synthetic() {
        let corpus = Corpus::generate_synthetic(100);
        assert_eq!(corpus.len(), 100);
        assert!(!corpus.is_empty());
    }

    #[test]
    fn test_generate_synthetic_empty() {
        let corpus = Corpus::generate_synthetic(0);
        assert_eq!(corpus.len(), 0);
        assert!(corpus.is_empty());
    }

    #[test]
    fn test_generate_synthetic_is_deterministic() {
        let first = Corpus::generate_synthetic(64);
        let second = Corpus::generate_synthetic(64);
        assert_examples_match(first.examples(), second.examples());
    }

    #[test]
    fn test_to_training_data() {
        let corpus = Corpus::generate_synthetic(50);
        let (x, y) = corpus.to_training_data();

        assert_eq!(x.len(), 50);
        assert_eq!(y.len(), 50);
        assert_eq!(x[0].len(), ErrorFeatures::SIZE);
    }

    #[test]
    fn test_category_distribution() {
        let corpus = Corpus::generate_synthetic(1000);
        let dist = corpus.category_distribution();

        assert!(dist.len() > 5, "Expected diverse categories");
        // Every example is counted exactly once.
        assert_eq!(dist.values().sum::<usize>(), corpus.len());
    }

    #[test]
    fn test_corpus_save_load() {
        let corpus = Corpus::generate_synthetic(10);
        let temp_dir = tempfile::tempdir().expect("Failed to create temp dir");
        let path = temp_dir.path().join("test_corpus.json");

        corpus.save(&path).expect("Failed to save");
        let loaded = Corpus::load(&path).expect("Failed to load");

        assert_eq!(corpus.len(), loaded.len());
        // A length check alone would not notice corrupted fields.
        assert_examples_match(corpus.examples(), loaded.examples());
    }

    #[test]
    fn test_training_labels_valid() {
        let corpus = Corpus::generate_synthetic(100);
        let (_, y) = corpus.to_training_data();

        for label in y {
            assert!(label < ErrorCategory::COUNT as u8);
        }
    }

    #[test]
    fn test_example_serialization() {
        let example = TrainingExample {
            exit_code: 127,
            stderr: "command not found".to_string(),
            command: Some("foo".to_string()),
            category: ErrorCategory::CommandNotFound,
        };

        let json = serde_json::to_string(&example).expect("Failed to serialize");
        let parsed: TrainingExample = serde_json::from_str(&json).expect("Failed to parse");

        assert_eq!(parsed.exit_code, 127);
        assert_eq!(parsed.stderr, "command not found");
        assert_eq!(parsed.command.as_deref(), Some("foo"));
        assert_eq!(parsed.category, ErrorCategory::CommandNotFound);
    }
}