1#![allow(clippy::indexing_slicing, clippy::expect_used)] use crate::categories::ErrorCategory;
5use crate::features::ErrorFeatures;
6use serde::{Deserialize, Serialize};
7use std::path::Path;
8
9#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct TrainingExample {
12 pub exit_code: i32,
14 pub stderr: String,
16 pub command: Option<String>,
18 pub category: ErrorCategory,
20}
21
22pub struct Corpus {
24 examples: Vec<TrainingExample>,
25}
26
27impl Default for Corpus {
28 fn default() -> Self {
29 Self::new()
30 }
31}
32
33impl Corpus {
34 #[must_use]
36 pub fn new() -> Self {
37 Self {
38 examples: Vec::new(),
39 }
40 }
41
42 #[must_use]
44 pub fn from_examples(examples: Vec<TrainingExample>) -> Self {
45 Self { examples }
46 }
47
48 pub fn load(path: &Path) -> anyhow::Result<Self> {
53 let content = std::fs::read_to_string(path)?;
54 let examples: Vec<TrainingExample> = serde_json::from_str(&content)?;
55 Ok(Self { examples })
56 }
57
58 pub fn save(&self, path: &Path) -> anyhow::Result<()> {
63 let content = serde_json::to_string_pretty(&self.examples)?;
64 std::fs::write(path, content)?;
65 Ok(())
66 }
67
68 pub fn add(&mut self, example: TrainingExample) {
70 self.examples.push(example);
71 }
72
73 #[must_use]
75 pub fn generate_synthetic(count: usize) -> Self {
76 let mut examples = Vec::with_capacity(count);
77 let mut rng_seed = 42u64;
78
79 let templates: &[(i32, &str, ErrorCategory)] = &[
81 (
83 1,
84 "bash: syntax error near unexpected token 'done'",
85 ErrorCategory::SyntaxUnexpectedToken,
86 ),
87 (
88 1,
89 "bash: unexpected EOF while looking for matching '\"'",
90 ErrorCategory::SyntaxQuoteMismatch,
91 ),
92 (
93 1,
94 "bash: syntax error: unexpected end of file",
95 ErrorCategory::SyntaxBracketMismatch,
96 ),
97 (
98 1,
99 "bash: syntax error near unexpected token ')'",
100 ErrorCategory::SyntaxBracketMismatch,
101 ),
102 (
103 2,
104 "bash: line 5: syntax error: operand expected",
105 ErrorCategory::SyntaxMissingOperand,
106 ),
107 (
109 127,
110 "bash: foobar: command not found",
111 ErrorCategory::CommandNotFound,
112 ),
113 (
114 127,
115 "zsh: command not found: nonexistent",
116 ErrorCategory::CommandNotFound,
117 ),
118 (
119 126,
120 "bash: ./script.sh: Permission denied",
121 ErrorCategory::CommandPermissionDenied,
122 ),
123 (
124 1,
125 "grep: invalid option -- 'z'",
126 ErrorCategory::CommandInvalidOption,
127 ),
128 (
129 1,
130 "ls: option requires an argument -- 'w'",
131 ErrorCategory::CommandMissingArgument,
132 ),
133 (
135 1,
136 "cat: /nonexistent: No such file or directory",
137 ErrorCategory::FileNotFound,
138 ),
139 (
140 1,
141 "rm: cannot remove '/root/secret': Permission denied",
142 ErrorCategory::FilePermissionDenied,
143 ),
144 (
145 1,
146 "cat: /tmp: Is a directory",
147 ErrorCategory::FileIsDirectory,
148 ),
149 (
150 1,
151 "cd: /etc/passwd: Not a directory",
152 ErrorCategory::FileNotDirectory,
153 ),
154 (
155 1,
156 "bash: cannot redirect: Too many open files",
157 ErrorCategory::FileTooManyOpen,
158 ),
159 (
161 1,
162 "bash: VAR: unbound variable",
163 ErrorCategory::VariableUnbound,
164 ),
165 (
166 1,
167 "bash: PATH: readonly variable",
168 ErrorCategory::VariableReadonly,
169 ),
170 (
171 1,
172 "bash: ${foo: bad substitution",
173 ErrorCategory::VariableBadSubstitution,
174 ),
175 (141, "", ErrorCategory::PipeBroken), (137, "Killed", ErrorCategory::ProcessSignaled), (
179 1,
180 "Command exited with status 1",
181 ErrorCategory::ProcessExitNonZero,
182 ),
183 (
184 124,
185 "timeout: the monitored command timed out",
186 ErrorCategory::ProcessTimeout,
187 ),
188 (
190 1,
191 "bash: /dev/full: No space left on device",
192 ErrorCategory::RedirectFailed,
193 ),
194 (
195 1,
196 "bash: warning: here-document delimited by end-of-file (wanted 'EOF')",
197 ErrorCategory::HereDocUnterminated,
198 ),
199 ];
200
201 for i in 0..count {
202 rng_seed = rng_seed
204 .wrapping_mul(6_364_136_223_846_793_005)
205 .wrapping_add(1);
206 let idx = (rng_seed as usize) % templates.len();
207 let (exit_code, stderr, category) =
209 templates
210 .get(idx)
211 .copied()
212 .unwrap_or((1, "unknown error", ErrorCategory::Unknown));
213
214 let varied_stderr = match i % 5 {
216 0 => format!("{stderr} (variant {i})"),
217 1 => format!("line {}: {stderr}", (rng_seed % 100) + 1),
218 2 => stderr.to_uppercase(),
219 3 => format!("{stderr}\nAdditional context line"),
220 _ => stderr.to_string(),
221 };
222
223 let command = if i % 3 == 0 {
224 Some(format!("test_command_{}", i % 10))
225 } else {
226 None
227 };
228
229 examples.push(TrainingExample {
230 exit_code,
231 stderr: varied_stderr,
232 command,
233 category,
234 });
235 }
236
237 Self { examples }
238 }
239
240 #[must_use]
242 pub fn to_training_data(&self) -> (Vec<Vec<f32>>, Vec<u8>) {
243 let mut x = Vec::with_capacity(self.examples.len());
244 let mut y = Vec::with_capacity(self.examples.len());
245
246 for example in &self.examples {
247 let features = ErrorFeatures::extract(
248 example.exit_code,
249 &example.stderr,
250 example.command.as_deref(),
251 );
252 x.push(features.features);
253 y.push(example.category.to_label_index() as u8);
254 }
255
256 (x, y)
257 }
258
259 #[must_use]
261 pub fn len(&self) -> usize {
262 self.examples.len()
263 }
264
265 #[must_use]
267 pub fn is_empty(&self) -> bool {
268 self.examples.is_empty()
269 }
270
271 #[must_use]
273 pub fn examples(&self) -> &[TrainingExample] {
274 &self.examples
275 }
276
277 #[must_use]
279 pub fn category_distribution(&self) -> std::collections::HashMap<ErrorCategory, usize> {
280 let mut dist = std::collections::HashMap::new();
281 for example in &self.examples {
282 *dist.entry(example.category).or_insert(0) += 1;
283 }
284 dist
285 }
286}
287
#[cfg(test)]
mod tests {
    use super::*;

    /// The generator returns exactly as many examples as requested.
    #[test]
    fn test_generate_synthetic() {
        assert_eq!(Corpus::generate_synthetic(100).len(), 100);
    }

    /// Features and labels line up one-to-one with the examples, and each
    /// feature vector has the advertised width.
    #[test]
    fn test_to_training_data() {
        let (features, labels) = Corpus::generate_synthetic(50).to_training_data();

        assert_eq!(features.len(), 50);
        assert_eq!(labels.len(), 50);
        assert_eq!(features[0].len(), ErrorFeatures::SIZE);
    }

    /// A large synthetic corpus covers more than a handful of categories.
    #[test]
    fn test_category_distribution() {
        let distinct_categories = Corpus::generate_synthetic(1000)
            .category_distribution()
            .len();

        assert!(distinct_categories > 5, "Expected diverse categories");
    }

    /// Saving and reloading a corpus preserves the number of examples.
    #[test]
    fn test_corpus_save_load() {
        let scratch = tempfile::tempdir().expect("Failed to create temp dir");
        let file = scratch.path().join("test_corpus.json");
        let original = Corpus::generate_synthetic(10);

        original.save(&file).expect("Failed to save");
        let reloaded = Corpus::load(&file).expect("Failed to load");

        assert_eq!(original.len(), reloaded.len());
    }

    /// Every emitted label is a valid category index.
    #[test]
    fn test_training_labels_valid() {
        let (_, labels) = Corpus::generate_synthetic(100).to_training_data();
        let label_limit = ErrorCategory::COUNT as u8;

        for label in labels {
            assert!(label < label_limit);
        }
    }

    /// A single example survives a JSON round trip.
    #[test]
    fn test_example_serialization() {
        let sample = TrainingExample {
            exit_code: 127,
            stderr: "command not found".to_string(),
            command: Some("foo".to_string()),
            category: ErrorCategory::CommandNotFound,
        };

        let encoded = serde_json::to_string(&sample).expect("Failed to serialize");
        let decoded: TrainingExample = serde_json::from_str(&encoded).expect("Failed to parse");

        assert_eq!(decoded.exit_code, 127);
        assert_eq!(decoded.category, ErrorCategory::CommandNotFound);
    }
}