bashrs_oracle/
corpus.rs

//! Training corpus management for the ML model.
2
3use crate::categories::ErrorCategory;
4use crate::features::ErrorFeatures;
5use serde::{Deserialize, Serialize};
6use std::path::Path;
7
8/// Training example with features and label.
9#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct TrainingExample {
11    /// Exit code from command execution.
12    pub exit_code: i32,
13    /// Standard error output.
14    pub stderr: String,
15    /// Optional command that was executed.
16    pub command: Option<String>,
17    /// Error category label.
18    pub category: ErrorCategory,
19}
20
21/// Training corpus management.
22pub struct Corpus {
23    examples: Vec<TrainingExample>,
24}
25
26impl Default for Corpus {
27    fn default() -> Self {
28        Self::new()
29    }
30}
31
32impl Corpus {
33    /// Create empty corpus.
34    #[must_use]
35    pub fn new() -> Self {
36        Self {
37            examples: Vec::new(),
38        }
39    }
40
41    /// Create corpus from examples.
42    #[must_use]
43    pub fn from_examples(examples: Vec<TrainingExample>) -> Self {
44        Self { examples }
45    }
46
47    /// Load corpus from JSON file.
48    ///
49    /// # Errors
50    /// Returns error if file cannot be read or parsed.
51    pub fn load(path: &Path) -> anyhow::Result<Self> {
52        let content = std::fs::read_to_string(path)?;
53        let examples: Vec<TrainingExample> = serde_json::from_str(&content)?;
54        Ok(Self { examples })
55    }
56
57    /// Save corpus to JSON file.
58    ///
59    /// # Errors
60    /// Returns error if file cannot be written.
61    pub fn save(&self, path: &Path) -> anyhow::Result<()> {
62        let content = serde_json::to_string_pretty(&self.examples)?;
63        std::fs::write(path, content)?;
64        Ok(())
65    }
66
67    /// Add a training example.
68    pub fn add(&mut self, example: TrainingExample) {
69        self.examples.push(example);
70    }
71
72    /// Generate synthetic training data.
73    #[must_use]
74    pub fn generate_synthetic(count: usize) -> Self {
75        let mut examples = Vec::with_capacity(count);
76        let mut rng_seed = 42u64;
77
78        // Template: (exit_code, stderr, category)
79        let templates: &[(i32, &str, ErrorCategory)] = &[
80            // Syntax errors
81            (
82                1,
83                "bash: syntax error near unexpected token 'done'",
84                ErrorCategory::SyntaxUnexpectedToken,
85            ),
86            (
87                1,
88                "bash: unexpected EOF while looking for matching '\"'",
89                ErrorCategory::SyntaxQuoteMismatch,
90            ),
91            (
92                1,
93                "bash: syntax error: unexpected end of file",
94                ErrorCategory::SyntaxBracketMismatch,
95            ),
96            (
97                1,
98                "bash: syntax error near unexpected token ')'",
99                ErrorCategory::SyntaxBracketMismatch,
100            ),
101            (
102                2,
103                "bash: line 5: syntax error: operand expected",
104                ErrorCategory::SyntaxMissingOperand,
105            ),
106            // Command errors
107            (
108                127,
109                "bash: foobar: command not found",
110                ErrorCategory::CommandNotFound,
111            ),
112            (
113                127,
114                "zsh: command not found: nonexistent",
115                ErrorCategory::CommandNotFound,
116            ),
117            (
118                126,
119                "bash: ./script.sh: Permission denied",
120                ErrorCategory::CommandPermissionDenied,
121            ),
122            (
123                1,
124                "grep: invalid option -- 'z'",
125                ErrorCategory::CommandInvalidOption,
126            ),
127            (
128                1,
129                "ls: option requires an argument -- 'w'",
130                ErrorCategory::CommandMissingArgument,
131            ),
132            // File errors
133            (
134                1,
135                "cat: /nonexistent: No such file or directory",
136                ErrorCategory::FileNotFound,
137            ),
138            (
139                1,
140                "rm: cannot remove '/root/secret': Permission denied",
141                ErrorCategory::FilePermissionDenied,
142            ),
143            (
144                1,
145                "cat: /tmp: Is a directory",
146                ErrorCategory::FileIsDirectory,
147            ),
148            (
149                1,
150                "cd: /etc/passwd: Not a directory",
151                ErrorCategory::FileNotDirectory,
152            ),
153            (
154                1,
155                "bash: cannot redirect: Too many open files",
156                ErrorCategory::FileTooManyOpen,
157            ),
158            // Variable errors
159            (
160                1,
161                "bash: VAR: unbound variable",
162                ErrorCategory::VariableUnbound,
163            ),
164            (
165                1,
166                "bash: PATH: readonly variable",
167                ErrorCategory::VariableReadonly,
168            ),
169            (
170                1,
171                "bash: ${foo: bad substitution",
172                ErrorCategory::VariableBadSubstitution,
173            ),
174            // Process errors
175            (141, "", ErrorCategory::PipeBroken), // SIGPIPE = 128 + 13
176            (137, "Killed", ErrorCategory::ProcessSignaled), // SIGKILL = 128 + 9
177            (
178                1,
179                "Command exited with status 1",
180                ErrorCategory::ProcessExitNonZero,
181            ),
182            (
183                124,
184                "timeout: the monitored command timed out",
185                ErrorCategory::ProcessTimeout,
186            ),
187            // Redirect errors
188            (
189                1,
190                "bash: /dev/full: No space left on device",
191                ErrorCategory::RedirectFailed,
192            ),
193            (
194                1,
195                "bash: warning: here-document delimited by end-of-file (wanted 'EOF')",
196                ErrorCategory::HereDocUnterminated,
197            ),
198        ];
199
200        for i in 0..count {
201            // Simple LCG for reproducibility
202            rng_seed = rng_seed
203                .wrapping_mul(6_364_136_223_846_793_005)
204                .wrapping_add(1);
205            let idx = (rng_seed as usize) % templates.len();
206            // Safety: idx is always within bounds due to modulo, but use unwrap_or for clippy
207            let (exit_code, stderr, category) =
208                templates
209                    .get(idx)
210                    .copied()
211                    .unwrap_or((1, "unknown error", ErrorCategory::Unknown));
212
213            // Add variation to make examples more diverse
214            let varied_stderr = match i % 5 {
215                0 => format!("{stderr} (variant {i})"),
216                1 => format!("line {}: {stderr}", (rng_seed % 100) + 1),
217                2 => stderr.to_uppercase(),
218                3 => format!("{stderr}\nAdditional context line"),
219                _ => stderr.to_string(),
220            };
221
222            let command = if i % 3 == 0 {
223                Some(format!("test_command_{}", i % 10))
224            } else {
225                None
226            };
227
228            examples.push(TrainingExample {
229                exit_code,
230                stderr: varied_stderr,
231                command,
232                category,
233            });
234        }
235
236        Self { examples }
237    }
238
239    /// Convert to feature matrix (X) and labels (y).
240    #[must_use]
241    pub fn to_training_data(&self) -> (Vec<Vec<f32>>, Vec<u8>) {
242        let mut x = Vec::with_capacity(self.examples.len());
243        let mut y = Vec::with_capacity(self.examples.len());
244
245        for example in &self.examples {
246            let features = ErrorFeatures::extract(
247                example.exit_code,
248                &example.stderr,
249                example.command.as_deref(),
250            );
251            x.push(features.features);
252            y.push(example.category.to_label_index() as u8);
253        }
254
255        (x, y)
256    }
257
258    /// Number of examples in corpus.
259    #[must_use]
260    pub fn len(&self) -> usize {
261        self.examples.len()
262    }
263
264    /// Check if corpus is empty.
265    #[must_use]
266    pub fn is_empty(&self) -> bool {
267        self.examples.is_empty()
268    }
269
270    /// Get examples as slice.
271    #[must_use]
272    pub fn examples(&self) -> &[TrainingExample] {
273        &self.examples
274    }
275
276    /// Get category distribution for analysis.
277    #[must_use]
278    pub fn category_distribution(&self) -> std::collections::HashMap<ErrorCategory, usize> {
279        let mut dist = std::collections::HashMap::new();
280        for example in &self.examples {
281            *dist.entry(example.category).or_insert(0) += 1;
282        }
283        dist
284    }
285}
286
#[cfg(test)]
mod tests {
    use super::*;

    /// The generator yields exactly as many examples as requested.
    #[test]
    fn test_generate_synthetic() {
        let generated = Corpus::generate_synthetic(100);
        assert_eq!(generated.len(), 100);
    }

    /// Features and labels have one entry per example, and each feature row
    /// has the width declared by `ErrorFeatures::SIZE`.
    #[test]
    fn test_to_training_data() {
        let (features, labels) = Corpus::generate_synthetic(50).to_training_data();

        assert_eq!(features.len(), 50);
        assert_eq!(labels.len(), 50);
        assert_eq!(features[0].len(), ErrorFeatures::SIZE);
    }

    /// A large synthetic corpus covers more than a handful of categories.
    #[test]
    fn test_category_distribution() {
        let distinct_categories = Corpus::generate_synthetic(1000)
            .category_distribution()
            .len();

        assert!(distinct_categories > 5, "Expected diverse categories");
    }

    /// Saving and reloading a corpus preserves the number of examples.
    #[test]
    fn test_corpus_save_load() {
        let temp_dir = tempfile::tempdir().expect("Failed to create temp dir");
        let path = temp_dir.path().join("test_corpus.json");
        let original = Corpus::generate_synthetic(10);

        original.save(&path).expect("Failed to save");
        let reloaded = Corpus::load(&path).expect("Failed to load");

        assert_eq!(original.len(), reloaded.len());
    }

    /// Every produced label is a valid category index.
    #[test]
    fn test_training_labels_valid() {
        let (_, labels) = Corpus::generate_synthetic(100).to_training_data();
        let label_limit = ErrorCategory::COUNT as u8;

        for label in labels {
            assert!(label < label_limit);
        }
    }

    /// A single example survives a JSON round trip.
    #[test]
    fn test_example_serialization() {
        let original = TrainingExample {
            exit_code: 127,
            stderr: "command not found".to_string(),
            command: Some("foo".to_string()),
            category: ErrorCategory::CommandNotFound,
        };

        let encoded = serde_json::to_string(&original).expect("Failed to serialize");
        let decoded: TrainingExample = serde_json::from_str(&encoded).expect("Failed to parse");

        assert_eq!(decoded.exit_code, 127);
        assert_eq!(decoded.category, ErrorCategory::CommandNotFound);
    }
}