Skip to main content

bashrs_oracle/
corpus.rs

1//! Training corpus management for ML model.
2#![allow(clippy::indexing_slicing, clippy::expect_used)] // Test code uses expect and indexing
3
4use crate::categories::ErrorCategory;
5use crate::features::ErrorFeatures;
6use serde::{Deserialize, Serialize};
7use std::path::Path;
8
9/// Training example with features and label.
10#[derive(Debug, Clone, Serialize, Deserialize)]
11pub struct TrainingExample {
12    /// Exit code from command execution.
13    pub exit_code: i32,
14    /// Standard error output.
15    pub stderr: String,
16    /// Optional command that was executed.
17    pub command: Option<String>,
18    /// Error category label.
19    pub category: ErrorCategory,
20}
21
22/// Training corpus management.
23pub struct Corpus {
24    examples: Vec<TrainingExample>,
25}
26
27impl Default for Corpus {
28    fn default() -> Self {
29        Self::new()
30    }
31}
32
33impl Corpus {
34    /// Create empty corpus.
35    #[must_use]
36    pub fn new() -> Self {
37        Self {
38            examples: Vec::new(),
39        }
40    }
41
42    /// Create corpus from examples.
43    #[must_use]
44    pub fn from_examples(examples: Vec<TrainingExample>) -> Self {
45        Self { examples }
46    }
47
48    /// Load corpus from JSON file.
49    ///
50    /// # Errors
51    /// Returns error if file cannot be read or parsed.
52    pub fn load(path: &Path) -> anyhow::Result<Self> {
53        let content = std::fs::read_to_string(path)?;
54        let examples: Vec<TrainingExample> = serde_json::from_str(&content)?;
55        Ok(Self { examples })
56    }
57
58    /// Save corpus to JSON file.
59    ///
60    /// # Errors
61    /// Returns error if file cannot be written.
62    pub fn save(&self, path: &Path) -> anyhow::Result<()> {
63        let content = serde_json::to_string_pretty(&self.examples)?;
64        std::fs::write(path, content)?;
65        Ok(())
66    }
67
68    /// Add a training example.
69    pub fn add(&mut self, example: TrainingExample) {
70        self.examples.push(example);
71    }
72
73    /// Generate synthetic training data.
74    #[must_use]
75    pub fn generate_synthetic(count: usize) -> Self {
76        let mut examples = Vec::with_capacity(count);
77        let mut rng_seed = 42u64;
78
79        // Template: (exit_code, stderr, category)
80        let templates: &[(i32, &str, ErrorCategory)] = &[
81            // Syntax errors
82            (
83                1,
84                "bash: syntax error near unexpected token 'done'",
85                ErrorCategory::SyntaxUnexpectedToken,
86            ),
87            (
88                1,
89                "bash: unexpected EOF while looking for matching '\"'",
90                ErrorCategory::SyntaxQuoteMismatch,
91            ),
92            (
93                1,
94                "bash: syntax error: unexpected end of file",
95                ErrorCategory::SyntaxBracketMismatch,
96            ),
97            (
98                1,
99                "bash: syntax error near unexpected token ')'",
100                ErrorCategory::SyntaxBracketMismatch,
101            ),
102            (
103                2,
104                "bash: line 5: syntax error: operand expected",
105                ErrorCategory::SyntaxMissingOperand,
106            ),
107            // Command errors
108            (
109                127,
110                "bash: foobar: command not found",
111                ErrorCategory::CommandNotFound,
112            ),
113            (
114                127,
115                "zsh: command not found: nonexistent",
116                ErrorCategory::CommandNotFound,
117            ),
118            (
119                126,
120                "bash: ./script.sh: Permission denied",
121                ErrorCategory::CommandPermissionDenied,
122            ),
123            (
124                1,
125                "grep: invalid option -- 'z'",
126                ErrorCategory::CommandInvalidOption,
127            ),
128            (
129                1,
130                "ls: option requires an argument -- 'w'",
131                ErrorCategory::CommandMissingArgument,
132            ),
133            // File errors
134            (
135                1,
136                "cat: /nonexistent: No such file or directory",
137                ErrorCategory::FileNotFound,
138            ),
139            (
140                1,
141                "rm: cannot remove '/root/secret': Permission denied",
142                ErrorCategory::FilePermissionDenied,
143            ),
144            (
145                1,
146                "cat: /tmp: Is a directory",
147                ErrorCategory::FileIsDirectory,
148            ),
149            (
150                1,
151                "cd: /etc/passwd: Not a directory",
152                ErrorCategory::FileNotDirectory,
153            ),
154            (
155                1,
156                "bash: cannot redirect: Too many open files",
157                ErrorCategory::FileTooManyOpen,
158            ),
159            // Variable errors
160            (
161                1,
162                "bash: VAR: unbound variable",
163                ErrorCategory::VariableUnbound,
164            ),
165            (
166                1,
167                "bash: PATH: readonly variable",
168                ErrorCategory::VariableReadonly,
169            ),
170            (
171                1,
172                "bash: ${foo: bad substitution",
173                ErrorCategory::VariableBadSubstitution,
174            ),
175            // Process errors
176            (141, "", ErrorCategory::PipeBroken), // SIGPIPE = 128 + 13
177            (137, "Killed", ErrorCategory::ProcessSignaled), // SIGKILL = 128 + 9
178            (
179                1,
180                "Command exited with status 1",
181                ErrorCategory::ProcessExitNonZero,
182            ),
183            (
184                124,
185                "timeout: the monitored command timed out",
186                ErrorCategory::ProcessTimeout,
187            ),
188            // Redirect errors
189            (
190                1,
191                "bash: /dev/full: No space left on device",
192                ErrorCategory::RedirectFailed,
193            ),
194            (
195                1,
196                "bash: warning: here-document delimited by end-of-file (wanted 'EOF')",
197                ErrorCategory::HereDocUnterminated,
198            ),
199        ];
200
201        for i in 0..count {
202            // Simple LCG for reproducibility
203            rng_seed = rng_seed
204                .wrapping_mul(6_364_136_223_846_793_005)
205                .wrapping_add(1);
206            let idx = (rng_seed as usize) % templates.len();
207            // Safety: idx is always within bounds due to modulo, but use unwrap_or for clippy
208            let (exit_code, stderr, category) =
209                templates
210                    .get(idx)
211                    .copied()
212                    .unwrap_or((1, "unknown error", ErrorCategory::Unknown));
213
214            // Add variation to make examples more diverse
215            let varied_stderr = match i % 5 {
216                0 => format!("{stderr} (variant {i})"),
217                1 => format!("line {}: {stderr}", (rng_seed % 100) + 1),
218                2 => stderr.to_uppercase(),
219                3 => format!("{stderr}\nAdditional context line"),
220                _ => stderr.to_string(),
221            };
222
223            let command = if i % 3 == 0 {
224                Some(format!("test_command_{}", i % 10))
225            } else {
226                None
227            };
228
229            examples.push(TrainingExample {
230                exit_code,
231                stderr: varied_stderr,
232                command,
233                category,
234            });
235        }
236
237        Self { examples }
238    }
239
240    /// Convert to feature matrix (X) and labels (y).
241    #[must_use]
242    pub fn to_training_data(&self) -> (Vec<Vec<f32>>, Vec<u8>) {
243        let mut x = Vec::with_capacity(self.examples.len());
244        let mut y = Vec::with_capacity(self.examples.len());
245
246        for example in &self.examples {
247            let features = ErrorFeatures::extract(
248                example.exit_code,
249                &example.stderr,
250                example.command.as_deref(),
251            );
252            x.push(features.features);
253            y.push(example.category.to_label_index() as u8);
254        }
255
256        (x, y)
257    }
258
259    /// Number of examples in corpus.
260    #[must_use]
261    pub fn len(&self) -> usize {
262        self.examples.len()
263    }
264
265    /// Check if corpus is empty.
266    #[must_use]
267    pub fn is_empty(&self) -> bool {
268        self.examples.is_empty()
269    }
270
271    /// Get examples as slice.
272    #[must_use]
273    pub fn examples(&self) -> &[TrainingExample] {
274        &self.examples
275    }
276
277    /// Get category distribution for analysis.
278    #[must_use]
279    pub fn category_distribution(&self) -> std::collections::HashMap<ErrorCategory, usize> {
280        let mut dist = std::collections::HashMap::new();
281        for example in &self.examples {
282            *dist.entry(example.category).or_insert(0) += 1;
283        }
284        dist
285    }
286}
287
#[cfg(test)]
mod tests {
    use super::*;

    /// The generator yields exactly as many examples as requested,
    /// including none at all.
    #[test]
    fn test_generate_synthetic() {
        let corpus = Corpus::generate_synthetic(100);
        assert_eq!(corpus.len(), 100);
        assert!(!corpus.is_empty());

        assert!(Corpus::generate_synthetic(0).is_empty());
    }

    /// Feature matrix and label vector have one entry per example, and
    /// every feature row has the expected width.
    #[test]
    fn test_to_training_data() {
        let corpus = Corpus::generate_synthetic(50);
        let (x, y) = corpus.to_training_data();

        assert_eq!(x.len(), 50);
        assert_eq!(y.len(), 50);
        for row in &x {
            assert_eq!(row.len(), ErrorFeatures::SIZE);
        }
    }

    /// A large synthetic corpus covers many categories, and the per-category
    /// counts add up to the corpus size.
    #[test]
    fn test_category_distribution() {
        let corpus = Corpus::generate_synthetic(1000);
        let dist = corpus.category_distribution();

        // Should have multiple categories represented
        assert!(dist.len() > 5, "Expected diverse categories");
        assert_eq!(dist.values().sum::<usize>(), corpus.len());
    }

    /// Saving and loading a corpus preserves every field of every example,
    /// not just the number of examples.
    #[test]
    fn test_corpus_save_load() {
        let corpus = Corpus::generate_synthetic(10);
        let temp_dir = tempfile::tempdir().expect("Failed to create temp dir");
        let path = temp_dir.path().join("test_corpus.json");

        corpus.save(&path).expect("Failed to save");
        let loaded = Corpus::load(&path).expect("Failed to load");

        assert_eq!(corpus.len(), loaded.len());
        for (original, restored) in corpus.examples().iter().zip(loaded.examples()) {
            assert_eq!(original.exit_code, restored.exit_code);
            assert_eq!(original.stderr, restored.stderr);
            assert_eq!(original.command, restored.command);
            assert_eq!(original.category, restored.category);
        }
    }

    /// Every generated label is a valid category index.
    #[test]
    fn test_training_labels_valid() {
        let corpus = Corpus::generate_synthetic(100);
        let (_, y) = corpus.to_training_data();

        for label in y {
            // Labels should be valid indices, i.e. below ErrorCategory::COUNT
            assert!(label < ErrorCategory::COUNT as u8);
        }
    }

    /// A single example survives a JSON round trip with all fields intact.
    #[test]
    fn test_example_serialization() {
        let example = TrainingExample {
            exit_code: 127,
            stderr: "command not found".to_string(),
            command: Some("foo".to_string()),
            category: ErrorCategory::CommandNotFound,
        };

        let json = serde_json::to_string(&example).expect("Failed to serialize");
        let parsed: TrainingExample = serde_json::from_str(&json).expect("Failed to parse");

        assert_eq!(parsed.exit_code, 127);
        assert_eq!(parsed.stderr, "command not found");
        assert_eq!(parsed.command.as_deref(), Some("foo"));
        assert_eq!(parsed.category, ErrorCategory::CommandNotFound);
    }
}
355}