Skip to main content

bashrs_oracle/
features.rs

1//! Feature extraction for ML model.
2//!
3//! 64 features carefully chosen to capture error patterns:
4//! - Numeric features normalized to [0, 1]
5//! - Categorical features one-hot encoded
6//! - Text features converted to bag-of-words indicators
7#![allow(clippy::indexing_slicing)] // Test assertions use direct indexing for clarity
8
/// Fixed-length numeric feature vector for the ML model (64 features).
///
/// Built by [`ErrorFeatures::extract`] from an exit code, the captured stderr
/// text and, optionally, the command line. Each slot has a fixed position;
/// [`ErrorFeatures::feature_name`] maps a slot index to a readable label.
#[derive(Debug, Clone)]
pub struct ErrorFeatures {
    /// Feature values. `extract` always produces exactly
    /// [`ErrorFeatures::SIZE`] entries, each within `[0, 1]`.
    pub features: Vec<f32>,
}

impl ErrorFeatures {
    /// Number of entries in every extracted feature vector.
    pub const SIZE: usize = 64;

    /// Extract features from an error message and its context.
    ///
    /// Slot layout (indices match [`ErrorFeatures::feature_name`]):
    ///
    /// | Slots | Content |
    /// |-------|---------|
    /// | 0     | `exit_code / 255`, saturated to `[0, 1]` |
    /// | 1-9   | one-hot flags for exit codes 1, 2, 126, 127, 128, 130, 137, 141, 143 |
    /// | 10-11 | stderr byte length / 1000 and line count / 10, capped at 1 |
    /// | 12-31 | keyword flags (case-insensitive substring match on stderr) |
    /// | 32-37 | quote / bracket counts and odd-count ("mismatch") flags |
    /// | 38-41 | position hints: `line `, `column `/`col `, `near`, `expected` |
    /// | 42-49 | command-line features (all zero when `command` is `None`) |
    /// | 50-57 | shell prefixes plus `cannot` / `failed` |
    /// | 58-63 | severity keywords |
    ///
    /// # Arguments
    ///
    /// * `exit_code` - exit status of the command. Values outside `0..=255`
    ///   are accepted; the normalized slot saturates at 0 or 1 for them.
    /// * `stderr` - captured error output; matched case-insensitively.
    /// * `command` - the command line that was run, if known.
    #[must_use]
    pub fn extract(exit_code: i32, stderr: &str, command: Option<&str>) -> Self {
        // Exit codes with a dedicated one-hot slot, in slot order (1-9):
        // general error, misuse, permission denied, command not found,
        // signal base, SIGINT (Ctrl+C), SIGKILL, SIGPIPE, SIGTERM.
        const ONE_HOT_EXIT_CODES: [i32; 9] = [1, 2, 126, 127, 128, 130, 137, 141, 143];

        // Bag-of-words indicators for slots 12-31. Order is part of the
        // feature layout and must stay in sync with `feature_name`.
        const STDERR_KEYWORDS: [&str; 20] = [
            // File/path related
            "not found",
            "no such file",
            "permission denied",
            "is a directory",
            "not a directory",
            "too many open",
            // Syntax related
            "syntax error",
            "unexpected",
            "unmatched",
            "unterminated",
            // Variable related
            "unbound variable",
            "bad substitution",
            "readonly",
            // Command related
            "command not found",
            "invalid option",
            "missing",
            // Process/pipe related
            "broken pipe",
            "killed",
            "timeout",
            "timed out",
        ];

        // Indicators for slots 50-63: shell prefixes, then generic
        // failure/severity words.
        //
        // NOTE(review): "sh:" is a plain substring test, so it is also set by
        // "bash:", "zsh:", "dash:", "ksh:" and "fish:". Kept as-is because
        // changing it would alter feature values a trained model may rely on
        // - confirm before tightening.
        const TRAILING_KEYWORDS: [&str; 14] = [
            "bash:",
            "sh:",
            "zsh:",
            "dash:",
            "ksh:",
            "fish:",
            "cannot",
            "failed",
            "error",
            "warning",
            "fatal",
            "abort",
            "segmentation",
            "core dump",
        ];

        // Boolean indicator as a feature value.
        fn flag(on: bool) -> f32 {
            if on {
                1.0
            } else {
                0.0
            }
        }

        // `count / full_scale`, capped at 1.0.
        fn saturating_ratio(count: usize, full_scale: f32) -> f32 {
            (count as f32 / full_scale).min(1.0)
        }

        // An odd number of quotes/brackets is treated as a mismatch hint.
        fn is_odd(count: usize) -> bool {
            count & 1 == 1
        }

        let mut features = Vec::with_capacity(Self::SIZE);

        // === Exit code features (0-9) ===
        // Saturate so negative or >255 codes cannot leave the [0, 1] range
        // that every other numeric feature already respects.
        features.push((exit_code as f32 / 255.0).clamp(0.0, 1.0));
        features.extend(
            ONE_HOT_EXIT_CODES
                .iter()
                .map(|&code| flag(exit_code == code)),
        );

        // === Error message length features (10-11) ===
        features.push(saturating_ratio(stderr.len(), 1000.0)); // Byte length
        features.push(saturating_ratio(stderr.lines().count(), 10.0)); // Line count

        // === Keyword indicators - bag of words (12-31) ===
        let stderr_lower = stderr.to_lowercase();
        features.extend(
            STDERR_KEYWORDS
                .iter()
                .map(|&keyword| flag(stderr_lower.contains(keyword))),
        );

        // === Quote/bracket analysis (32-37) ===
        // Single pass over the original (not lowercased) text. Parentheses,
        // square brackets and braces are only ever used as a combined total.
        let mut single_quotes = 0_usize;
        let mut double_quotes = 0_usize;
        let mut delimiters = 0_usize;
        for ch in stderr.chars() {
            match ch {
                '\'' => single_quotes += 1,
                '"' => double_quotes += 1,
                '(' | ')' | '[' | ']' | '{' | '}' => delimiters += 1,
                _ => {}
            }
        }
        features.extend([
            saturating_ratio(single_quotes, 10.0),
            saturating_ratio(double_quotes, 10.0),
            flag(is_odd(single_quotes)),
            flag(is_odd(double_quotes)),
            saturating_ratio(delimiters, 20.0),
            flag(is_odd(delimiters)),
        ]);

        // === Line position features (38-41) ===
        features.extend([
            flag(stderr_lower.contains("line ")),
            flag(stderr_lower.contains("column ") || stderr_lower.contains("col ")),
            flag(stderr_lower.contains("near")),
            flag(stderr_lower.contains("expected")),
        ]);

        // === Command features (42-49) ===
        match command {
            Some(cmd) => features.extend([
                saturating_ratio(cmd.len(), 100.0),             // Command length
                flag(cmd.contains('|')),                        // Pipeline
                flag(cmd.contains('>')),                        // Output redirect
                flag(cmd.contains('<')),                        // Input redirect
                flag(cmd.contains("2>")),                       // Stderr redirect
                // NOTE(review): prefix test only - also true for e.g.
                // "sudoku", false for " sudo ...". Preserved unchanged.
                flag(cmd.starts_with("sudo")),                  // Sudo
                flag(cmd.contains("&&") || cmd.contains("||")), // Compound
                flag(cmd.contains('$')),                        // Variables
            ]),
            // Unknown command: keep the layout stable with neutral zeros.
            None => features.extend([0.0; 8]),
        }

        // === Shell-specific keywords (50-57) and error indicators (58-63) ===
        features.extend(
            TRAILING_KEYWORDS
                .iter()
                .map(|&keyword| flag(stderr_lower.contains(keyword))),
        );

        // Ensure exactly 64 features
        debug_assert_eq!(features.len(), Self::SIZE, "Feature count mismatch");

        Self { features }
    }

    /// Borrow the feature values as a slice for the ML model.
    #[must_use]
    pub fn as_slice(&self) -> &[f32] {
        &self.features
    }

    /// Human-readable name of the feature stored at `index`, for debugging.
    ///
    /// Returns `"unknown"` when `index` is not a valid slot
    /// (`index >= ErrorFeatures::SIZE`).
    #[must_use]
    pub fn feature_name(index: usize) -> &'static str {
        // Sized by `SIZE` so adding a feature without naming it (or the
        // reverse) fails to compile instead of silently drifting.
        const NAMES: [&str; ErrorFeatures::SIZE] = [
            "exit_code_normalized",
            "exit_code_is_1",
            "exit_code_is_2",
            "exit_code_is_126",
            "exit_code_is_127",
            "exit_code_is_128",
            "signal_sigint",
            "signal_sigkill",
            "signal_sigpipe",
            "signal_sigterm",
            "stderr_length",
            "stderr_line_count",
            "kw_not_found",
            "kw_no_such_file",
            "kw_permission_denied",
            "kw_is_directory",
            "kw_not_directory",
            "kw_too_many_open",
            "kw_syntax_error",
            "kw_unexpected",
            "kw_unmatched",
            "kw_unterminated",
            "kw_unbound_variable",
            "kw_bad_substitution",
            "kw_readonly",
            "kw_command_not_found",
            "kw_invalid_option",
            "kw_missing",
            "kw_broken_pipe",
            "kw_killed",
            "kw_timeout",
            "kw_timed_out",
            "single_quote_count",
            "double_quote_count",
            "single_quote_mismatch",
            "double_quote_mismatch",
            "bracket_count",
            "bracket_mismatch",
            "has_line_number",
            "has_column",
            "has_near",
            "has_expected",
            "cmd_length",
            "cmd_has_pipe",
            "cmd_has_output_redirect",
            "cmd_has_input_redirect",
            "cmd_has_stderr_redirect",
            "cmd_has_sudo",
            "cmd_is_compound",
            "cmd_has_variables",
            "shell_bash",
            "shell_sh",
            "shell_zsh",
            "shell_dash",
            "shell_ksh",
            "shell_fish",
            "kw_cannot",
            "kw_failed",
            "kw_error",
            "kw_warning",
            "kw_fatal",
            "kw_abort",
            "kw_segmentation",
            "kw_core_dump",
        ];
        NAMES.get(index).copied().unwrap_or("unknown")
    }
}
376
#[cfg(test)]
mod tests {
    use super::*;

    /// Value stored in slot `index` of an extracted feature vector.
    fn slot(extracted: &ErrorFeatures, index: usize) -> f32 {
        extracted.features[index]
    }

    /// Whether a feature value equals 1.0 within `f32::EPSILON`.
    fn is_set(value: f32) -> bool {
        (value - 1.0).abs() < f32::EPSILON
    }

    /// The vector length always equals `ErrorFeatures::SIZE`.
    #[test]
    fn test_feature_vector_size() {
        let extracted = ErrorFeatures::extract(1, "test error", None);
        assert_eq!(extracted.features.len(), ErrorFeatures::SIZE);
    }

    /// Exit code 127 with a "command not found" message sets both indicators.
    #[test]
    fn test_exit_code_127_features() {
        let extracted = ErrorFeatures::extract(127, "bash: foobar: command not found", None);
        assert!(is_set(slot(&extracted, 4))); // exit_code_is_127
        assert!(is_set(slot(&extracted, 25))); // kw_command_not_found
    }

    /// Exit code 126 with a "Permission denied" message sets both indicators.
    #[test]
    fn test_exit_code_126_features() {
        let extracted = ErrorFeatures::extract(126, "bash: ./script.sh: Permission denied", None);
        assert!(is_set(slot(&extracted, 3))); // exit_code_is_126
        assert!(is_set(slot(&extracted, 14))); // kw_permission_denied
    }

    /// A bash syntax error message triggers the syntax-related keywords.
    #[test]
    fn test_syntax_error_features() {
        let stderr = "bash: syntax error near unexpected token 'done'";
        let extracted = ErrorFeatures::extract(1, stderr, None);
        assert!(is_set(slot(&extracted, 18))); // kw_syntax_error
        assert!(is_set(slot(&extracted, 19))); // kw_unexpected
        assert!(is_set(slot(&extracted, 40))); // has_near
    }

    /// A single double quote in the message is reported as a mismatch.
    #[test]
    fn test_quote_mismatch_detection() {
        let stderr = "unexpected EOF looking for matching '\"'";
        let extracted = ErrorFeatures::extract(1, stderr, None);
        assert!(is_set(slot(&extracted, 35))); // double_quote_mismatch (odd count)
    }

    /// Pipe and output redirect in the command are detected.
    #[test]
    fn test_command_features() {
        let command = "cat file.txt | grep 'test' > output.txt";
        let extracted = ErrorFeatures::extract(1, "error", Some(command));
        assert!(slot(&extracted, 43) > 0.0); // cmd_has_pipe
        assert!(slot(&extracted, 44) > 0.0); // cmd_has_output_redirect
    }

    /// Exit code 141 is flagged as SIGPIPE even with empty stderr.
    #[test]
    fn test_signal_features() {
        let extracted = ErrorFeatures::extract(141, "", None); // SIGPIPE
        assert!(is_set(slot(&extracted, 8))); // signal_sigpipe
    }

    /// Shell prefixes in stderr set the matching shell indicator.
    #[test]
    fn test_shell_detection() {
        let from_bash = ErrorFeatures::extract(1, "bash: error", None);
        assert!(is_set(slot(&from_bash, 50))); // shell_bash

        let from_zsh = ErrorFeatures::extract(1, "zsh: error", None);
        assert!(is_set(slot(&from_zsh, 52))); // shell_zsh
    }

    /// "No such file" is matched case-insensitively.
    #[test]
    fn test_file_not_found_features() {
        let stderr = "cat: /nonexistent: No such file or directory";
        let extracted = ErrorFeatures::extract(1, stderr, None);
        assert!(is_set(slot(&extracted, 13))); // kw_no_such_file
    }

    /// "unbound variable" sets its keyword indicator.
    #[test]
    fn test_unbound_variable_features() {
        let extracted = ErrorFeatures::extract(1, "bash: VAR: unbound variable", None);
        assert!(is_set(slot(&extracted, 22))); // kw_unbound_variable
    }

    /// Every valid slot index has a real (non-"unknown") name.
    #[test]
    fn test_feature_names_coverage() {
        for i in 0..ErrorFeatures::SIZE {
            assert_ne!(
                ErrorFeatures::feature_name(i),
                "unknown",
                "Feature {i} has no name"
            );
        }
    }

    /// Oversized stderr and command inputs still yield values in `[0, 1]`.
    #[test]
    fn test_normalization_bounds() {
        // Very long error message
        let long_stderr = "x".repeat(10000);
        let long_command = "x".repeat(1000);
        let extracted = ErrorFeatures::extract(255, &long_stderr, Some(long_command.as_str()));

        for (i, &val) in extracted.features.iter().enumerate() {
            assert!(
                (0.0..=1.0).contains(&val),
                "Feature {i} ({}) out of bounds: {val}",
                ErrorFeatures::feature_name(i)
            );
        }
    }
}