bashrs_oracle/
features.rs

1//! Feature extraction for ML model.
2//!
3//! 64 features carefully chosen to capture error patterns:
4//! - Numeric features normalized to [0, 1]
5//! - Categorical features one-hot encoded
6//! - Text features converted to bag-of-words indicators
7
/// Feature vector for ML model (64 features).
///
/// Built by [`ErrorFeatures::extract`]; the meaning of each index is given by
/// [`ErrorFeatures::feature_name`]. Every value lies in `[0, 1]`.
#[derive(Debug, Clone)]
pub struct ErrorFeatures {
    /// Feature vector (always 64 elements).
    pub features: Vec<f32>,
}

/// Exit codes that each get a dedicated indicator feature (indices 1-9).
const EXIT_CODE_INDICATORS: [i32; 9] = [
    1,   // General error
    2,   // Misuse
    126, // Permission denied
    127, // Command not found
    128, // Signal base
    130, // SIGINT (Ctrl+C)
    137, // SIGKILL
    141, // SIGPIPE
    143, // SIGTERM
];

/// Lower-case substrings looked up in stderr for the bag-of-words features (indices 12-31).
const STDERR_KEYWORDS: [&str; 20] = [
    // File/path related
    "not found",
    "no such file",
    "permission denied",
    "is a directory",
    "not a directory",
    "too many open",
    // Syntax related
    "syntax error",
    "unexpected",
    "unmatched",
    "unterminated",
    // Variable related
    "unbound variable",
    "bad substitution",
    "readonly",
    // Command related
    "command not found",
    "invalid option",
    "missing",
    // Process/pipe related
    "broken pipe",
    "killed",
    "timeout",
    "timed out",
];

/// Shell prefixes and generic failure words that follow `bash:` and `sh:` (indices 52-57).
const SHELL_TAIL_KEYWORDS: [&str; 6] = ["zsh:", "dash:", "ksh:", "fish:", "cannot", "failed"];

/// Lower-case substrings for the additional error indicators (indices 58-63).
const SEVERITY_KEYWORDS: [&str; 6] = [
    "error",
    "warning",
    "fatal",
    "abort",
    "segmentation",
    "core dump",
];

/// Human-readable name of every feature, in vector order.
///
/// The array length is tied to [`ErrorFeatures::SIZE`], so adding a feature
/// without naming it fails to compile.
const FEATURE_NAMES: [&str; ErrorFeatures::SIZE] = [
    "exit_code_normalized",
    "exit_code_is_1",
    "exit_code_is_2",
    "exit_code_is_126",
    "exit_code_is_127",
    "exit_code_is_128",
    "signal_sigint",
    "signal_sigkill",
    "signal_sigpipe",
    "signal_sigterm",
    "stderr_length",
    "stderr_line_count",
    "kw_not_found",
    "kw_no_such_file",
    "kw_permission_denied",
    "kw_is_directory",
    "kw_not_directory",
    "kw_too_many_open",
    "kw_syntax_error",
    "kw_unexpected",
    "kw_unmatched",
    "kw_unterminated",
    "kw_unbound_variable",
    "kw_bad_substitution",
    "kw_readonly",
    "kw_command_not_found",
    "kw_invalid_option",
    "kw_missing",
    "kw_broken_pipe",
    "kw_killed",
    "kw_timeout",
    "kw_timed_out",
    "single_quote_count",
    "double_quote_count",
    "single_quote_mismatch",
    "double_quote_mismatch",
    "bracket_count",
    "bracket_mismatch",
    "has_line_number",
    "has_column",
    "has_near",
    "has_expected",
    "cmd_length",
    "cmd_has_pipe",
    "cmd_has_output_redirect",
    "cmd_has_input_redirect",
    "cmd_has_stderr_redirect",
    "cmd_has_sudo",
    "cmd_is_compound",
    "cmd_has_variables",
    "shell_bash",
    "shell_sh",
    "shell_zsh",
    "shell_dash",
    "shell_ksh",
    "shell_fish",
    "kw_cannot",
    "kw_failed",
    "kw_error",
    "kw_warning",
    "kw_fatal",
    "kw_abort",
    "kw_segmentation",
    "kw_core_dump",
];

/// Maps a boolean onto a `1.0` / `0.0` indicator value.
fn indicator(present: bool) -> f32 {
    if present {
        1.0
    } else {
        0.0
    }
}

/// Divides `count` by `scale` and saturates the result at `1.0`.
fn saturating_ratio(count: usize, scale: f32) -> f32 {
    (count as f32 / scale).min(1.0)
}

/// Reports whether `count` is odd (an odd number of delimiters cannot be balanced).
fn is_odd(count: usize) -> bool {
    count % 2 == 1
}

/// Appends one indicator per keyword, set when `haystack` contains that keyword.
fn push_keyword_flags(features: &mut Vec<f32>, haystack: &str, keywords: &[&str]) {
    features.extend(
        keywords
            .iter()
            .map(|&keyword| indicator(haystack.contains(keyword))),
    );
}

/// Reports whether `lowercased` mentions `sh:` on its own, i.e. at the start of
/// the text or right after a non-alphanumeric character.
///
/// A plain substring search would also fire on `bash:`, `zsh:`, `dash:`,
/// `ksh:` and `fish:`, which all end in `sh:` and have indicators of their own.
fn has_plain_sh_prefix(lowercased: &str) -> bool {
    lowercased.match_indices("sh:").any(|(start, _)| {
        lowercased[..start]
            .chars()
            .next_back()
            .map_or(true, |previous| !previous.is_alphanumeric())
    })
}

/// Computes the eight command-shape features (indices 42-49) for `cmd`.
fn command_features(cmd: &str) -> [f32; 8] {
    [
        saturating_ratio(cmd.len(), 100.0),                   // Command length
        indicator(cmd.contains('|')),                         // Pipeline
        indicator(cmd.contains('>')),                         // Output redirect
        indicator(cmd.contains('<')),                         // Input redirect
        indicator(cmd.contains("2>")),                        // Stderr redirect
        indicator(cmd.starts_with("sudo")),                   // Sudo (prefix match)
        indicator(cmd.contains("&&") || cmd.contains("||")), // Compound
        indicator(cmd.contains('$')),                         // Variables
    ]
}

impl ErrorFeatures {
    /// Feature vector size.
    pub const SIZE: usize = 64;

    /// Extract features from error message and context.
    ///
    /// * `exit_code` - process exit status. The normalized feature divides it
    ///   by 255 and clamps the result to `[0, 1]`, so negative or oversized
    ///   codes cannot escape the documented range.
    /// * `stderr` - captured error output. Keyword checks run on a lower-cased
    ///   copy; quote and bracket counts use the text as given.
    /// * `command` - the command that was run, if known. `None` yields zeros
    ///   for all eight command features.
    ///
    /// Returns a vector of exactly [`Self::SIZE`] values.
    #[must_use]
    pub fn extract(exit_code: i32, stderr: &str, command: Option<&str>) -> Self {
        let mut features = Vec::with_capacity(Self::SIZE);

        // === Exit code and signal features (0-9) ===
        features.push((exit_code as f32 / 255.0).clamp(0.0, 1.0));
        features.extend(
            EXIT_CODE_INDICATORS
                .iter()
                .map(|&code| indicator(exit_code == code)),
        );

        // === Error message length features (10-11) ===
        features.push(saturating_ratio(stderr.len(), 1000.0)); // Length in bytes
        features.push(saturating_ratio(stderr.lines().count(), 10.0)); // Line count

        // === Keyword indicators - bag of words (12-31) ===
        let stderr_lower = stderr.to_lowercase();
        push_keyword_flags(&mut features, &stderr_lower, &STDERR_KEYWORDS);

        // === Quote/bracket analysis (32-37) ===
        // One pass over the text; parentheses, square brackets and braces are
        // only ever used as a combined total.
        let mut single_quotes = 0usize;
        let mut double_quotes = 0usize;
        let mut grouping = 0usize;
        for ch in stderr.chars() {
            match ch {
                '\'' => single_quotes += 1,
                '"' => double_quotes += 1,
                '(' | ')' | '[' | ']' | '{' | '}' => grouping += 1,
                _ => {}
            }
        }
        features.push(saturating_ratio(single_quotes, 10.0));
        features.push(saturating_ratio(double_quotes, 10.0));
        features.push(indicator(is_odd(single_quotes))); // Odd = mismatch
        features.push(indicator(is_odd(double_quotes))); // Odd = mismatch
        features.push(saturating_ratio(grouping, 20.0));
        features.push(indicator(is_odd(grouping))); // Odd = mismatch

        // === Line position features (38-41) ===
        features.push(indicator(stderr_lower.contains("line ")));
        features.push(indicator(
            stderr_lower.contains("column ") || stderr_lower.contains("col "),
        ));
        features.push(indicator(stderr_lower.contains("near")));
        features.push(indicator(stderr_lower.contains("expected")));

        // === Command features (42-49) ===
        features.extend(command.map_or([0.0; 8], command_features));

        // === Shell-specific keywords (50-57) ===
        features.push(indicator(stderr_lower.contains("bash:")));
        // `sh:` needs a boundary check so the other shells do not set it too.
        // NOTE(review): this changes the feature's values for inputs that name
        // another shell; a model trained on the old overlapping feature may
        // need retraining.
        features.push(indicator(has_plain_sh_prefix(&stderr_lower)));
        push_keyword_flags(&mut features, &stderr_lower, &SHELL_TAIL_KEYWORDS);

        // === Additional error indicators (58-63) ===
        push_keyword_flags(&mut features, &stderr_lower, &SEVERITY_KEYWORDS);

        // Ensure exactly 64 features
        debug_assert_eq!(features.len(), Self::SIZE, "Feature count mismatch");

        Self { features }
    }

    /// Convert to slice for ML model.
    #[must_use]
    pub fn as_slice(&self) -> &[f32] {
        &self.features
    }

    /// Get feature by index with name for debugging.
    ///
    /// Returns `"unknown"` when `index` is not below [`Self::SIZE`].
    #[must_use]
    pub fn feature_name(index: usize) -> &'static str {
        FEATURE_NAMES.get(index).copied().unwrap_or("unknown")
    }
}
375
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs extraction without a command, the common case in these tests.
    fn extract_stderr(exit_code: i32, stderr: &str) -> ErrorFeatures {
        ErrorFeatures::extract(exit_code, stderr, None)
    }

    /// Reports whether the feature at `index` equals 1.0 within `f32::EPSILON`.
    fn is_set(extracted: &ErrorFeatures, index: usize) -> bool {
        (extracted.features[index] - 1.0).abs() < f32::EPSILON
    }

    #[test]
    fn test_feature_vector_size() {
        let extracted = extract_stderr(1, "test error");
        assert_eq!(extracted.features.len(), ErrorFeatures::SIZE);
    }

    #[test]
    fn test_exit_code_127_features() {
        let extracted = extract_stderr(127, "bash: foobar: command not found");
        assert!(is_set(&extracted, 4)); // exit_code_is_127
        assert!(is_set(&extracted, 25)); // kw_command_not_found
    }

    #[test]
    fn test_exit_code_126_features() {
        let extracted = extract_stderr(126, "bash: ./script.sh: Permission denied");
        assert!(is_set(&extracted, 3)); // exit_code_is_126
        assert!(is_set(&extracted, 14)); // kw_permission_denied
    }

    #[test]
    fn test_syntax_error_features() {
        let extracted = extract_stderr(1, "bash: syntax error near unexpected token 'done'");
        // kw_syntax_error, kw_unexpected, has_near
        for index in [18usize, 19, 40].iter().copied() {
            assert!(is_set(&extracted, index));
        }
    }

    #[test]
    fn test_quote_mismatch_detection() {
        // A single double quote in the message gives an odd count.
        let extracted = extract_stderr(1, "unexpected EOF looking for matching '\"'");
        assert!(is_set(&extracted, 35)); // double_quote_mismatch
    }

    #[test]
    fn test_command_features() {
        let command = "cat file.txt | grep 'test' > output.txt";
        let extracted = ErrorFeatures::extract(1, "error", Some(command));
        assert!(extracted.features[43] > 0.0); // cmd_has_pipe
        assert!(extracted.features[44] > 0.0); // cmd_has_output_redirect
    }

    #[test]
    fn test_signal_features() {
        // Exit code 141 is the SIGPIPE indicator.
        let extracted = extract_stderr(141, "");
        assert!(is_set(&extracted, 8)); // signal_sigpipe
    }

    #[test]
    fn test_shell_detection() {
        let from_bash = extract_stderr(1, "bash: error");
        assert!(is_set(&from_bash, 50)); // shell_bash

        let from_zsh = extract_stderr(1, "zsh: error");
        assert!(is_set(&from_zsh, 52)); // shell_zsh
    }

    #[test]
    fn test_file_not_found_features() {
        let extracted = extract_stderr(1, "cat: /nonexistent: No such file or directory");
        assert!(is_set(&extracted, 13)); // kw_no_such_file
    }

    #[test]
    fn test_unbound_variable_features() {
        let extracted = extract_stderr(1, "bash: VAR: unbound variable");
        assert!(is_set(&extracted, 22)); // kw_unbound_variable
    }

    #[test]
    fn test_feature_names_coverage() {
        // Every index below SIZE must map to a real name.
        (0..ErrorFeatures::SIZE).for_each(|i| {
            assert_ne!(
                ErrorFeatures::feature_name(i),
                "unknown",
                "Feature {i} has no name"
            );
        });
    }

    #[test]
    fn test_normalization_bounds() {
        // Oversized stderr and command must still saturate at 1.0.
        let long_stderr = "x".repeat(10000);
        let long_command = "x".repeat(1000);
        let extracted = ErrorFeatures::extract(255, &long_stderr, Some(&long_command));

        for (i, val) in extracted.features.iter().copied().enumerate() {
            assert!(
                (0.0..=1.0).contains(&val),
                "Feature {i} ({}) out of bounds: {val}",
                ErrorFeatures::feature_name(i)
            );
        }
    }
}