aprender_shell/
history.rs

1//! Shell history file parsing
2
3use std::fs::File;
4use std::io::Read;
5use std::path::PathBuf;
6
7/// Parser for shell history files
8pub struct HistoryParser;
9
10impl HistoryParser {
11    pub fn new() -> Self {
12        Self
13    }
14
15    /// Auto-detect the shell history file
16    pub fn find_history_file() -> Option<PathBuf> {
17        let home = dirs::home_dir()?;
18
19        // Try in order of preference
20        let candidates = [
21            home.join(".zsh_history"),
22            home.join(".bash_history"),
23            home.join(".local/share/fish/fish_history"),
24            home.join(".history"),
25        ];
26
27        candidates.into_iter().find(|p| p.exists())
28    }
29
30    /// Parse a history file into commands
31    pub fn parse_file(&self, path: &PathBuf) -> std::io::Result<Vec<String>> {
32        let mut file = File::open(path)?;
33        let mut bytes = Vec::new();
34        file.read_to_end(&mut bytes)?;
35
36        // Convert to string, replacing invalid UTF-8 with replacement char
37        let content = String::from_utf8_lossy(&bytes);
38
39        let mut commands = Vec::new();
40
41        for line in content.lines() {
42            if let Some(cmd) = self.parse_line(line) {
43                if !cmd.is_empty() && self.is_valid_command(&cmd) {
44                    commands.push(cmd);
45                }
46            }
47        }
48
49        Ok(commands)
50    }
51
52    /// Parse a single history line (handles zsh extended format)
53    ///
54    /// Handles:
55    /// - ZSH extended format (`: timestamp:0;command`)
56    /// - Fish format (`- cmd: command`)
57    /// - Plain format (bash)
58    /// - Comment stripping
59    /// - Shell no-op filtering
60    fn parse_line(&self, line: &str) -> Option<String> {
61        let line = line.trim();
62
63        if line.is_empty() {
64            return None;
65        }
66
67        // Skip comment-only lines (issue #91)
68        if line.starts_with('#') {
69            return None;
70        }
71
72        // Skip shell no-ops (issue #91)
73        // `: ` is a valid ZSH timestamp prefix, but `: ` followed by non-numeric is a no-op
74        if line == ":"
75            || (line.starts_with(": ") && !line.chars().nth(2).is_some_and(|c| c.is_ascii_digit()))
76        {
77            // Check if it's NOT a ZSH timestamp (starts with digit after ": ")
78            if !line.starts_with(": ") || !line.chars().nth(2).is_some_and(|c| c.is_ascii_digit()) {
79                return None;
80            }
81        }
82
83        // ZSH extended history format: ": timestamp:0;command"
84        if line.starts_with(": ") {
85            if let Some(pos) = line.find(';') {
86                let cmd = &line[pos + 1..];
87                return Some(self.strip_inline_comment(cmd));
88            }
89        }
90
91        // Fish history format: "- cmd: command"
92        if let Some(cmd) = line.strip_prefix("- cmd: ") {
93            return Some(self.strip_inline_comment(cmd));
94        }
95
96        // Plain format (bash) - strip inline comments
97        Some(self.strip_inline_comment(line))
98    }
99
100    /// Strip inline comments from commands while preserving quoted strings
101    ///
102    /// # Examples
103    /// - `git status # check` -> `git status`
104    /// - `echo "hello #world"` -> `echo "hello #world"` (preserved in quotes)
105    /// - `gh issue view #123` -> `gh issue view #123` (preserved - issue number)
106    fn strip_inline_comment(&self, cmd: &str) -> String {
107        let mut result = String::with_capacity(cmd.len());
108        let mut in_single_quote = false;
109        let mut in_double_quote = false;
110        let chars: Vec<char> = cmd.chars().collect();
111
112        let mut i = 0;
113        while i < chars.len() {
114            let ch = chars[i];
115            let prev_char = if i > 0 { chars[i - 1] } else { '\0' };
116            let next_char = chars.get(i + 1).copied();
117
118            // Handle quote state
119            if ch == '\'' && !in_double_quote && prev_char != '\\' {
120                in_single_quote = !in_single_quote;
121            } else if ch == '"' && !in_single_quote && prev_char != '\\' {
122                in_double_quote = !in_double_quote;
123            }
124
125            // Check for inline comment (# preceded by whitespace, not in quotes)
126            // BUT preserve #123 style issue numbers (# followed by digit)
127            if ch == '#' && !in_single_quote && !in_double_quote && prev_char.is_whitespace() {
128                // Check if # is followed by a digit (issue number like #123)
129                if let Some(next) = next_char {
130                    if next.is_ascii_digit() {
131                        // This is an issue number, preserve it
132                        result.push(ch);
133                        i += 1;
134                        continue;
135                    }
136                }
137                // Found inline comment, stop here and trim trailing whitespace
138                return result.trim_end().to_string();
139            }
140
141            result.push(ch);
142            i += 1;
143        }
144
145        result.trim().to_string()
146    }
147
148    /// Filter out commands we don't want to learn
149    fn is_valid_command(&self, cmd: &str) -> bool {
150        // Skip very short commands
151        if cmd.len() < 2 {
152            return false;
153        }
154
155        // Skip malformed/incomplete commands (multiline artifacts)
156        if self.is_malformed(cmd) {
157            return false;
158        }
159
160        // Skip corrupted commands (missing spaces before flags)
161        if self.has_corrupted_tokens(cmd) {
162            return false;
163        }
164
165        // Skip commands with sensitive patterns
166        let sensitive = [
167            "password",
168            "passwd",
169            "secret",
170            "token",
171            "api_key",
172            "AWS_SECRET",
173            "GITHUB_TOKEN",
174            "Authorization:",
175        ];
176
177        let cmd_lower = cmd.to_lowercase();
178        for pattern in sensitive {
179            if cmd_lower.contains(&pattern.to_lowercase()) {
180                return false;
181            }
182        }
183
184        // Skip history manipulation
185        if cmd.starts_with("history") || cmd.starts_with("fc ") {
186            return false;
187        }
188
189        true
190    }
191
192    /// Check for malformed commands (incomplete multiline, etc.)
193    fn is_malformed(&self, cmd: &str) -> bool {
194        let trimmed = cmd.trim();
195
196        // Lone backslash or backslash with whitespace
197        if trimmed == "\\" || trimmed.ends_with("\\ ") {
198            return true;
199        }
200
201        // Incomplete brace/bracket patterns
202        if trimmed.starts_with('}') || trimmed.starts_with(')') || trimmed.starts_with(']') {
203            return true;
204        }
205
206        // Commands starting with flags are multiline continuation artifacts (issue #91)
207        // e.g., "--context 3" from "git diff \n  --context 3"
208        if trimmed.starts_with("--")
209            || trimmed.starts_with('-')
210                && trimmed
211                    .chars()
212                    .nth(1)
213                    .is_some_and(|c| c.is_ascii_alphabetic())
214        {
215            return true;
216        }
217
218        false
219    }
220
221    /// Check for corrupted tokens like "commit-m" (missing space before flag)
222    fn has_corrupted_tokens(&self, cmd: &str) -> bool {
223        // Common subcommands that should never have flags directly attached
224        let subcommands = [
225            "commit", "checkout", "clone", "push", "pull", "merge", "rebase", "status", "add",
226            "build", "run", "test", "install",
227        ];
228
229        for token in cmd.split_whitespace() {
230            if let Some(dash_pos) = token.find('-') {
231                if dash_pos > 0 && dash_pos < token.len() - 1 {
232                    let before = &token[..dash_pos];
233                    let after = &token[dash_pos + 1..];
234
235                    // Pattern: subcommand-flag (e.g., "commit-m", "add-A")
236                    if subcommands.contains(&before) && (after.len() <= 2 || after.starts_with('-'))
237                    {
238                        return true;
239                    }
240                }
241            }
242        }
243
244        false
245    }
246}
247
248impl Default for HistoryParser {
249    fn default() -> Self {
250        Self::new()
251    }
252}
253
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses one history line with a fresh parser.
    fn parse(line: &str) -> Option<String> {
        HistoryParser::new().parse_line(line)
    }

    /// Runs the command filter with a fresh parser.
    fn accepted(cmd: &str) -> bool {
        HistoryParser::new().is_valid_command(cmd)
    }

    #[test]
    fn test_parse_zsh_extended() {
        let parsed = parse(": 1699900000:0;git status");
        assert_eq!(parsed.as_deref(), Some("git status"));
    }

    #[test]
    fn test_parse_bash() {
        let parsed = parse("ls -la");
        assert_eq!(parsed.as_deref(), Some("ls -la"));
    }

    #[test]
    fn test_filter_sensitive() {
        assert!(!accepted("export API_KEY=secret123"));
        assert!(!accepted("echo $PASSWORD"));
        assert!(accepted("git push origin main"));
    }

    #[test]
    fn test_filter_short() {
        assert!(!accepted("l"));
        assert!(accepted("ls"));
    }

    // ==================== EXTREME TDD: Corrupted Command Filtering ====================

    #[test]
    fn test_filter_corrupted_commands() {
        // Corrupted: missing space before flag
        let rejected = [
            ("git commit-m test", "Should reject 'commit-m' (missing space)"),
            ("git add-A", "Should reject 'add-A' (missing space)"),
            ("cargo build-r", "Should reject 'build-r' (missing space)"),
        ];
        for &(cmd, why) in &rejected {
            assert!(!accepted(cmd), "{}", why);
        }

        // Valid: proper spacing, then legitimate hyphenated words
        let kept = [
            ("git commit -m test", "Should accept 'commit -m' (proper spacing)"),
            ("git add -A", "Should accept 'add -A' (proper spacing)"),
            (
                "git checkout feature-branch",
                "Should accept 'feature-branch' (legitimate hyphen)",
            ),
            ("npm install lodash-es", "Should accept 'lodash-es' (package name)"),
        ];
        for &(cmd, why) in &kept {
            assert!(accepted(cmd), "{}", why);
        }
    }

    #[test]
    fn test_filter_malformed_multiline() {
        // ZSH sometimes captures incomplete multiline commands
        let rejected = [
            ("}\\ ", "Should reject incomplete multiline"),
            ("\\", "Should reject lone backslash"),
        ];
        for &(cmd, why) in &rejected {
            assert!(!accepted(cmd), "{}", why);
        }
    }

    // ==================== Issue #91: History Parsing Fixes ====================

    #[test]
    fn test_skip_comment_lines() {
        for &line in &["# this is a comment", "  # indented comment", "#"] {
            assert!(parse(line).is_none());
        }
    }

    #[test]
    fn test_skip_shell_noops() {
        for &line in &[":", ": ignored text", ":  spaces after"] {
            assert!(parse(line).is_none());
        }
    }

    #[test]
    fn test_preserve_zsh_timestamp_format() {
        // A `: <digits>` prefix is a ZSH timestamp header, not the `:` no-op
        let parsed = parse(": 1699900000:0;git status");
        assert_eq!(parsed.as_deref(), Some("git status"));
    }

    #[test]
    fn test_strip_inline_comments() {
        let cases = [
            ("git status # check status", "git status"),
            ("cargo build --release # optimized", "cargo build --release"),
        ];
        for &(line, expected) in &cases {
            assert_eq!(parse(line).as_deref(), Some(expected));
        }
    }

    #[test]
    fn test_preserve_hash_in_arguments() {
        // Each of these must come back unchanged:
        // - an issue number (`#` immediately followed by a digit)
        // - `#` inside double quotes
        // - `#` inside single quotes
        let unchanged = [
            "gh issue view #123",
            r#"echo "hello #world""#,
            "echo '#hashtag'",
        ];
        for &line in &unchanged {
            assert_eq!(parse(line).as_deref(), Some(line));
        }
    }

    #[test]
    fn test_inline_comment_with_quotes() {
        // Comment after quoted string
        let parsed = parse(r#"echo "hello" # comment"#);
        assert_eq!(parsed.as_deref(), Some(r#"echo "hello""#));
    }
}