aprender_shell/
corpus.rs

//! Corpus management for synthetic shell command training
//!
//! Provides utilities for loading, validating, and managing training corpora
//! for shell completion models. Supports privacy-safe synthetic data generation.
5
use std::collections::HashSet;
use std::path::Path;
8
/// Substrings that indicate sensitive data - these should NEVER appear in a public corpus.
///
/// `Corpus::validate_line` lower-cases both the candidate line and each entry before
/// doing a substring test, so matching is case-insensitive no matter how an entry is
/// spelled here. Entries are tried in the order listed and the first one that matches
/// is the pattern reported in `CorpusError::SensitiveData`.
const SENSITIVE_PATTERNS: &[&str] = &[
    // Credentials. The `=`/`:` forms are the narrow ones; note that because matching
    // is case-insensitive, "PASSWORD"/"PASSWD" below match ANY occurrence of
    // "password"/"passwd", not only upper-case environment-style names.
    "password=",
    "password:",
    "passwd=",
    "passwd:",
    "PASSWORD",
    "PASSWD",
    // Secret patterns (a bare "secret" is not listed, so "kubectl create secret"
    // on its own is not caught). "SECRET_"/"_SECRET" also match in lower case.
    "secret=",
    "secret:",
    "SECRET_",
    "_SECRET",
    // API keys
    "api_key",
    "apikey",
    "api-key",
    "bearer",
    // Auth patterns (a bare "auth" is not listed, to avoid a --author false positive).
    // "AUTH_" matches "auth_" in any case.
    "auth_token",
    "auth=",
    "AUTH_",
    "authorization:",
    // AWS
    "AKIA",
    "aws_access",
    "aws_secret",
    // Private paths
    "/home/",
    "/Users/",
    "C:\\Users\\",
    // Hostnames
    ".internal",
    ".local",
    ".corp",
    // SSH
    "id_rsa",
    "id_ed25519",
    ".pem",
    // Environment
    "export ",
    "ENV=",
];
53
54/// Result type for corpus operations
55pub type CorpusResult<T> = Result<T, CorpusError>;
56
/// Errors that can occur during corpus operations.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CorpusError {
    /// Corpus file not found; the payload is the text shown after
    /// "Corpus not found: " in the `Display` output.
    NotFound(String),
    /// Corpus contains a sensitive pattern.
    SensitiveData {
        /// Line number at which the pattern was found.
        line: usize,
        /// The pattern that matched.
        pattern: String,
    },
    /// Corpus is empty (no commands).
    Empty,
    /// Invalid format; the payload is a descriptive message.
    InvalidFormat(String),
    /// IO error; the payload is the error's message text.
    IoError(String),
}
71
72impl std::fmt::Display for CorpusError {
73    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74        match self {
75            Self::NotFound(path) => write!(f, "Corpus not found: {path}"),
76            Self::SensitiveData { line, pattern } => {
77                write!(f, "Sensitive pattern '{pattern}' found at line {line}")
78            }
79            Self::Empty => write!(f, "Corpus is empty"),
80            Self::InvalidFormat(msg) => write!(f, "Invalid format: {msg}"),
81            Self::IoError(msg) => write!(f, "IO error: {msg}"),
82        }
83    }
84}
85
86impl std::error::Error for CorpusError {}
87
/// A validated corpus of shell commands.
///
/// Values are built through `Corpus::from_string` (or `Corpus::load`, which delegates
/// to it), so every stored command has already passed the sensitive-pattern check.
#[derive(Debug, Clone)]
pub struct Corpus {
    /// Commands in the corpus: trimmed, non-empty, non-comment lines in input order.
    commands: Vec<String>,
    /// Unique command prefixes (first whitespace-separated word of each command),
    /// kept for coverage analysis.
    prefixes: HashSet<String>,
}
96
97impl Corpus {
98    /// Load corpus from a file, validating for sensitive data
99    pub fn load<P: AsRef<Path>>(path: P) -> CorpusResult<Self> {
100        let path = path.as_ref();
101        let content =
102            std::fs::read_to_string(path).map_err(|e| CorpusError::IoError(e.to_string()))?;
103
104        Self::from_string(&content)
105    }
106
107    /// Create corpus from string content
108    pub fn from_string(content: &str) -> CorpusResult<Self> {
109        let mut commands = Vec::new();
110
111        for (line_num, line) in content.lines().enumerate() {
112            let line = line.trim();
113
114            // Skip empty lines and comments
115            if line.is_empty() || line.starts_with('#') {
116                continue;
117            }
118
119            // Validate for sensitive patterns
120            Self::validate_line(line, line_num + 1)?;
121
122            commands.push(line.to_string());
123        }
124
125        if commands.is_empty() {
126            return Err(CorpusError::Empty);
127        }
128
129        // Build prefix set for coverage analysis
130        let prefixes: HashSet<String> = commands
131            .iter()
132            .filter_map(|cmd| cmd.split_whitespace().next())
133            .map(String::from)
134            .collect();
135
136        Ok(Self { commands, prefixes })
137    }
138
139    /// Validate a single line for sensitive patterns
140    fn validate_line(line: &str, line_num: usize) -> CorpusResult<()> {
141        let lower = line.to_lowercase();
142
143        for pattern in SENSITIVE_PATTERNS {
144            if lower.contains(&pattern.to_lowercase()) {
145                return Err(CorpusError::SensitiveData {
146                    line: line_num,
147                    pattern: (*pattern).to_string(),
148                });
149            }
150        }
151
152        Ok(())
153    }
154
155    /// Get commands for training
156    pub fn commands(&self) -> &[String] {
157        &self.commands
158    }
159
160    /// Get number of commands
161    pub fn len(&self) -> usize {
162        self.commands.len()
163    }
164
165    /// Check if corpus is empty
166    pub fn is_empty(&self) -> bool {
167        self.commands.is_empty()
168    }
169
170    /// Get unique command prefixes (first word of each command)
171    pub fn prefixes(&self) -> &HashSet<String> {
172        &self.prefixes
173    }
174
175    /// Get coverage statistics
176    pub fn coverage_stats(&self) -> CorpusStats {
177        let mut token_counts: std::collections::HashMap<String, usize> =
178            std::collections::HashMap::new();
179        let mut total_tokens = 0;
180
181        for cmd in &self.commands {
182            for token in cmd.split_whitespace() {
183                *token_counts.entry(token.to_string()).or_insert(0) += 1;
184                total_tokens += 1;
185            }
186        }
187
188        CorpusStats {
189            total_commands: self.commands.len(),
190            unique_prefixes: self.prefixes.len(),
191            unique_tokens: token_counts.len(),
192            total_tokens,
193        }
194    }
195}
196
/// Statistics about a corpus, as produced by `Corpus::coverage_stats`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CorpusStats {
    /// Total number of commands
    pub total_commands: usize,
    /// Number of unique command prefixes (distinct first words)
    pub unique_prefixes: usize,
    /// Number of unique whitespace-separated tokens
    pub unique_tokens: usize,
    /// Total whitespace-separated tokens across all commands
    pub total_tokens: usize,
}
209
// ============================================================================
// TESTS - EXTREME TDD
// ============================================================================

/// Unit tests for corpus parsing, sensitive-data rejection, coverage statistics
/// and error formatting.
#[cfg(test)]
mod tests {
    use super::*;

    // ========================================================================
    // Corpus Loading Tests
    // ========================================================================

    #[test]
    fn test_corpus_from_string_basic() {
        let content = "git status\ngit commit -m message\ncargo build";
        let corpus = Corpus::from_string(content).expect("should parse");

        assert_eq!(corpus.len(), 3);
        assert_eq!(corpus.commands()[0], "git status");
        assert_eq!(corpus.commands()[2], "cargo build");
    }

    #[test]
    fn test_corpus_skips_empty_lines() {
        let content = "git status\n\n\ncargo build\n";
        let corpus = Corpus::from_string(content).expect("should parse");

        assert_eq!(corpus.len(), 2);
    }

    #[test]
    fn test_corpus_skips_comments() {
        let content = "# This is a comment\ngit status\n# Another comment\ncargo build";
        let corpus = Corpus::from_string(content).expect("should parse");

        assert_eq!(corpus.len(), 2);
    }

    #[test]
    fn test_corpus_trims_whitespace() {
        let content = "  git status  \n\tcargo build\t";
        let corpus = Corpus::from_string(content).expect("should parse");

        assert_eq!(corpus.commands()[0], "git status");
        assert_eq!(corpus.commands()[1], "cargo build");
    }

    #[test]
    fn test_corpus_empty_returns_error() {
        let content = "\n\n# Only comments\n";
        let result = Corpus::from_string(content);

        assert!(matches!(result, Err(CorpusError::Empty)));
    }

    // ========================================================================
    // Sensitive Data Detection Tests (CRITICAL)
    // ========================================================================

    #[test]
    fn test_detects_password_pattern() {
        // PASSWORD in uppercase is detected
        let content = "mysql -u root PASSWORD=secret123";
        let result = Corpus::from_string(content);

        assert!(matches!(result, Err(CorpusError::SensitiveData { .. })));
    }

    #[test]
    fn test_detects_password_colon_pattern() {
        // password: pattern is detected
        let content = "curl -H 'password: secret123'";
        let result = Corpus::from_string(content);

        assert!(matches!(result, Err(CorpusError::SensitiveData { .. })));
    }

    #[test]
    fn test_detects_api_key_pattern() {
        let content = "curl -H 'api_key: secret123'";
        let result = Corpus::from_string(content);

        assert!(matches!(result, Err(CorpusError::SensitiveData { .. })));
    }

    #[test]
    fn test_detects_aws_key_pattern() {
        let content = "aws configure set aws_access_key_id AKIA123";
        let result = Corpus::from_string(content);

        assert!(matches!(result, Err(CorpusError::SensitiveData { .. })));
    }

    #[test]
    fn test_detects_home_path() {
        let content = "cd /home/username/projects";
        let result = Corpus::from_string(content);

        assert!(matches!(result, Err(CorpusError::SensitiveData { .. })));
    }

    #[test]
    fn test_detects_users_path_mac() {
        let content = "ls /Users/john/Documents";
        let result = Corpus::from_string(content);

        assert!(matches!(result, Err(CorpusError::SensitiveData { .. })));
    }

    #[test]
    fn test_detects_internal_hostname() {
        let content = "ssh server.internal";
        let result = Corpus::from_string(content);

        assert!(matches!(result, Err(CorpusError::SensitiveData { .. })));
    }

    #[test]
    fn test_detects_ssh_key_path() {
        let content = "ssh -i id_rsa user@server";
        let result = Corpus::from_string(content);

        assert!(matches!(result, Err(CorpusError::SensitiveData { .. })));
    }

    #[test]
    fn test_detects_export_statement() {
        let content = "export API_KEY=secret";
        let result = Corpus::from_string(content);

        assert!(matches!(result, Err(CorpusError::SensitiveData { .. })));
    }

    #[test]
    fn test_sensitive_detection_case_insensitive() {
        let content = "curl -H 'PASSWORD: test'";
        let result = Corpus::from_string(content);

        assert!(matches!(result, Err(CorpusError::SensitiveData { .. })));
    }

    #[test]
    fn test_reports_correct_line_number() {
        let content = "git status\ncargo build\ncurl -H 'api_key: x'";
        let result = Corpus::from_string(content);

        match result {
            Err(CorpusError::SensitiveData { line, pattern }) => {
                assert_eq!(line, 3);
                // The first table entry that matches this line is "api_key".
                assert_eq!(pattern, "api_key");
            }
            other => panic!("Expected SensitiveData error, got {other:?}"),
        }
    }

    // ========================================================================
    // Safe Commands Tests
    // ========================================================================

    #[test]
    fn test_allows_safe_git_commands() {
        let content = r#"
git status
git commit -m "feat: add feature"
git push origin main
git pull --rebase
git checkout -b feature/new
git log --oneline -10
git diff HEAD~1
git stash pop
"#;
        let corpus = Corpus::from_string(content).expect("should parse");
        // Exactly the eight commands above; the leading blank line is skipped.
        assert_eq!(corpus.len(), 8);
    }

    #[test]
    fn test_allows_safe_docker_commands() {
        let content = r#"
docker build -t myapp .
docker run -d -p 8080:80 nginx
docker compose up -d
docker ps -a
docker logs container_name
docker exec -it container_name bash
"#;
        let corpus = Corpus::from_string(content).expect("should parse");
        assert_eq!(corpus.len(), 6);
    }

    #[test]
    fn test_allows_safe_cargo_commands() {
        let content = r#"
cargo build --release
cargo test --all-features
cargo clippy -- -D warnings
cargo fmt --check
cargo run --example demo
cargo doc --open
"#;
        let corpus = Corpus::from_string(content).expect("should parse");
        assert_eq!(corpus.len(), 6);
    }

    // ========================================================================
    // Coverage Statistics Tests
    // ========================================================================

    #[test]
    fn test_prefixes_extraction() {
        let content = "git status\ngit commit\ncargo build\ncargo test";
        let corpus = Corpus::from_string(content).expect("should parse");

        assert_eq!(corpus.prefixes().len(), 2);
        assert!(corpus.prefixes().contains("git"));
        assert!(corpus.prefixes().contains("cargo"));
    }

    #[test]
    fn test_coverage_stats() {
        let content = "git status\ngit commit -m msg";
        let corpus = Corpus::from_string(content).expect("should parse");
        let stats = corpus.coverage_stats();

        assert_eq!(stats.total_commands, 2);
        assert_eq!(stats.unique_prefixes, 1); // just "git"
        assert_eq!(stats.unique_tokens, 5); // git, status, commit, -m, msg
        assert_eq!(stats.total_tokens, 6); // git, status, git, commit, -m, msg
    }

    // ========================================================================
    // Error Display Tests
    // ========================================================================

    #[test]
    fn test_error_display_not_found() {
        let err = CorpusError::NotFound("/path/to/file".into());
        assert!(err.to_string().contains("/path/to/file"));
    }

    #[test]
    fn test_error_display_sensitive() {
        let err = CorpusError::SensitiveData {
            line: 42,
            pattern: "password".into(),
        };
        let msg = err.to_string();
        assert!(msg.contains("password"));
        assert!(msg.contains("42"));
    }
}