1use std::collections::HashSet;
7use std::path::Path;
8
/// Substrings that mark a corpus line as potentially containing sensitive data.
///
/// `Corpus::validate_line` compares a lowercased line against the lowercased
/// form of each entry, so matching is case-insensitive regardless of the casing
/// used here. The casing written here is what gets reported in
/// `CorpusError::SensitiveData`. Entries are checked in declaration order and
/// the first match is the one reported.
const SENSITIVE_PATTERNS: &[&str] = &[
    // Password assignments and mentions.
    "password=",
    "password:",
    "passwd=",
    "passwd:",
    "PASSWORD",
    "PASSWD",
    // Secrets.
    "secret=",
    "secret:",
    "SECRET_",
    "_SECRET",
    // API keys, tokens and auth headers.
    "api_key",
    "apikey",
    "api-key",
    "bearer",
    "auth_token",
    "auth=",
    "AUTH_",
    "authorization:",
    // AWS-related markers.
    "AKIA",
    "aws_access",
    "aws_secret",
    // User home directory paths.
    "/home/",
    "/Users/",
    "C:\\Users\\",
    // Internal hostname suffixes.
    ".internal",
    ".local",
    ".corp",
    // Key file names and extensions.
    "id_rsa",
    "id_ed25519",
    ".pem",
    // Environment variable assignments.
    "export ",
    "ENV=",
];
53
/// Convenience alias for results whose error type is [`CorpusError`].
pub type CorpusResult<T> = Result<T, CorpusError>;
56
/// Errors produced while loading or validating a corpus.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CorpusError {
    /// A corpus could not be found; the payload is the path shown in the error message.
    NotFound(String),
    /// A line matched a sensitive pattern.
    ///
    /// `line` is the 1-based line number within the parsed input and `pattern`
    /// is the matching entry as written in `SENSITIVE_PATTERNS`.
    SensitiveData { line: usize, pattern: String },
    /// No commands remained after blank lines and `#` comments were skipped.
    Empty,
    /// The input was malformed; the payload is a free-form description.
    InvalidFormat(String),
    /// An I/O operation failed; the payload is the stringified underlying error.
    IoError(String),
}
71
72impl std::fmt::Display for CorpusError {
73 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74 match self {
75 Self::NotFound(path) => write!(f, "Corpus not found: {path}"),
76 Self::SensitiveData { line, pattern } => {
77 write!(f, "Sensitive pattern '{pattern}' found at line {line}")
78 }
79 Self::Empty => write!(f, "Corpus is empty"),
80 Self::InvalidFormat(msg) => write!(f, "Invalid format: {msg}"),
81 Self::IoError(msg) => write!(f, "IO error: {msg}"),
82 }
83 }
84}
85
// `CorpusError` already has `Debug` (derived) and `Display`, so the default
// `Error` methods are sufficient and the impl body stays empty.
impl std::error::Error for CorpusError {}
87
/// A validated list of commands together with the set of their leading tokens.
#[derive(Debug, Clone)]
pub struct Corpus {
    /// Trimmed, non-empty, non-comment lines in input order.
    commands: Vec<String>,
    /// First whitespace-delimited token of every command, deduplicated.
    prefixes: HashSet<String>,
}
96
97impl Corpus {
98 pub fn load<P: AsRef<Path>>(path: P) -> CorpusResult<Self> {
100 let path = path.as_ref();
101 let content =
102 std::fs::read_to_string(path).map_err(|e| CorpusError::IoError(e.to_string()))?;
103
104 Self::from_string(&content)
105 }
106
107 pub fn from_string(content: &str) -> CorpusResult<Self> {
109 let mut commands = Vec::new();
110
111 for (line_num, line) in content.lines().enumerate() {
112 let line = line.trim();
113
114 if line.is_empty() || line.starts_with('#') {
116 continue;
117 }
118
119 Self::validate_line(line, line_num + 1)?;
121
122 commands.push(line.to_string());
123 }
124
125 if commands.is_empty() {
126 return Err(CorpusError::Empty);
127 }
128
129 let prefixes: HashSet<String> = commands
131 .iter()
132 .filter_map(|cmd| cmd.split_whitespace().next())
133 .map(String::from)
134 .collect();
135
136 Ok(Self { commands, prefixes })
137 }
138
139 fn validate_line(line: &str, line_num: usize) -> CorpusResult<()> {
141 let lower = line.to_lowercase();
142
143 for pattern in SENSITIVE_PATTERNS {
144 if lower.contains(&pattern.to_lowercase()) {
145 return Err(CorpusError::SensitiveData {
146 line: line_num,
147 pattern: (*pattern).to_string(),
148 });
149 }
150 }
151
152 Ok(())
153 }
154
155 pub fn commands(&self) -> &[String] {
157 &self.commands
158 }
159
160 pub fn len(&self) -> usize {
162 self.commands.len()
163 }
164
165 pub fn is_empty(&self) -> bool {
167 self.commands.is_empty()
168 }
169
170 pub fn prefixes(&self) -> &HashSet<String> {
172 &self.prefixes
173 }
174
175 pub fn coverage_stats(&self) -> CorpusStats {
177 let mut token_counts: std::collections::HashMap<String, usize> =
178 std::collections::HashMap::new();
179 let mut total_tokens = 0;
180
181 for cmd in &self.commands {
182 for token in cmd.split_whitespace() {
183 *token_counts.entry(token.to_string()).or_insert(0) += 1;
184 total_tokens += 1;
185 }
186 }
187
188 CorpusStats {
189 total_commands: self.commands.len(),
190 unique_prefixes: self.prefixes.len(),
191 unique_tokens: token_counts.len(),
192 total_tokens,
193 }
194 }
195}
196
/// Summary counts produced by `Corpus::coverage_stats`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CorpusStats {
    /// Number of commands in the corpus.
    pub total_commands: usize,
    /// Number of distinct first tokens across all commands.
    pub unique_prefixes: usize,
    /// Number of distinct whitespace-separated tokens across all commands.
    pub unique_tokens: usize,
    /// Total number of whitespace-separated tokens, counting repeats.
    pub total_tokens: usize,
}
209
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that parsing `content` is rejected as containing sensitive data.
    #[track_caller]
    fn assert_sensitive(content: &str) {
        let result = Corpus::from_string(content);
        assert!(
            matches!(result, Err(CorpusError::SensitiveData { .. })),
            "expected SensitiveData for {content:?}, got {result:?}"
        );
    }

    // ---- Parsing -------------------------------------------------------

    #[test]
    fn test_corpus_from_string_basic() {
        let content = "git status\ngit commit -m message\ncargo build";
        let corpus = Corpus::from_string(content).expect("should parse");

        assert_eq!(corpus.len(), 3);
        assert_eq!(corpus.commands()[0], "git status");
        assert_eq!(corpus.commands()[2], "cargo build");
    }

    #[test]
    fn test_corpus_skips_empty_lines() {
        let content = "git status\n\n\ncargo build\n";
        let corpus = Corpus::from_string(content).expect("should parse");

        assert_eq!(corpus.len(), 2);
    }

    #[test]
    fn test_corpus_skips_comments() {
        let content = "# This is a comment\ngit status\n# Another comment\ncargo build";
        let corpus = Corpus::from_string(content).expect("should parse");

        assert_eq!(corpus.len(), 2);
    }

    #[test]
    fn test_corpus_trims_whitespace() {
        let content = "  git status  \n\tcargo build\t";
        let corpus = Corpus::from_string(content).expect("should parse");

        assert_eq!(corpus.commands()[0], "git status");
        assert_eq!(corpus.commands()[1], "cargo build");
    }

    #[test]
    fn test_corpus_empty_returns_error() {
        let content = "\n\n# Only comments\n";
        let result = Corpus::from_string(content);

        assert!(matches!(result, Err(CorpusError::Empty)));
    }

    #[test]
    fn test_parsed_corpus_is_not_empty() {
        let corpus = Corpus::from_string("git status").expect("should parse");

        assert!(!corpus.is_empty());
        assert_eq!(corpus.len(), 1);
    }

    // ---- Loading from disk ---------------------------------------------

    #[test]
    fn test_load_missing_file_returns_error() {
        let result = Corpus::load("corpus_file_that_does_not_exist_7c2e.txt");

        // Either variant is an acceptable way to report a missing file.
        assert!(matches!(
            result,
            Err(CorpusError::NotFound(_)) | Err(CorpusError::IoError(_))
        ));
    }

    #[test]
    fn test_load_reads_commands_from_file() {
        let path =
            std::env::temp_dir().join(format!("corpus_load_test_{}.txt", std::process::id()));
        std::fs::write(&path, "# header\ngit status\ncargo build\n")
            .expect("temp file should be writable");

        let result = Corpus::load(&path);
        // Clean up before asserting so a failure does not leave the file behind.
        let _ = std::fs::remove_file(&path);

        let corpus = result.expect("should load");
        assert_eq!(corpus.len(), 2);
        assert_eq!(corpus.commands()[0], "git status");
        assert_eq!(corpus.commands()[1], "cargo build");
    }

    // ---- Sensitive data detection --------------------------------------

    #[test]
    fn test_detects_password_pattern() {
        assert_sensitive("mysql -u root PASSWORD=secret123");
    }

    #[test]
    fn test_detects_password_colon_pattern() {
        assert_sensitive("curl -H 'password: secret123'");
    }

    #[test]
    fn test_detects_api_key_pattern() {
        assert_sensitive("curl -H 'api_key: secret123'");
    }

    #[test]
    fn test_detects_aws_key_pattern() {
        assert_sensitive("aws configure set aws_access_key_id AKIA123");
    }

    #[test]
    fn test_detects_home_path() {
        assert_sensitive("cd /home/username/projects");
    }

    #[test]
    fn test_detects_users_path_mac() {
        assert_sensitive("ls /Users/john/Documents");
    }

    #[test]
    fn test_detects_internal_hostname() {
        assert_sensitive("ssh server.internal");
    }

    #[test]
    fn test_detects_ssh_key_path() {
        assert_sensitive("ssh -i id_rsa user@server");
    }

    #[test]
    fn test_detects_export_statement() {
        assert_sensitive("export API_KEY=secret");
    }

    #[test]
    fn test_sensitive_detection_case_insensitive() {
        assert_sensitive("curl -H 'PASSWORD: test'");
    }

    #[test]
    fn test_reports_correct_line_number() {
        let content = "git status\ncargo build\ncurl -H 'api_key: x'";
        let result = Corpus::from_string(content);

        match result {
            Err(CorpusError::SensitiveData { line, .. }) => {
                assert_eq!(line, 3);
            }
            _ => panic!("Expected SensitiveData error"),
        }
    }

    // ---- Safe commands are accepted ------------------------------------

    #[test]
    fn test_allows_safe_git_commands() {
        let content = r#"
git status
git commit -m "feat: add feature"
git push origin main
git pull --rebase
git checkout -b feature/new
git log --oneline -10
git diff HEAD~1
git stash pop
"#;
        let corpus = Corpus::from_string(content).expect("should parse");
        assert_eq!(corpus.len(), 8);
    }

    #[test]
    fn test_allows_safe_docker_commands() {
        let content = r#"
docker build -t myapp .
docker run -d -p 8080:80 nginx
docker compose up -d
docker ps -a
docker logs container_name
docker exec -it container_name bash
"#;
        let corpus = Corpus::from_string(content).expect("should parse");
        assert_eq!(corpus.len(), 6);
    }

    #[test]
    fn test_allows_safe_cargo_commands() {
        let content = r#"
cargo build --release
cargo test --all-features
cargo clippy -- -D warnings
cargo fmt --check
cargo run --example demo
cargo doc --open
"#;
        let corpus = Corpus::from_string(content).expect("should parse");
        assert_eq!(corpus.len(), 6);
    }

    // ---- Prefixes and statistics ---------------------------------------

    #[test]
    fn test_prefixes_extraction() {
        let content = "git status\ngit commit\ncargo build\ncargo test";
        let corpus = Corpus::from_string(content).expect("should parse");

        assert_eq!(corpus.prefixes().len(), 2);
        assert!(corpus.prefixes().contains("git"));
        assert!(corpus.prefixes().contains("cargo"));
    }

    #[test]
    fn test_coverage_stats() {
        let content = "git status\ngit commit -m msg";
        let corpus = Corpus::from_string(content).expect("should parse");
        let stats = corpus.coverage_stats();

        assert_eq!(stats.total_commands, 2);
        assert_eq!(stats.unique_prefixes, 1);
        // Distinct tokens: git, status, commit, -m, msg.
        assert_eq!(stats.unique_tokens, 5);
        assert_eq!(stats.total_tokens, 6);
    }

    // ---- Error display -------------------------------------------------

    #[test]
    fn test_error_display_not_found() {
        let err = CorpusError::NotFound("/path/to/file".into());
        assert!(err.to_string().contains("/path/to/file"));
    }

    #[test]
    fn test_error_display_sensitive() {
        let err = CorpusError::SensitiveData {
            line: 42,
            pattern: "password".into(),
        };
        let msg = err.to_string();
        assert!(msg.contains("password"));
        assert!(msg.contains("42"));
    }

    #[test]
    fn test_error_display_remaining_variants() {
        assert_eq!(CorpusError::Empty.to_string(), "Corpus is empty");
        assert_eq!(
            CorpusError::InvalidFormat("bad header".into()).to_string(),
            "Invalid format: bad header"
        );
        assert_eq!(
            CorpusError::IoError("disk full".into()).to_string(),
            "IO error: disk full"
        );
    }
}