use std::collections::HashSet;
use std::path::Path;
/// Substrings whose presence marks a corpus line as containing sensitive data.
///
/// `Corpus::validate_line` lowercases both the line and each entry before
/// comparing, so the letter case used here only affects the text reported in
/// `CorpusError::SensitiveData`. Entries are tried in order and the first hit
/// is the one reported, so the order of this table is significant.
const SENSITIVE_PATTERNS: &[&str] = &[
    // Passwords.
    "password=",
    "password:",
    "passwd=",
    "passwd:",
    "PASSWORD",
    "PASSWD",
    // Generic secrets.
    "secret=",
    "secret:",
    "SECRET_",
    "_SECRET",
    // API keys and auth tokens.
    "api_key",
    "apikey",
    "api-key",
    "bearer",
    "auth_token",
    "auth=",
    "AUTH_",
    "authorization:",
    // AWS credentials.
    "AKIA",
    "aws_access",
    "aws_secret",
    // Per-user home directories.
    "/home/",
    "/Users/",
    "C:\\Users\\",
    // Private host name suffixes.
    ".internal",
    ".local",
    ".corp",
    // Key material on disk.
    "id_rsa",
    "id_ed25519",
    ".pem",
    // Environment assignments.
    "export ",
    "ENV=",
];
/// Shorthand for a `Result` whose error type is [`CorpusError`].
pub type CorpusResult<T> = Result<T, CorpusError>;
/// Errors reported while loading or validating a corpus.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CorpusError {
    /// The corpus could not be found; the payload is rendered as the path in
    /// the error message.
    NotFound(String),
    /// A corpus line contained one of the sensitive patterns.
    SensitiveData {
        /// Line number at which the pattern was found.
        line: usize,
        /// The pattern that matched.
        pattern: String,
    },
    /// The corpus contained no commands.
    Empty,
    /// The corpus was malformed; the payload is a free-form description.
    InvalidFormat(String),
    /// An I/O failure; the payload is the underlying error's message.
    IoError(String),
}
/// User-facing message for each [`CorpusError`] variant.
impl std::fmt::Display for CorpusError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // The only variant without a payload, so no formatting is needed.
            Self::Empty => f.write_str("Corpus is empty"),
            Self::NotFound(path) => write!(f, "Corpus not found: {path}"),
            Self::IoError(msg) => write!(f, "IO error: {msg}"),
            Self::InvalidFormat(msg) => write!(f, "Invalid format: {msg}"),
            Self::SensitiveData { line, pattern } => write!(
                f,
                "Sensitive pattern '{pattern}' found at line {line}"
            ),
        }
    }
}
/// Marks [`CorpusError`] as a standard error type; the impl body is empty, so
/// the trait's default methods are used unchanged.
impl std::error::Error for CorpusError {}
/// A set of command lines that passed sensitive-data validation.
#[derive(Debug, Clone)]
pub struct Corpus {
    /// Accepted command lines, trimmed, in the order they were read.
    commands: Vec<String>,
    /// The first whitespace-delimited token of each command, deduplicated.
    prefixes: HashSet<String>,
}
impl Corpus {
    /// Reads the file at `path` and parses it with [`Corpus::from_string`].
    ///
    /// # Errors
    ///
    /// * [`CorpusError::NotFound`] (carrying the path) when the file does not
    ///   exist.
    /// * [`CorpusError::IoError`] for every other read failure.
    /// * Whatever [`Corpus::from_string`] returns for the file's content.
    pub fn load<P: AsRef<Path>>(path: P) -> CorpusResult<Self> {
        let path = path.as_ref();
        // Classify the failure from the read itself rather than probing with
        // `exists()` first, which would race with the read.
        let content = std::fs::read_to_string(path).map_err(|err| match err.kind() {
            std::io::ErrorKind::NotFound => CorpusError::NotFound(path.display().to_string()),
            _ => CorpusError::IoError(err.to_string()),
        })?;
        Self::from_string(&content)
    }

    /// Parses corpus text containing one command per line.
    ///
    /// Every line is trimmed; blank lines and lines starting with `#` are
    /// skipped. A leading UTF-8 byte-order mark is ignored. Each remaining
    /// line is checked against `SENSITIVE_PATTERNS` before being accepted.
    ///
    /// # Errors
    ///
    /// * [`CorpusError::SensitiveData`] for the first line that contains a
    ///   sensitive pattern. The reported line number is 1-based and counts
    ///   skipped lines too.
    /// * [`CorpusError::Empty`] when no command lines remain.
    pub fn from_string(content: &str) -> CorpusResult<Self> {
        // U+FEFF is not whitespace, so `trim` would leave it attached to the
        // first line and break both comment detection and prefix extraction.
        let content = content.strip_prefix('\u{feff}').unwrap_or(content);

        // Lowercase the pattern table once per parse rather than once per line.
        let patterns: Vec<(&str, String)> = SENSITIVE_PATTERNS
            .iter()
            .map(|pattern| (*pattern, pattern.to_lowercase()))
            .collect();

        let mut commands = Vec::new();
        for (index, raw) in content.lines().enumerate() {
            let line = raw.trim();
            if line.is_empty() || line.starts_with('#') {
                continue;
            }
            Self::validate_line(line, index + 1, &patterns)?;
            commands.push(line.to_string());
        }
        if commands.is_empty() {
            return Err(CorpusError::Empty);
        }

        let prefixes: HashSet<String> = commands
            .iter()
            .filter_map(|cmd| cmd.split_whitespace().next())
            .map(String::from)
            .collect();
        Ok(Self { commands, prefixes })
    }

    /// Rejects `line` if it contains any sensitive pattern, ignoring case.
    ///
    /// `patterns` pairs each original pattern with its lowercased form. The
    /// first pair whose lowercased form occurs in the lowercased line wins,
    /// and its original spelling is reported together with `line_num`.
    fn validate_line(
        line: &str,
        line_num: usize,
        patterns: &[(&str, String)],
    ) -> CorpusResult<()> {
        let haystack = line.to_lowercase();
        for (original, lowered) in patterns {
            if haystack.contains(lowered.as_str()) {
                return Err(CorpusError::SensitiveData {
                    line: line_num,
                    pattern: (*original).to_string(),
                });
            }
        }
        Ok(())
    }

    /// Returns the accepted commands in input order.
    pub fn commands(&self) -> &[String] {
        &self.commands
    }

    /// Returns the number of commands.
    pub fn len(&self) -> usize {
        self.commands.len()
    }

    /// Returns `true` when the corpus holds no commands.
    pub fn is_empty(&self) -> bool {
        self.commands.is_empty()
    }

    /// Returns the distinct first tokens of all commands.
    pub fn prefixes(&self) -> &HashSet<String> {
        &self.prefixes
    }

    /// Computes command, prefix and whitespace-token counts for the corpus.
    pub fn coverage_stats(&self) -> CorpusStats {
        // Only the number of distinct tokens is reported, so borrowed slices
        // in a set are enough; no per-token `String` or counter is needed.
        let mut distinct_tokens: HashSet<&str> = HashSet::new();
        let mut total_tokens = 0;
        for token in self.commands.iter().flat_map(|cmd| cmd.split_whitespace()) {
            distinct_tokens.insert(token);
            total_tokens += 1;
        }
        CorpusStats {
            total_commands: self.commands.len(),
            unique_prefixes: self.prefixes.len(),
            unique_tokens: distinct_tokens.len(),
            total_tokens,
        }
    }
}
/// Summary counts returned by `Corpus::coverage_stats`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CorpusStats {
    /// Number of commands in the corpus.
    pub total_commands: usize,
    /// Number of distinct command prefixes (first tokens).
    pub unique_prefixes: usize,
    /// Number of distinct whitespace-separated tokens across all commands.
    pub unique_tokens: usize,
    /// Number of whitespace-separated tokens across all commands, duplicates
    /// included.
    pub total_tokens: usize,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `content` is rejected with `CorpusError::SensitiveData`,
    /// reporting the input and the actual result on failure.
    fn assert_sensitive(content: &str) {
        let result = Corpus::from_string(content);
        assert!(
            matches!(result, Err(CorpusError::SensitiveData { .. })),
            "expected SensitiveData for {:?}, got {:?}",
            content,
            result
        );
    }

    /// Parses `lines` as a corpus and asserts that every line was accepted.
    fn assert_all_accepted(lines: &[&str]) {
        let content = lines.join("\n");
        let corpus = Corpus::from_string(&content).expect("should parse");
        assert_eq!(corpus.len(), lines.len());
    }

    // --- Parsing -----------------------------------------------------------

    #[test]
    fn test_corpus_from_string_basic() {
        let content = "git status\ngit commit -m message\ncargo build";
        let corpus = Corpus::from_string(content).expect("should parse");
        assert_eq!(corpus.len(), 3);
        assert_eq!(corpus.commands()[0], "git status");
        assert_eq!(corpus.commands()[2], "cargo build");
    }

    #[test]
    fn test_corpus_skips_empty_lines() {
        let content = "git status\n\n\ncargo build\n";
        let corpus = Corpus::from_string(content).expect("should parse");
        assert_eq!(corpus.len(), 2);
    }

    #[test]
    fn test_corpus_skips_comments() {
        let content = "# This is a comment\ngit status\n# Another comment\ncargo build";
        let corpus = Corpus::from_string(content).expect("should parse");
        assert_eq!(corpus.len(), 2);
    }

    #[test]
    fn test_corpus_trims_whitespace() {
        let content = " git status \n\tcargo build\t";
        let corpus = Corpus::from_string(content).expect("should parse");
        assert_eq!(corpus.commands()[0], "git status");
        assert_eq!(corpus.commands()[1], "cargo build");
    }

    #[test]
    fn test_corpus_empty_returns_error() {
        let content = "\n\n# Only comments\n";
        let result = Corpus::from_string(content);
        assert!(matches!(result, Err(CorpusError::Empty)));
    }

    // --- Sensitive-data detection ------------------------------------------

    #[test]
    fn test_detects_password_pattern() {
        assert_sensitive("mysql -u root PASSWORD=secret123");
    }

    #[test]
    fn test_detects_password_colon_pattern() {
        assert_sensitive("curl -H 'password: secret123'");
    }

    #[test]
    fn test_detects_api_key_pattern() {
        assert_sensitive("curl -H 'api_key: secret123'");
    }

    #[test]
    fn test_detects_aws_key_pattern() {
        assert_sensitive("aws configure set aws_access_key_id AKIA123");
    }

    #[test]
    fn test_detects_home_path() {
        assert_sensitive("cd /home/username/projects");
    }

    #[test]
    fn test_detects_users_path_mac() {
        assert_sensitive("ls /Users/john/Documents");
    }

    #[test]
    fn test_detects_internal_hostname() {
        assert_sensitive("ssh server.internal");
    }

    #[test]
    fn test_detects_ssh_key_path() {
        assert_sensitive("ssh -i id_rsa user@server");
    }

    #[test]
    fn test_detects_export_statement() {
        assert_sensitive("export API_KEY=secret");
    }

    #[test]
    fn test_sensitive_detection_case_insensitive() {
        assert_sensitive("curl -H 'PASSWORD: test'");
    }

    #[test]
    fn test_reports_correct_line_number() {
        let content = "git status\ncargo build\ncurl -H 'api_key: x'";
        match Corpus::from_string(content) {
            Err(CorpusError::SensitiveData { line, .. }) => assert_eq!(line, 3),
            other => panic!("Expected SensitiveData error, got {:?}", other),
        }
    }

    // --- Safe commands must be accepted in full ----------------------------

    #[test]
    fn test_allows_safe_git_commands() {
        assert_all_accepted(&[
            "git status",
            r#"git commit -m "feat: add feature""#,
            "git push origin main",
            "git pull --rebase",
            "git checkout -b feature/new",
            "git log --oneline -10",
            "git diff HEAD~1",
            "git stash pop",
        ]);
    }

    #[test]
    fn test_allows_safe_docker_commands() {
        assert_all_accepted(&[
            "docker build -t myapp .",
            "docker run -d -p 8080:80 nginx",
            "docker compose up -d",
            "docker ps -a",
            "docker logs container_name",
            "docker exec -it container_name bash",
        ]);
    }

    #[test]
    fn test_allows_safe_cargo_commands() {
        assert_all_accepted(&[
            "cargo build --release",
            "cargo test --all-features",
            "cargo clippy -- -D warnings",
            "cargo fmt --check",
            "cargo run --example demo",
            "cargo doc --open",
        ]);
    }

    // --- Derived data ------------------------------------------------------

    #[test]
    fn test_prefixes_extraction() {
        let content = "git status\ngit commit\ncargo build\ncargo test";
        let corpus = Corpus::from_string(content).expect("should parse");
        assert_eq!(corpus.prefixes().len(), 2);
        assert!(corpus.prefixes().contains("git"));
        assert!(corpus.prefixes().contains("cargo"));
    }

    #[test]
    fn test_coverage_stats() {
        let content = "git status\ngit commit -m msg";
        let corpus = Corpus::from_string(content).expect("should parse");
        let stats = corpus.coverage_stats();
        assert_eq!(stats.total_commands, 2);
        assert_eq!(stats.unique_prefixes, 1);
        // `git` occurs twice, so the six tokens collapse to five distinct ones.
        assert_eq!(stats.unique_tokens, 5);
        assert_eq!(stats.total_tokens, 6);
    }

    // --- Error messages ----------------------------------------------------

    #[test]
    fn test_error_display_not_found() {
        let err = CorpusError::NotFound("/path/to/file".into());
        assert!(err.to_string().contains("/path/to/file"));
    }

    #[test]
    fn test_error_display_sensitive() {
        let err = CorpusError::SensitiveData {
            line: 42,
            pattern: "password".into(),
        };
        let msg = err.to_string();
        assert!(msg.contains("password"));
        assert!(msg.contains("42"));
    }
}