collet 0.1.0

Relentless agentic coding orchestrator with zero-drop agent loops
Documentation
//! PII / sensitive data detection filter.
//!
//! Scans user input for sensitive patterns (API keys, passwords, SSNs, etc.)
//! before sending to the LLM. Returns categorized findings so the TUI can
//! warn the user and let them decide whether to proceed.

use std::sync::LazyLock;

use regex::Regex;

/// A single detected sensitive data match.
///
/// Produced by `scan`. It carries a masked rendering of the matched text
/// rather than the full raw value, so it can be shown to the user.
#[derive(Debug, Clone)]
pub struct PiiMatch {
    /// Label of the pattern that matched, e.g. `"Email"` or `"US SSN"`.
    pub category: &'static str,
    /// The matched text after it has been passed through `mask_value`.
    pub masked: String,
}

/// Pattern definition: category label + compiled regex.
///
/// Entries of this type make up the `PATTERNS` table.
struct Pattern {
    /// Label copied into every `PiiMatch` that this pattern produces.
    category: &'static str,
    /// Compiled expression that is run against the scanned input.
    regex: Regex,
}

/// Detection table, compiled once on first access.
///
/// Each entry pairs a human-readable category label with the regex that
/// recognises it. Entries are independent of each other, so the same piece of
/// text may be matched by more than one of them.
///
/// A definition whose regex fails to compile is left out of the table instead
/// of panicking.
static PATTERNS: LazyLock<Vec<Pattern>> = LazyLock::new(|| {
    let defs: &[(&str, &str)] = &[
        // API keys
        ("API Key (OpenAI)", r"sk-[a-zA-Z0-9]{20,}"),
        ("API Key (AWS)", r"AKIA[0-9A-Z]{16}"),
        ("API Key (Anthropic)", r"sk-ant-[a-zA-Z0-9\-]{20,}"),
        ("Bearer Token", r"Bearer\s+[a-zA-Z0-9\-._~+/]{20,}=*"),
        // Crypto / certs
        //
        // The optional upper-case word before PRIVATE covers the labelled PEM
        // headers (RSA, EC, DSA, OPENSSH, ENCRYPTED) as well as the unlabelled
        // `BEGIN PRIVATE KEY` form.
        (
            "Private Key",
            r"-----BEGIN\s+(?:[A-Z]+\s+)?PRIVATE\s+KEY-----",
        ),
        ("Certificate", r"-----BEGIN\s+CERTIFICATE-----"),
        // Credentials
        (
            "Password",
            r"(?i)(?:password|passwd|pwd|secret)\s*[=:]\s*\S+",
        ),
        (
            "DB Connection String",
            r"(?i)(?:mongodb|postgres|mysql|redis|sqlite)://\S+",
        ),
        // Personal identifiers
        ("Email", r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}"),
        ("US SSN", r"\b\d{3}-\d{2}-\d{4}\b"),
        (
            "Korean ID (Resident Registration Number)",
            r"\b\d{6}-[1-4]\d{6}\b",
        ),
        (
            "Credit Card",
            r"\b(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2}|6(?:011|5\d{2}))[- ]?\d{4}[- ]?\d{4}[- ]?\d{3,4}\b",
        ),
        // NOTE(review): `\b` directly before `\+?` only holds when the `+`
        // follows a word character, so a leading `+` after whitespace or at
        // the start of the input is not included in the match.
        (
            "Phone Number",
            r"\b\+?\d{1,3}[-.\s]?\(?\d{2,4}\)?[-.\s]?\d{3,4}[-.\s]?\d{4}\b",
        ),
        // Infrastructure
        //
        // Every alternative supplies the first two octets so that the whole
        // match is four octets long. A bare `10\.` prefix would stop after
        // three octets and also flag version strings such as `10.15.7`.
        (
            "Internal IP",
            r"\b(?:10\.\d{1,3}\.|172\.(?:1[6-9]|2\d|3[01])\.|192\.168\.)\d{1,3}\.\d{1,3}\b",
        ),
    ];

    defs.iter()
        .filter_map(|&(category, source)| {
            // Best effort: an uncompilable definition is skipped, not fatal.
            Regex::new(source)
                .ok()
                .map(|regex| Pattern { category, regex })
        })
        .collect()
});

/// Run every entry of `PATTERNS` over `input` and collect what matched.
///
/// Each hit is reported as a `PiiMatch` holding the pattern's category and a
/// masked rendering of the matched text. Findings that agree on both category
/// and masked form are collapsed into one, and the result is ordered by
/// category, then by masked value. Clean input yields an empty vector.
pub fn scan(input: &str) -> Vec<PiiMatch> {
    let mut hits: Vec<PiiMatch> = PATTERNS
        .iter()
        .flat_map(|entry| {
            entry.regex.find_iter(input).map(move |found| PiiMatch {
                category: entry.category,
                masked: mask_value(found.as_str()),
            })
        })
        .collect();

    // Sorting first makes equal (category, masked) pairs adjacent, which is
    // what `dedup_by` needs in order to drop the repeats.
    hits.sort_by(|lhs, rhs| {
        (lhs.category, lhs.masked.as_str()).cmp(&(rhs.category, rhs.masked.as_str()))
    });
    hits.dedup_by(|later, earlier| {
        (later.category, &later.masked) == (earlier.category, &earlier.masked)
    });

    hits
}

/// Redact a matched value for display.
///
/// Keeps the first four characters (counted as `char`s, not bytes) and appends
/// `****`. A value of four characters or fewer is replaced entirely by `****`.
fn mask_value(raw: &str) -> String {
    // The byte offset of the fifth character, when there is one, is exactly
    // where the visible four-character prefix ends.
    match raw.char_indices().nth(4) {
        Some((prefix_end, _)) => format!("{}****", &raw[..prefix_end]),
        None => "****".to_string(),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that scanning `input` yields at least one finding whose
    /// category label contains `label`.
    #[track_caller]
    fn assert_flagged(input: &str, label: &str) {
        let findings = scan(input);
        assert!(findings.iter().any(|hit| hit.category.contains(label)));
    }

    #[test]
    fn detect_openai_key() {
        assert_flagged("Use this key: sk-abcdefghijklmnopqrstuvwxyz", "OpenAI");
    }

    #[test]
    fn detect_aws_key() {
        assert_flagged("AKIAIOSFODNN7EXAMPLE", "AWS");
    }

    #[test]
    fn detect_korean_id() {
        assert_flagged("주민번호 901231-1234567", "Korean");
    }

    #[test]
    fn detect_us_ssn() {
        assert_flagged("SSN: 123-45-6789", "SSN");
    }

    #[test]
    fn detect_email() {
        assert_flagged("Contact: user@example.com", "Email");
    }

    #[test]
    fn detect_password() {
        assert_flagged("password=mysecret123", "Password");
    }

    #[test]
    fn detect_private_key() {
        assert_flagged("-----BEGIN RSA PRIVATE KEY-----", "Private Key");
    }

    #[test]
    fn detect_db_connection() {
        assert_flagged("postgres://user:pass@localhost:5432/db", "DB");
    }

    #[test]
    fn clean_input_returns_empty() {
        // Ordinary prose with a file path must not trip any pattern.
        let findings = scan("Please fix the bug in src/main.rs");
        assert!(findings.is_empty());
    }

    #[test]
    fn masking_works() {
        // Longer than four characters: the prefix is kept.
        assert_eq!(mask_value("sk-abcdefg"), "sk-a****");
        // Four characters or fewer: nothing is revealed.
        assert_eq!(mask_value("abc"), "****");
    }
}