Skip to main content

aa_core/
scanner.rs

1//! Credential leak detection using Aho-Corasick multi-pattern scanning.
2//!
3//! Only compiled when the `std` feature is enabled. The [`CredentialScanner`]
4//! is pre-compiled at construction time so each call to [`CredentialScanner::scan`]
5//! pays zero pattern-compilation cost.
6
7use aho_corasick::AhoCorasick;
8
// ---------------------------------------------------------------------------
// AC literal patterns — order matters: earlier index wins on same-position match.
// sk-ant- must precede sk- so Anthropic keys are not misclassified as OpenAI keys.
// ---------------------------------------------------------------------------

/// Built-in literal patterns compiled into the Aho-Corasick automaton.
///
/// The position of an entry in this slice is its pattern index, and `AC_KINDS`
/// lists the [`CredentialKind`] for the same index, so the two tables must be
/// edited together. The trailing comment on each row records that index and kind.
///
/// The automaton is built with `MatchKind::LeftmostFirst` (see
/// `CredentialScanner::with_config`), which is why a longer, more specific
/// prefix has to be listed before a shorter prefix it starts with.
const AC_PATTERNS: &[&str] = &[
    "sk-ant-",                               // 0  AnthropicKey
    "sk-",                                   // 1  OpenAiKey
    "AKIA",                                  // 2  AwsAccessKey
    "\"type\": \"service_account\"",         // 3  GcpServiceAccount
    "DefaultEndpointsProtocol=",             // 4  AzureConnectionString
    "ghp_",                                  // 5  GitHubPat
    "ghs_",                                  // 6  GitHubAppToken
    "xoxb-",                                 // 7  SlackBotToken
    "xoxp-",                                 // 8  SlackUserToken
    "xoxa-",                                 // 9  SlackOAuthToken
    "postgres://",                           // 10 PostgresUrl
    "mysql://",                              // 11 MysqlUrl
    "mongodb://",                            // 12 MongodbUrl
    "-----BEGIN RSA PRIVATE KEY-----",       // 13 RsaPrivateKey
    "-----BEGIN EC PRIVATE KEY-----",        // 14 EcPrivateKey
    "-----BEGIN OPENSSH PRIVATE KEY-----",   // 15 OpensshPrivateKey
    "-----BEGIN PRIVATE KEY-----",           // 16 PrivateKey
    "-----BEGIN PGP PRIVATE KEY BLOCK-----", // 17 PgpPrivateKey
];
34
/// Maps AC pattern index → [`CredentialKind`].
///
/// Entry `i` is the kind reported when `AC_PATTERNS[i]` matches. The scanner
/// indexes its copy of this table with the pattern index of every match, so
/// this slice must stay the same length and order as `AC_PATTERNS`; a missing
/// entry would surface as an out-of-bounds panic during scanning.
const AC_KINDS: &[CredentialKind] = &[
    CredentialKind::AnthropicKey,          // 0
    CredentialKind::OpenAiKey,             // 1
    CredentialKind::AwsAccessKey,          // 2
    CredentialKind::GcpServiceAccount,     // 3
    CredentialKind::AzureConnectionString, // 4
    CredentialKind::GitHubPat,             // 5
    CredentialKind::GitHubAppToken,        // 6
    CredentialKind::SlackBotToken,         // 7
    CredentialKind::SlackUserToken,        // 8
    CredentialKind::SlackOAuthToken,       // 9
    CredentialKind::PostgresUrl,           // 10
    CredentialKind::MysqlUrl,              // 11
    CredentialKind::MongodbUrl,            // 12
    CredentialKind::RsaPrivateKey,         // 13
    CredentialKind::EcPrivateKey,          // 14
    CredentialKind::OpensshPrivateKey,     // 15
    CredentialKind::PrivateKey,            // 16
    CredentialKind::PgpPrivateKey,         // 17
];
56
57// ---------------------------------------------------------------------------
58// Public types
59// ---------------------------------------------------------------------------
60
/// Category of a detected credential or sensitive value.
///
/// Every finding carries exactly one kind. The kind's name (see
/// [`CredentialKind::as_str`]) is what appears inside the
/// `[REDACTED:<kind>]` label that replaces the secret.
///
/// Variants are grouped by family below; within each family they are listed
/// alphabetically, which is unrelated to the matching priority of the literal
/// patterns.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum CredentialKind {
    // API keys
    /// Anthropic API key (prefix `sk-ant-`).
    AnthropicKey,
    /// AWS access key ID (prefix `AKIA`).
    AwsAccessKey,
    /// GCP service account JSON credential (contains `"type": "service_account"`).
    GcpServiceAccount,
    /// OpenAI API key (prefix `sk-`).
    OpenAiKey,
    // Cloud credentials
    /// Azure Storage connection string (prefix `DefaultEndpointsProtocol=`).
    AzureConnectionString,
    // Auth tokens
    /// GitHub App installation token (prefix `ghs_`).
    GitHubAppToken,
    /// GitHub personal access token (prefix `ghp_`).
    GitHubPat,
    /// Slack bot token (prefix `xoxb-`).
    SlackBotToken,
    /// Slack OAuth token (prefix `xoxa-`).
    SlackOAuthToken,
    /// Slack user token (prefix `xoxp-`).
    SlackUserToken,
    // Database URLs
    /// MongoDB connection URI (prefix `mongodb://`).
    MongodbUrl,
    /// MySQL connection URI (prefix `mysql://`).
    MysqlUrl,
    /// PostgreSQL connection URI (prefix `postgres://`).
    PostgresUrl,
    // Private keys
    /// PEM-encoded EC private key (`-----BEGIN EC PRIVATE KEY-----`).
    EcPrivateKey,
    /// PEM-encoded OpenSSH private key (`-----BEGIN OPENSSH PRIVATE KEY-----`).
    OpensshPrivateKey,
    /// PEM-encoded PGP private key block (`-----BEGIN PGP PRIVATE KEY BLOCK-----`).
    PgpPrivateKey,
    /// PEM-encoded PKCS#8 private key (`-----BEGIN PRIVATE KEY-----`).
    PrivateKey,
    /// PEM-encoded RSA private key (`-----BEGIN RSA PRIVATE KEY-----`).
    RsaPrivateKey,
    // PII
    /// Credit card number validated by the Luhn algorithm (13–19 digits).
    CreditCardLuhn,
    /// Email address containing `@` and a dot-separated domain.
    EmailAddress,
    /// US Social Security Number in `DDD-DD-DDDD` format.
    SsnPattern,
    // Generic
    /// High-entropy token (Shannon entropy > 4.5 bits/char, length 20–64 bytes).
    GenericHighEntropy,
    // Policy-defined
    /// A pattern defined in the policy document's `data.sensitive_patterns` field.
    ///
    /// Also used for the literal `custom_patterns` supplied through the
    /// scanner configuration.
    Custom,
}
120
121impl CredentialKind {
122    /// Returns the string used in the `[REDACTED:<kind>]` label.
123    pub fn as_str(&self) -> &'static str {
124        match self {
125            Self::AnthropicKey => "AnthropicKey",
126            Self::AwsAccessKey => "AwsAccessKey",
127            Self::AzureConnectionString => "AzureConnectionString",
128            Self::CreditCardLuhn => "CreditCardLuhn",
129            Self::EcPrivateKey => "EcPrivateKey",
130            Self::EmailAddress => "EmailAddress",
131            Self::GcpServiceAccount => "GcpServiceAccount",
132            Self::GenericHighEntropy => "GenericHighEntropy",
133            Self::GitHubAppToken => "GitHubAppToken",
134            Self::GitHubPat => "GitHubPat",
135            Self::MongodbUrl => "MongodbUrl",
136            Self::MysqlUrl => "MysqlUrl",
137            Self::OpenAiKey => "OpenAiKey",
138            Self::OpensshPrivateKey => "OpensshPrivateKey",
139            Self::PgpPrivateKey => "PgpPrivateKey",
140            Self::PostgresUrl => "PostgresUrl",
141            Self::PrivateKey => "PrivateKey",
142            Self::RsaPrivateKey => "RsaPrivateKey",
143            Self::SlackBotToken => "SlackBotToken",
144            Self::SlackOAuthToken => "SlackOAuthToken",
145            Self::SlackUserToken => "SlackUserToken",
146            Self::SsnPattern => "SsnPattern",
147            Self::Custom => "Custom",
148        }
149    }
150}
151
/// A single detected credential finding.
///
/// `offset` is the byte offset in the original text where the pattern was found.
/// `matched` is the redacted label, e.g. `[REDACTED:AwsAccessKey]`. The raw
/// secret is never stored.
///
/// The `end` field is intentionally private; it is used by [`ScanResult::redact`]
/// to splice the original match without exposing raw length arithmetic to callers.
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct CredentialFinding {
    /// Category of the detected credential.
    pub kind: CredentialKind,
    /// Byte offset in the original text where the pattern begins.
    pub offset: usize,
    /// Redacted label replacing the secret, e.g. `[REDACTED:AwsAccessKey]`.
    pub matched: String,
    // Byte offset one past the end of the span to redact. Excluded from the
    // serialized form via `serde(skip)`.
    // NOTE(review): with `serde(skip)` a deserialized finding presumably gets
    // the default `end` (0), so it no longer describes the original span —
    // confirm no caller redacts from a deserialized result.
    #[cfg_attr(feature = "serde", serde(skip))]
    end: usize,
}
172
173impl CredentialFinding {
174    fn new(kind: CredentialKind, offset: usize, end: usize) -> Self {
175        let label = format!("[REDACTED:{}]", kind.as_str());
176        Self {
177            kind,
178            offset,
179            matched: label,
180            end,
181        }
182    }
183
184    /// Construct a finding for a match produced by a policy-defined regex pattern.
185    ///
186    /// Used by `aa-gateway` when custom `data.sensitive_patterns` regexes match.
187    /// The `offset` and `end` are byte positions returned by the regex engine.
188    pub fn from_regex_match(offset: usize, end: usize) -> Self {
189        Self::new(CredentialKind::Custom, offset, end)
190    }
191}
192
/// The result of a [`CredentialScanner::scan`] call.
///
/// Holds the findings only; the scanned text itself is not retained, so the
/// same text must be passed again when calling [`ScanResult::redact`].
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct ScanResult {
    /// All credential findings detected in the scanned text, sorted by byte offset.
    ///
    /// Findings produced by different scan passes may overlap one another.
    pub findings: Vec<CredentialFinding>,
}
200
201impl ScanResult {
202    /// Returns `true` if no credential findings were detected.
203    pub fn is_clean(&self) -> bool {
204        self.findings.is_empty()
205    }
206
207    /// Returns a copy of `text` with every finding replaced by its redacted label.
208    ///
209    /// Replacements are applied in reverse offset order so earlier byte positions
210    /// remain valid after each splice. The `end` field of each finding records the
211    /// original match boundary and is used here without being exposed in the public API.
212    pub fn redact(&self, text: &str) -> String {
213        let mut sorted: Vec<&CredentialFinding> = self.findings.iter().collect();
214        sorted.sort_by_key(|b| std::cmp::Reverse(b.offset));
215        let mut result = text.to_string();
216        for finding in sorted {
217            if finding.end <= result.len() && finding.offset <= finding.end {
218                result.replace_range(finding.offset..finding.end, &finding.matched);
219            }
220        }
221        result
222    }
223}
224
/// Configuration for the credential scanner.
///
/// Controls whether scanning is enabled and allows adding custom literal
/// patterns beyond the built-in set.
///
/// The derived [`Default`] yields an enabled scanner (`disabled == false`)
/// with no custom patterns.
#[derive(Debug, Clone, Default)]
pub struct ScannerConfig {
    /// When `true`, scanning is disabled and [`CredentialScanner::scan`] always
    /// returns an empty [`ScanResult`].
    pub disabled: bool,
    /// Additional literal prefixes to detect as [`CredentialKind::Custom`].
    /// Each string is compiled into the Aho-Corasick automaton alongside the
    /// built-in patterns.
    ///
    /// These are matched as plain literals, not regular expressions, and are
    /// appended after the built-in patterns.
    pub custom_patterns: Vec<String>,
}
239
/// Pre-compiled multi-pattern credential scanner.
///
/// Construct once with [`CredentialScanner::new`] (or [`CredentialScanner::with_config`])
/// and call [`CredentialScanner::scan`] repeatedly. Pattern compilation happens at
/// construction time; each scan call is O(n) in the length of the input text.
pub struct CredentialScanner {
    /// Automaton compiled from the built-in literals followed by any custom
    /// literals from the configuration.
    patterns: AhoCorasick,
    /// Maps each AC pattern index to its [`CredentialKind`]. Built-in patterns
    /// use the static `AC_KINDS` entries; custom patterns are appended as
    /// [`CredentialKind::Custom`].
    kinds: Vec<CredentialKind>,
    /// When `true`, [`scan`](Self::scan) short-circuits and returns an empty result.
    disabled: bool,
}
254
255impl Default for CredentialScanner {
256    fn default() -> Self {
257        Self::new()
258    }
259}
260
261impl CredentialScanner {
262    /// Build the scanner with all built-in patterns and scanning enabled.
263    ///
264    /// # Panics
265    ///
266    /// Panics only if the hard-coded AC patterns are somehow invalid — this
267    /// cannot happen in practice.
268    pub fn new() -> Self {
269        Self::with_config(ScannerConfig::default())
270    }
271
272    /// Build the scanner from explicit configuration.
273    ///
274    /// Custom patterns are appended after the built-in set and are tagged as
275    /// [`CredentialKind::Custom`]. If `config.disabled` is true the scanner
276    /// is inert — [`scan`](Self::scan) always returns an empty result.
277    pub fn with_config(config: ScannerConfig) -> Self {
278        let mut all_patterns: Vec<&str> = AC_PATTERNS.to_vec();
279        // Collect custom pattern references — lifetime tied to `config`.
280        let custom_refs: Vec<&str> = config.custom_patterns.iter().map(|s| s.as_str()).collect();
281        all_patterns.extend_from_slice(&custom_refs);
282
283        let mut kinds: Vec<CredentialKind> = AC_KINDS.to_vec();
284        kinds.extend(std::iter::repeat(CredentialKind::Custom).take(config.custom_patterns.len()));
285
286        let ac = AhoCorasick::builder()
287            .match_kind(aho_corasick::MatchKind::LeftmostFirst)
288            .build(&all_patterns)
289            .expect("AC patterns are always valid");
290
291        Self {
292            patterns: ac,
293            kinds,
294            disabled: config.disabled,
295        }
296    }
297
298    /// Scan `text` for credential patterns and return a [`ScanResult`].
299    ///
300    /// Four passes are performed:
301    /// 1. Aho-Corasick literal prefix scan — O(n), 18 patterns covering API keys,
302    ///    auth tokens, cloud credentials, database URLs, and PEM private key headers.
303    /// 2. Credit card and SSN digit-sequence scan.
304    /// 3. Email address scan.
305    /// 4. High-entropy token scan (Shannon entropy > 4.5 bits/char, length 20–64).
306    pub fn scan(&self, text: &str) -> ScanResult {
307        if self.disabled {
308            return ScanResult { findings: Vec::new() };
309        }
310
311        let mut findings = Vec::new();
312
313        // Phase 1: AC literal prefix scan (API keys, auth tokens, cloud creds,
314        //          database URLs, PEM private key headers — 18 patterns + custom)
315        for mat in self.patterns.find_iter(text) {
316            let kind = self.kinds[mat.pattern()].clone();
317            let offset = mat.start();
318            let end = token_end(text, mat.end());
319            findings.push(CredentialFinding::new(kind, offset, end));
320        }
321
322        // Phase 2: PII — credit card numbers and SSN patterns
323        scan_digit_sequences(text, &mut findings);
324
325        // Phase 3: Email addresses
326        scan_emails(text, &mut findings);
327
328        // Phase 4: High-entropy tokens (Shannon entropy > 4.5 bits/char, length 20–64)
329        scan_high_entropy(text, &mut findings);
330
331        findings.sort_by_key(|f| f.offset);
332        ScanResult { findings }
333    }
334}
335
336// ---------------------------------------------------------------------------
337// Internal helpers
338// ---------------------------------------------------------------------------
339
/// Finds where the token containing byte position `from` stops.
///
/// Scans forward from `from` and returns the byte index of the first
/// whitespace character or one of the delimiters `"`, `'`, `,`, `;`, `)`,
/// `]`, `}`. If none is found, the length of `text` is returned.
fn token_end(text: &str, from: usize) -> usize {
    let tail = &text[from..];
    for (rel, ch) in tail.char_indices() {
        let is_delimiter = matches!(ch, '"' | '\'' | ',' | ';' | ')' | ']' | '}');
        if ch.is_whitespace() || is_delimiter {
            return from + rel;
        }
    }
    text.len()
}
348
/// Checks whether `s` is exactly an SSN-shaped string, `DDD-DD-DDDD`.
///
/// The string must be 11 bytes long, with `-` at byte positions 3 and 6 and
/// ASCII digits everywhere else.
fn is_ssn(s: &str) -> bool {
    let raw = s.as_bytes();
    if raw.len() != 11 {
        return false;
    }
    raw.iter().enumerate().all(|(pos, byte)| match pos {
        3 | 6 => *byte == b'-',
        _ => byte.is_ascii_digit(),
    })
}
359
/// Luhn checksum test for a candidate credit card number.
///
/// `digits` must consist solely of ASCII digits (no separators) and be 13 to
/// 19 bytes long; anything else yields `false`. Walking from the rightmost
/// digit, every second digit is doubled (subtracting 9 when the doubled value
/// exceeds 9) and the number is accepted when the total is a multiple of 10.
fn luhn_valid(digits: &str) -> bool {
    if !(13..=19).contains(&digits.len()) {
        return false;
    }
    let mut total = 0u32;
    for (pos, ch) in digits.chars().rev().enumerate() {
        let Some(digit) = ch.to_digit(10) else {
            return false;
        };
        // Positions are counted from the right; odd positions get doubled.
        total += match (pos % 2 == 1, digit * 2) {
            (false, _) => digit,
            (true, doubled) if doubled > 9 => doubled - 9,
            (true, doubled) => doubled,
        };
    }
    total % 10 == 0
}
387
388/// Scans `text` for credit card numbers (Luhn-validated) and SSN patterns (`DDD-DD-DDDD`).
389fn scan_digit_sequences(text: &str, findings: &mut Vec<CredentialFinding>) {
390    let bytes = text.as_bytes();
391    let mut i = 0;
392    while i < bytes.len() {
393        if !bytes[i].is_ascii_digit() {
394            i += 1;
395            continue;
396        }
397
398        let start = i;
399        let mut digits = String::new();
400        let mut j = i;
401        let limit = (start + 24).min(bytes.len());
402
403        while j < limit {
404            match bytes[j] {
405                b if b.is_ascii_digit() => {
406                    digits.push(b as char);
407                    j += 1;
408                }
409                b' ' | b'-' if !digits.is_empty() => {
410                    j += 1;
411                }
412                _ => break,
413            }
414        }
415
416        let end = j;
417        let segment = &text[start..end];
418
419        if is_ssn(segment) {
420            findings.push(CredentialFinding::new(CredentialKind::SsnPattern, start, end));
421        } else if digits.len() >= 13 && digits.len() <= 19 && luhn_valid(&digits) {
422            findings.push(CredentialFinding::new(CredentialKind::CreditCardLuhn, start, end));
423        }
424        i = end.max(i + 1);
425    }
426}
427
/// Shannon entropy of `s`, measured in bits per byte of the string.
///
/// Byte values are counted, each count is turned into a probability
/// `p = count / len`, and the result is the sum of `-p * log2(p)` over the
/// byte values that occur. An empty string has entropy `0.0`.
fn shannon_entropy(s: &str) -> f64 {
    if s.is_empty() {
        return 0.0;
    }

    let mut counts = [0u32; 256];
    s.bytes().for_each(|byte| counts[usize::from(byte)] += 1);

    let total = s.len() as f64;
    counts
        .iter()
        .copied()
        .filter(|&count| count > 0)
        .map(|count| {
            let p = f64::from(count) / total;
            -p * p.log2()
        })
        .sum()
}
446
447/// Scans `text` for high-entropy whitespace-delimited tokens (> 4.5 bits/char,
448/// length 20–64 bytes) and reports them as [`CredentialKind::GenericHighEntropy`].
449fn scan_high_entropy(text: &str, findings: &mut Vec<CredentialFinding>) {
450    let mut offset = 0usize;
451    for token in text.split_whitespace() {
452        let token_offset = text[offset..].find(token).map(|i| offset + i).unwrap_or(offset);
453        let token_end_pos = token_offset + token.len();
454        let len = token.len();
455        if (20..=64).contains(&len) && shannon_entropy(token) > 4.5 {
456            findings.push(CredentialFinding::new(
457                CredentialKind::GenericHighEntropy,
458                token_offset,
459                token_end_pos,
460            ));
461        }
462        offset = token_end_pos;
463    }
464}
465
466/// Scans `text` for email addresses by locating `@` signs and expanding outward.
467fn scan_emails(text: &str, findings: &mut Vec<CredentialFinding>) {
468    let mut search = text;
469    let mut base = 0usize;
470
471    while let Some(at) = search.find('@') {
472        let abs_at = base + at;
473
474        let local_start = text[..abs_at]
475            .rfind(|c: char| c.is_whitespace() || matches!(c, '<' | ',' | ';' | '"' | '\''))
476            .map(|i| i + 1)
477            .unwrap_or(0);
478
479        let domain_end = token_end(text, abs_at + 1);
480        let local = &text[local_start..abs_at];
481        let domain = &text[abs_at + 1..domain_end];
482
483        if !local.is_empty() && domain.contains('.') && domain.len() >= 3 {
484            findings.push(CredentialFinding::new(
485                CredentialKind::EmailAddress,
486                local_start,
487                domain_end,
488            ));
489        }
490
491        let next = abs_at + 1;
492        if next >= text.len() {
493            break;
494        }
495        search = &text[next..];
496        base = next;
497    }
498}
499
500// ---------------------------------------------------------------------------
501// Tests
502// ---------------------------------------------------------------------------
503
504#[cfg(test)]
505mod tests {
506    use super::*;
507
508    // --- CredentialKind::as_str ---
509
510    #[test]
511    fn credential_kind_as_str_round_trips() {
512        assert_eq!(CredentialKind::AnthropicKey.as_str(), "AnthropicKey");
513        assert_eq!(CredentialKind::AwsAccessKey.as_str(), "AwsAccessKey");
514        assert_eq!(CredentialKind::GenericHighEntropy.as_str(), "GenericHighEntropy");
515    }
516
517    // --- API key patterns ---
518
519    #[test]
520    fn detects_anthropic_key() {
521        let scanner = CredentialScanner::new();
522        let result = scanner.scan("auth: sk-ant-api03-XXXXXXXXXXXXXXXXXXXX");
523        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::AnthropicKey));
524    }
525
526    #[test]
527    fn detects_openai_key_not_misclassified_as_anthropic() {
528        let scanner = CredentialScanner::new();
529        let result = scanner.scan("key: sk-proj-XXXXXXXXXXXXXXXXXXXX");
530        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::OpenAiKey));
531        assert!(!result.findings.iter().any(|f| f.kind == CredentialKind::AnthropicKey));
532    }
533
534    #[test]
535    fn detects_aws_access_key() {
536        let scanner = CredentialScanner::new();
537        let result = scanner.scan("AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE");
538        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::AwsAccessKey));
539    }
540
541    #[test]
542    fn detects_gcp_service_account() {
543        let scanner = CredentialScanner::new();
544        let result = scanner.scan(r#"{"type": "service_account", "project_id": "my-project"}"#);
545        assert!(result
546            .findings
547            .iter()
548            .any(|f| f.kind == CredentialKind::GcpServiceAccount));
549    }
550
551    // --- Auth token patterns ---
552
553    #[test]
554    fn detects_github_pat() {
555        let scanner = CredentialScanner::new();
556        let result = scanner.scan("token: ghp_1234567890abcdefghijklmnopqrstuvwxyz");
557        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::GitHubPat));
558    }
559
560    #[test]
561    fn detects_github_app_token() {
562        let scanner = CredentialScanner::new();
563        let result = scanner.scan("token: ghs_1234567890abcdefghijklmnopqrstuvwxyz");
564        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::GitHubAppToken));
565    }
566
567    #[test]
568    fn detects_slack_bot_token() {
569        let scanner = CredentialScanner::new();
570        let result = scanner.scan("SLACK_BOT_TOKEN=xoxb-123456789012-123456789012-XXXXXXXXXXXX");
571        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::SlackBotToken));
572    }
573
574    #[test]
575    fn detects_slack_user_token() {
576        let scanner = CredentialScanner::new();
577        let result = scanner.scan("token=xoxp-123456789012-123456789012-XXXXXXXXXXXX");
578        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::SlackUserToken));
579    }
580
581    #[test]
582    fn detects_slack_oauth_token() {
583        let scanner = CredentialScanner::new();
584        let result = scanner.scan("oauth=xoxa-123456789012-123456789012-XXXXXXXXXXXX");
585        assert!(result
586            .findings
587            .iter()
588            .any(|f| f.kind == CredentialKind::SlackOAuthToken));
589    }
590
591    // --- Cloud credential patterns ---
592
593    #[test]
594    fn detects_azure_connection_string() {
595        let scanner = CredentialScanner::new();
596        let result = scanner.scan("DefaultEndpointsProtocol=https;AccountName=myaccount;AccountKey=XXXX");
597        assert!(result
598            .findings
599            .iter()
600            .any(|f| f.kind == CredentialKind::AzureConnectionString));
601    }
602
603    // --- Database URL patterns ---
604
605    #[test]
606    fn detects_postgres_url() {
607        let scanner = CredentialScanner::new();
608        let result = scanner.scan("DATABASE_URL=postgres://user:password@host:5432/db");
609        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::PostgresUrl));
610    }
611
612    #[test]
613    fn detects_mysql_url() {
614        let scanner = CredentialScanner::new();
615        let result = scanner.scan("db=mysql://user:secret@localhost/mydb");
616        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::MysqlUrl));
617    }
618
619    #[test]
620    fn detects_mongodb_url() {
621        let scanner = CredentialScanner::new();
622        let result = scanner.scan("uri=mongodb://admin:pass@cluster0.mongodb.net/mydb");
623        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::MongodbUrl));
624    }
625
626    // --- Private key patterns ---
627
628    #[test]
629    fn detects_rsa_private_key() {
630        let scanner = CredentialScanner::new();
631        let result =
632            scanner.scan("-----BEGIN RSA PRIVATE KEY-----\nMIIEpAIBAAKCAQEA...\n-----END RSA PRIVATE KEY-----");
633        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::RsaPrivateKey));
634    }
635
636    #[test]
637    fn detects_ec_private_key() {
638        let scanner = CredentialScanner::new();
639        let result = scanner.scan("-----BEGIN EC PRIVATE KEY-----\nMHQCAQEEI...\n-----END EC PRIVATE KEY-----");
640        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::EcPrivateKey));
641    }
642
643    #[test]
644    fn detects_openssh_private_key() {
645        let scanner = CredentialScanner::new();
646        let result = scanner
647            .scan("-----BEGIN OPENSSH PRIVATE KEY-----\nb3BlbnNzaC1rZXkAAAA=\n-----END OPENSSH PRIVATE KEY-----");
648        assert!(result
649            .findings
650            .iter()
651            .any(|f| f.kind == CredentialKind::OpensshPrivateKey));
652    }
653
654    #[test]
655    fn detects_generic_private_key() {
656        let scanner = CredentialScanner::new();
657        let result = scanner.scan("-----BEGIN PRIVATE KEY-----\nMIIEvAIBADANBgk=\n-----END PRIVATE KEY-----");
658        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::PrivateKey));
659    }
660
661    #[test]
662    fn detects_pgp_private_key() {
663        let scanner = CredentialScanner::new();
664        let result =
665            scanner.scan("-----BEGIN PGP PRIVATE KEY BLOCK-----\nlQOYBF...\n-----END PGP PRIVATE KEY BLOCK-----");
666        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::PgpPrivateKey));
667    }
668
669    // --- PII patterns ---
670
671    #[test]
672    fn detects_credit_card_luhn() {
673        let scanner = CredentialScanner::new();
674        let result = scanner.scan("card: 4532015112830366");
675        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::CreditCardLuhn));
676    }
677
678    #[test]
679    fn detects_credit_card_with_spaces() {
680        let scanner = CredentialScanner::new();
681        let result = scanner.scan("card: 4532 0151 1283 0366");
682        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::CreditCardLuhn));
683    }
684
685    #[test]
686    fn does_not_flag_invalid_luhn() {
687        let scanner = CredentialScanner::new();
688        let result = scanner.scan("num: 4532015112830367");
689        assert!(!result.findings.iter().any(|f| f.kind == CredentialKind::CreditCardLuhn));
690    }
691
692    #[test]
693    fn detects_ssn() {
694        let scanner = CredentialScanner::new();
695        let result = scanner.scan("SSN: 123-45-6789");
696        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::SsnPattern));
697    }
698
699    #[test]
700    fn detects_email_address() {
701        let scanner = CredentialScanner::new();
702        let result = scanner.scan("contact: user@example.com for support");
703        assert!(result.findings.iter().any(|f| f.kind == CredentialKind::EmailAddress));
704    }
705
706    // --- High-entropy ---
707
708    #[test]
709    fn detects_high_entropy_token() {
710        let scanner = CredentialScanner::new();
711        let result = scanner.scan("secret: xK9mP2nQvR7sT4wY1aB6dF3hJ8lN0eC5");
712        assert!(result
713            .findings
714            .iter()
715            .any(|f| f.kind == CredentialKind::GenericHighEntropy));
716    }
717
718    #[test]
719    fn does_not_flag_short_token_as_high_entropy() {
720        let scanner = CredentialScanner::new();
721        let result = scanner.scan("word: hello");
722        assert!(!result
723            .findings
724            .iter()
725            .any(|f| f.kind == CredentialKind::GenericHighEntropy));
726    }
727
728    // --- luhn_valid helper ---
729
730    #[test]
731    fn luhn_valid_visa_test_number() {
732        assert!(luhn_valid("4532015112830366"));
733    }
734
735    #[test]
736    fn luhn_valid_mastercard_test_number() {
737        assert!(luhn_valid("5425233430109903"));
738    }
739
740    #[test]
741    fn luhn_valid_amex_test_number() {
742        assert!(luhn_valid("371449635398431"));
743    }
744
745    #[test]
746    fn luhn_valid_discover_test_number() {
747        assert!(luhn_valid("6011111111111117"));
748    }
749
750    #[test]
751    fn luhn_invalid_altered_digit() {
752        assert!(!luhn_valid("4532015112830367"));
753    }
754
755    #[test]
756    fn luhn_rejects_too_short() {
757        assert!(!luhn_valid("123456789012"));
758    }
759
760    #[test]
761    fn luhn_rejects_too_long() {
762        assert!(!luhn_valid("45320151128303661234"));
763    }
764
765    // --- shannon_entropy helper ---
766
767    #[test]
768    fn entropy_zero_for_empty() {
769        assert_eq!(shannon_entropy(""), 0.0);
770    }
771
772    #[test]
773    fn entropy_low_for_repeated_char() {
774        assert!(shannon_entropy("aaaaaaaaaaaaaaaaaaaaaa") < 1.0);
775    }
776
777    #[test]
778    fn entropy_high_for_random_base64() {
779        assert!(shannon_entropy("xK9mP2nQvR7sT4wY1aB6dF3hJ8lN0") > 4.0);
780    }
781
782    #[test]
783    fn entropy_moderate_for_english_text() {
784        let e = shannon_entropy("Thequickbrownfoxjumpsoverthelazydog");
785        assert!(e > 3.0 && e < 5.0);
786    }
787
788    // --- ScanResult::redact() and is_clean() ---
789
790    #[test]
791    fn redact_replaces_github_pat() {
792        let scanner = CredentialScanner::new();
793        let text = "key: ghp_abc123XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX end";
794        let result = scanner.scan(text);
795        let redacted = result.redact(text);
796        assert!(!redacted.contains("ghp_"));
797        assert!(redacted.contains("[REDACTED:GitHubPat]"));
798    }
799
800    #[test]
801    fn redact_is_deterministic() {
802        let scanner = CredentialScanner::new();
803        let text = "key: ghp_abc123XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";
804        let result = scanner.scan(text);
805        assert_eq!(result.redact(text), result.redact(text));
806    }
807
808    #[test]
809    fn redact_clean_text_unchanged() {
810        let scanner = CredentialScanner::new();
811        let text = "This is a normal sentence with no secrets.";
812        let result = scanner.scan(text);
813        assert!(result.is_clean());
814        assert_eq!(result.redact(text), text);
815    }
816
817    #[test]
818    fn redact_multiple_findings_in_one_pass() {
819        let scanner = CredentialScanner::new();
820        let text = "a=ghp_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX b=postgres://u:p@host/db";
821        let result = scanner.scan(text);
822        let redacted = result.redact(text);
823        assert!(!redacted.contains("ghp_"));
824        assert!(!redacted.contains("postgres://"));
825        assert!(redacted.contains("[REDACTED:GitHubPat]"));
826        assert!(redacted.contains("[REDACTED:PostgresUrl]"));
827    }
828
829    #[test]
830    fn is_clean_true_for_benign_text() {
831        let scanner = CredentialScanner::new();
832        assert!(scanner.scan("Hello, world! No secrets here.").is_clean());
833    }
834
835    // --- CredentialKind::Custom and CredentialFinding::from_regex_match ---
836
837    #[test]
838    fn custom_kind_as_str_returns_custom() {
839        assert_eq!(CredentialKind::Custom.as_str(), "Custom");
840    }
841
842    #[test]
843    fn from_regex_match_creates_custom_finding() {
844        let finding = CredentialFinding::from_regex_match(5, 20);
845        assert_eq!(finding.kind, CredentialKind::Custom);
846        assert_eq!(finding.offset, 5);
847        assert_eq!(finding.matched, "[REDACTED:Custom]");
848    }
849
850    // --- False-positive corpus ---
851
852    #[test]
853    fn false_positive_corpus_has_no_hard_credential_hits() {
854        let scanner = CredentialScanner::new();
855        let corpus = [
856            "The quick brown fox jumps over the lazy dog.",
857            "fn main() { println!(\"Hello, world!\"); }",
858            "SELECT * FROM users WHERE id = 42;",
859            "cargo build --release --features std",
860            "version = \"1.0.0\" edition = \"2021\"",
861            "2026-04-27T15:34:15.377+0800",
862            "error[E0382]: borrow of moved value: `x`",
863        ];
864        for text in &corpus {
865            let result = scanner.scan(text);
866            let hard: Vec<_> = result
867                .findings
868                .iter()
869                .filter(|f| f.kind != CredentialKind::GenericHighEntropy)
870                .collect();
871            assert!(hard.is_empty(), "false positive in: {:?} → {:?}", text, hard);
872        }
873    }
874
875    // --- ScannerConfig ---
876
877    #[test]
878    fn disabled_scanner_returns_empty_result() {
879        let config = ScannerConfig {
880            disabled: true,
881            ..Default::default()
882        };
883        let scanner = CredentialScanner::with_config(config);
884        let result = scanner.scan("sk-proj-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX ghp_XXXXXXXXX");
885        assert!(result.is_clean(), "disabled scanner must return no findings");
886    }
887
888    #[test]
889    fn custom_pattern_detected_as_custom_kind() {
890        let config = ScannerConfig {
891            custom_patterns: vec!["INTERNAL_SECRET_".into()],
892            ..Default::default()
893        };
894        let scanner = CredentialScanner::with_config(config);
895        let result = scanner.scan("token=INTERNAL_SECRET_hello");
896        let custom: Vec<_> = result
897            .findings
898            .iter()
899            .filter(|f| f.kind == CredentialKind::Custom)
900            .collect();
901        assert!(!custom.is_empty(), "custom pattern must produce a Custom finding");
902        assert!(custom[0].matched.contains("[REDACTED:Custom]"));
903    }
904
905    #[test]
906    fn custom_pattern_coexists_with_builtin() {
907        let config = ScannerConfig {
908            custom_patterns: vec!["MY_TOKEN_".into()],
909            ..Default::default()
910        };
911        let scanner = CredentialScanner::with_config(config);
912        let text = "a=ghp_XXXXXXXXX b=MY_TOKEN_secret123";
913        let result = scanner.scan(text);
914        let kinds: Vec<_> = result.findings.iter().map(|f| &f.kind).collect();
915        assert!(kinds.contains(&&CredentialKind::GitHubPat));
916        assert!(kinds.contains(&&CredentialKind::Custom));
917    }
918
919    #[test]
920    fn default_config_matches_new() {
921        let default_scanner = CredentialScanner::new();
922        let config_scanner = CredentialScanner::with_config(ScannerConfig::default());
923        let text = "key=ghp_XXXXXXXXX url=postgres://u:p@host/db";
924        let r1 = default_scanner.scan(text);
925        let r2 = config_scanner.scan(text);
926        assert_eq!(r1.findings.len(), r2.findings.len());
927        for (a, b) in r1.findings.iter().zip(r2.findings.iter()) {
928            assert_eq!(a.kind, b.kind);
929            assert_eq!(a.offset, b.offset);
930        }
931    }
932}