khive_runtime/
secret_gate.rs

1//! Write-time secret detection gate (issue #76).
2//!
3//! Scans caller-supplied content strings before any storage write.  A match
4//! causes a hard `RuntimeError::SecretDetected` that names the detector and
5//! carries a masked excerpt — it never echoes the full candidate back.
6//!
7//! Scope: **credentials only** — API keys, tokens, private keys, passwords,
8//! and connection strings with embedded credentials.  General PII such as
9//! email addresses, phone numbers, and company names is intentionally NOT
10//! blocked; those are normal knowledge-graph content.
11//!
12//! Detection is layered, cheap-first:
13//!
14//! 1. **Known-prefix / known-shape patterns** — AWS AKIA/ASIA, GitHub tokens,
15//!    OpenAI `sk-proj-`, Anthropic `sk-ant-`, Stripe live keys, Fly.io tokens,
16//!    Vercel secrets, Slack `xox*`, JWT triples, PEM private-key headers,
17//!    Age secret keys, URL userinfo (`scheme://user:pass@`).
18//!    Bare `sk-` is also checked but only when NOT followed by a known safe
19//!    word boundary (e.g. `sk-learn`, `sk-image`).
//! 2. **High-entropy token heuristic** — base64/hex/base64url runs ≥ 24 chars
//!    near a trigger word (key, secret, password, passwd, credential, bearer,
//!    auth, apikey, api_key, access_key, private_key).  `token` counts as a
//!    trigger only as a standalone word or in `token=` / `token:` form, so
//!    `tokenizer_*`, `token_count`, etc. are not blocked.
24//!
25//! Allowlist (false-positive suppression):
//! - Pure hex strings (sha256, git SHA) — passed when no trigger word is
//!   nearby; 32/40/64/128-char hex next to a trigger word is still blocked.
27//! - UUID canonical form (`xxxxxxxx-xxxx-…`) — passed.
28//! - Base64/base64url content hashes with an explicit `sha<N>-` prefix (SRI
29//!   hashes, npm lockfile integrity) — passed when not preceded by a known-vendor
30//!   prefix.  Bare base64 tokens without the `sha<N>-` prefix are NOT passed.
31//! - Strings that are entirely ASCII punctuation/whitespace (e.g. code) — not
32//!   subject to the entropy heuristic, only the literal-prefix checks apply.
33
34use crate::error::{RuntimeError, RuntimeResult};
35
36// ─── Public API ──────────────────────────────────────────────────────────────
37
/// A credential-shaped finding reported by the secret gate.
///
/// Holds the name of the detector that fired plus a masked excerpt of the
/// offending text.  The unmasked candidate is not part of this value.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SecretMatch {
    /// Name of the detector that produced this match.
    pub detector: &'static str,
    /// Masked excerpt: a short leading preview of the candidate followed by
    /// its total length.
    pub masked: String,
}

impl std::fmt::Display for SecretMatch {
    /// Formats the match as a one-line, human-readable message containing the
    /// detector name and the masked excerpt.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { detector, masked } = self;
        write!(
            f,
            "content matches secret pattern {detector} at masked excerpt {masked}"
        )
    }
}
59
60/// Hard-block content from being written.
61///
62/// Returns `Err(RuntimeError::SecretDetected)` on the first match found, or
63/// `Ok(())` if no secret pattern fires.
64pub fn check(content: &str) -> RuntimeResult<()> {
65    if let Some(m) = scan(content) {
66        return Err(RuntimeError::SecretDetected(m));
67    }
68    Ok(())
69}
70
/// Recursively scan a JSON value for credential-shaped strings.
///
/// Public entry point that delegates to `scan_json_value`, which walks
/// string values, array elements, and object keys and values at every
/// nesting level.  Null, numeric, and boolean JSON values are skipped.
///
/// # Errors
///
/// Returns `RuntimeError::SecretDetected` for the first string (value or
/// object key) that the gate rejects.
pub fn check_json(value: &serde_json::Value) -> RuntimeResult<()> {
    scan_json_value(value)
}
79
80/// Scan a string-tagged slice (entity/note tags).
81///
82/// Each tag string is scanned individually.
83pub fn check_tags(tags: &[String]) -> RuntimeResult<()> {
84    for tag in tags {
85        check(tag)?;
86    }
87    Ok(())
88}
89
90fn scan_json_value(value: &serde_json::Value) -> RuntimeResult<()> {
91    match value {
92        serde_json::Value::String(s) => check(s),
93        serde_json::Value::Array(arr) => {
94            for v in arr {
95                scan_json_value(v)?;
96            }
97            Ok(())
98        }
99        serde_json::Value::Object(map) => {
100            for (k, v) in map {
101                // Scan both the key (a credential can appear as a JSON key name)
102                // and the value recursively.
103                check(k)?;
104                scan_json_value(v)?;
105            }
106            Ok(())
107        }
108        _ => Ok(()),
109    }
110}
111
112// ─── Scanner ─────────────────────────────────────────────────────────────────
113
114/// Return the first `SecretMatch` found in `text`, or `None`.
115fn scan(text: &str) -> Option<SecretMatch> {
116    // Layer 1: known prefix / shape patterns (no allocation per check).
117    if let Some(m) = check_known_patterns(text) {
118        return Some(m);
119    }
120    // Layer 2: entropy heuristic on long tokens near trigger words.
121    if let Some(m) = check_entropy_heuristic(text) {
122        return Some(m);
123    }
124    None
125}
126
127// ─── Layer 1: known patterns ─────────────────────────────────────────────────
128
/// Literal-prefix detectors. Each entry is
/// `(detector_name, needle, min_total_token_len)`.
///
/// The needle must appear as a word-boundary-adjacent prefix of a token.
/// `min_total_token_len` is the minimum length of the whole token (needle plus
/// remainder); it stops the bare prefix from firing without a payload.
///
/// Entries are tried in table order and the first hit wins, so order decides
/// which detector name is reported when several needles would match.
///
/// The bare `sk-` prefix is deliberately NOT listed here: it needs a
/// safe-word exclusion and is handled separately, after this table has been
/// tried, so the more specific `sk-proj-` / `sk-ant-` entries report first.
const PREFIX_DETECTORS: &[(&str, &str, usize)] = &[
    // AWS access key ids (long-lived AKIA, temporary ASIA).
    ("aws-access-key-id", "AKIA", 20),
    ("aws-access-key-id", "ASIA", 20),
    // GitHub tokens: personal access (ghp_), OAuth (gho_), GitHub App
    // user-to-server (ghu_), server-to-server (ghs_), refresh (ghr_), and
    // fine-grained personal access tokens (github_pat_).
    ("github-token", "ghp_", 36),
    ("github-token", "gho_", 36),
    ("github-token", "ghu_", 36),
    ("github-token", "ghs_", 36),
    ("github-token", "ghr_", 36),
    ("github-token", "github_pat_", 20),
    // OpenAI project keys.
    ("openai-api-key", "sk-proj-", 40),
    // Anthropic
    ("anthropic-api-key", "sk-ant-", 20),
    // Stripe live keys
    ("stripe-secret-key", "sk_live_", 30),
    ("stripe-restricted-key", "rk_live_", 30),
    // Fly.io (fm2_ prefix only — FlyV1 handled separately because it embeds a space)
    ("fly-token", "fm2_", 20),
    // Vercel
    ("vercel-token", "vercel_", 20),
    // Slack
    ("slack-token", "xoxb-", 40),
    ("slack-token", "xoxa-", 40),
    ("slack-token", "xoxp-", 40),
    ("slack-token", "xoxr-", 40),
    ("slack-token", "xoxs-", 40),
    // Age secret key
    ("age-secret-key", "AGE-SECRET-KEY-", 60),
];
164
/// Known safe compound words that start with `sk-` but are not credentials.
/// E.g. scikit-learn slugs such as `sk-learn`, `sk-image`, `sk-lego`.
///
/// A bare-`sk-` candidate that starts with any of these strings is not
/// reported by the bare-`sk-` detector.
const SK_SAFE_PREFIXES: &[&str] = &["sk-learn", "sk-image", "sk-lego", "sk-base", "sk-misc"];
168
169/// Shape-based patterns checked with custom logic.
170fn check_known_patterns(text: &str) -> Option<SecretMatch> {
171    // --- Prefix patterns ---
172    for &(name, needle, min_len) in PREFIX_DETECTORS {
173        if let Some(m) = find_prefix_token(text, needle, min_len) {
174            return Some(build_match(name, m));
175        }
176    }
177
178    // --- Bare `sk-` (after all more-specific sk- detectors above) ---
179    // Require length ≥ 30 AND exclude known safe scikit/library compound words.
180    if let Some(token) = find_prefix_token(text, "sk-", 30) {
181        if !SK_SAFE_PREFIXES.iter().any(|safe| token.starts_with(safe)) {
182            return Some(build_match("openai-api-key", token));
183        }
184    }
185
186    // --- Fly.io FlyV1 token: "FlyV1 <base64-payload>" ---
187    // The format embeds a space, so the generic prefix extractor (which stops at
188    // whitespace) cannot measure the combined length.  Check for `FlyV1 ` followed
189    // by ≥ 4 non-whitespace characters as the payload.
190    if let Some(pos) = text.find("FlyV1 ") {
191        let at_boundary = pos == 0 || {
192            text[..pos]
193                .chars()
194                .next_back()
195                .is_none_or(|c| !c.is_alphanumeric())
196        };
197        if at_boundary {
198            let payload_start = pos + 6; // skip "FlyV1 "
199            let payload = extract_token(&text[payload_start..]);
200            if payload.len() >= 4 {
201                let candidate = &text[pos..payload_start + payload.len()];
202                return Some(build_match("fly-token", candidate));
203            }
204        }
205    }
206
207    // --- PEM private key block ---
208    // "-----BEGIN <TYPE> PRIVATE KEY-----"
209    if text.contains("-----BEGIN") && text.contains("PRIVATE KEY-----") {
210        if let Some(pos) = text.find("-----BEGIN") {
211            // Measure only the key block itself (up to END marker or end-of-string),
212            // not the rest of the surrounding text, so build_match reports the
213            // block length rather than the remaining string length.
214            let block_end = text[pos..]
215                .find("-----END")
216                .map(|rel| {
217                    text[pos + rel..]
218                        .find('\n')
219                        .map(|l| pos + rel + l + 1)
220                        .unwrap_or(text.len())
221                })
222                .unwrap_or(text.len());
223            let excerpt = &text[pos..block_end];
224            return Some(build_match("pem-private-key", excerpt));
225        }
226    }
227
228    // --- JWT triple: eyJ...eyJ...eyJ (header.payload.signature) ---
229    // A JWT starts with "eyJ" (base64url of `{"`) and has exactly two dots.
230    if let Some(m) = find_jwt(text) {
231        return Some(build_match("jwt", m));
232    }
233
234    // --- URL userinfo: scheme://user:pass@host ---
235    if let Some(m) = find_url_userinfo(text) {
236        return Some(build_match("url-userinfo", m));
237    }
238
239    None
240}
241
242/// Locate the first token in `text` that starts with `needle` and has a
243/// total length >= `min_len`.  Returns a slice of the full token on match.
244fn find_prefix_token<'a>(text: &'a str, needle: &str, min_len: usize) -> Option<&'a str> {
245    let mut start = 0;
246    while let Some(rel) = text[start..].find(needle) {
247        let abs = start + rel;
248        // Require that the needle starts at a token boundary (start-of-string
249        // or preceded by whitespace / punctuation that isn't alphanumeric).
250        let at_boundary = abs == 0 || {
251            let prev = text[..abs].chars().next_back().unwrap_or(' ');
252            !prev.is_alphanumeric()
253        };
254        if at_boundary {
255            let token = extract_token(&text[abs..]);
256            if token.len() >= min_len {
257                return Some(token);
258            }
259        }
260        start = abs + needle.len().max(1);
261    }
262    None
263}
264
/// Scan for a JWT-shaped triple `header.payload.signature`.
///
/// A match requires, inside one run of non-whitespace text:
/// - a header that starts with `eyJ` (base64url of `{"`) and is at least
///   10 bytes long up to the next `.`;
/// - a payload segment that also starts with `eyJ` and is at least 10 bytes;
/// - a second `.` after the payload (the signature itself is not inspected).
///
/// Returns the slice from the start of the header to the end of the
/// whitespace-delimited run.
///
/// Every `.`-separated segment of a run is considered as a possible header,
/// so a stray `eyJ…` fragment earlier in the same run cannot hide a real JWT
/// that follows it.  The scan stays linear in the length of `text`.
fn find_jwt(text: &str) -> Option<&str> {
    const MARKER: &str = "eyJ";
    const MIN_SEGMENT_LEN: usize = 10;

    let mut run_start = 0;
    for run in text.split([' ', '\n', '\r', '\t']) {
        let run_end = run_start + run.len();
        let mut segment_start = run_start;
        // Absolute offset of a header found in the previous segment.
        let mut header_start: Option<usize> = None;
        // Absolute offset of a header whose payload (previous segment) was valid.
        let mut jwt_start: Option<usize> = None;

        for segment in run.split('.') {
            // Reaching another segment means the second dot exists.
            if let Some(start) = jwt_start {
                return Some(&text[start..run_end]);
            }
            let payload_ok = segment.starts_with(MARKER) && segment.len() >= MIN_SEGMENT_LEN;
            jwt_start = header_start.filter(|_| payload_ok);
            // The first `eyJ` in a segment gives the longest possible header.
            header_start = segment
                .find(MARKER)
                .filter(|&at| segment.len() - at >= MIN_SEGMENT_LEN)
                .map(|at| segment_start + at);
            segment_start += segment.len() + 1; // skip the '.'
        }
        run_start = run_end + 1; // skip the one-byte whitespace separator
    }
    None
}
299
/// Detect `scheme://user:pass@host` URLs that embed credentials.
///
/// For every `://` in `text`:
/// - the authority is the text up to the next `/`, `?`, `#`, or whitespace,
///   so an `@` in a path or query (`http://localhost:8080/users/@me`) is not
///   mistaken for userinfo;
/// - the userinfo is everything before the last `@` in the authority;
/// - it must contain a `:` with a non-empty user and a password of at least
///   4 bytes.
///
/// Returns the slice from the start of the scheme to the next space or line
/// break after the `@` (or the end of `text`).
fn find_url_userinfo(text: &str) -> Option<&str> {
    const SEPARATOR: &str = "://";
    const MIN_PASSWORD_LEN: usize = 4;

    let mut cursor = 0;
    while let Some(rel) = text[cursor..].find(SEPARATOR) {
        let separator_at = cursor + rel;
        let authority_start = separator_at + SEPARATOR.len();
        let rest = &text[authority_start..];
        let authority_len = rest
            .find(|c: char| matches!(c, '/' | '?' | '#') || c.is_whitespace())
            .unwrap_or(rest.len());
        let authority = &rest[..authority_len];

        if let Some(at) = authority.rfind('@') {
            if let Some((user, pass)) = authority[..at].split_once(':') {
                if !user.is_empty() && pass.len() >= MIN_PASSWORD_LEN {
                    // Walk back over the scheme.  Only ASCII characters are
                    // accepted, so the character count equals the byte count
                    // and the start offset is always a char boundary, even
                    // when a multi-byte character precedes the scheme.
                    let scheme_len = text[..separator_at]
                        .chars()
                        .rev()
                        .take_while(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
                        .count();
                    let scheme_start = separator_at - scheme_len;
                    let after_at = authority_start + at + 1;
                    let end = text[after_at..]
                        .find([' ', '\n', '\r'])
                        .map_or(text.len(), |p| after_at + p);
                    return Some(&text[scheme_start..end]);
                }
            }
        }
        cursor = authority_start;
    }
    None
}
343
344// ─── Layer 2: entropy heuristic ─────────────────────────────────────────────
345
/// Trigger words near which high-entropy tokens are suspicious.
///
/// Each entry is matched as a plain substring of the lowercased context
/// window, so a word such as `key` also matches inside longer words.
///
/// The bare substring `token` is NOT in this list because it fires on benign
/// terms like `tokenizer`, `token_count`, and `next_token`.  Instead we use
/// the dedicated boundary-aware helpers `has_standalone_token` (standalone word)
/// and `has_token_assignment` (`token=` / `token:` with word boundary before).
const TRIGGER_WORDS: &[&str] = &[
    "key",
    "secret",
    "password",
    "passwd",
    "credential",
    "bearer",
    "auth",
    "apikey",
    "api_key",
    "access_key",
    "private_key",
];
365
/// Minimum token length (in bytes) to apply the entropy check.
const MIN_ENTROPY_LEN: usize = 24;

/// Shannon entropy threshold (bits per character) at or above which a token
/// is considered high-entropy.
///
/// 4.5 sits above the 4.0 bits/char ceiling of a pure hex alphabet
/// (log2 16) and below the 6.0 bits/char ceiling of base64 (log2 64).
///
/// NOTE(review): the measured entropy of an n-character token can never
/// exceed log2(n), so a 24-character token tops out at about 4.58 bits/char.
const ENTROPY_THRESHOLD: f64 = 4.5;

/// Number of bytes of context examined on each side of a candidate token when
/// looking for a trigger word.
const TRIGGER_WINDOW: usize = 120;
376
377fn check_entropy_heuristic(text: &str) -> Option<SecretMatch> {
378    // Tokenize once: collect all whitespace-delimited tokens with their byte offsets.
379    let tokens: Vec<(usize, &str)> = text
380        .split_ascii_whitespace()
381        .map(|t| {
382            let offset = t.as_ptr() as usize - text.as_ptr() as usize;
383            (offset, t)
384        })
385        .collect();
386
387    for &(tok_offset, raw_token) in &tokens {
388        // Strip common delimiters that wrap the actual value.
389        let token = strip_delimiters(raw_token);
390        if token.len() < MIN_ENTROPY_LEN {
391            continue;
392        }
393
394        // UUID and sha-prefixed base64 content hashes (SRI / npm lockfile) are
395        // unconditionally allowlisted: their forms are unambiguous regardless of
396        // surrounding context.
397        if is_uuid_canonical(token) || is_base64_content_hash(token) {
398            continue;
399        }
400
401        // Compute the trigger window before deciding whether to allowlist hex
402        // tokens.  A pure-hex token near a credential trigger word cannot be
403        // safely assumed to be a non-secret hash and must be entropy-checked.
404        let window_start = tok_offset.saturating_sub(TRIGGER_WINDOW);
405        let window_end = (tok_offset + raw_token.len() + TRIGGER_WINDOW).min(text.len());
406        let window = &text[window_start..window_end];
407        let low_window = window.to_ascii_lowercase();
408
409        let near_trigger = TRIGGER_WORDS.iter().any(|tw| low_window.contains(tw))
410            || has_standalone_token(&low_window)
411            || has_token_assignment(&low_window);
412
413        // Pure hex tokens (git SHA, checksum digests) are allowlisted only when
414        // they are NOT near a credential trigger.
415        if !near_trigger && is_pure_hex(token) {
416            continue;
417        }
418
419        // Hex API keys (AWS secret access key, Stripe test keys, random hex
420        // tokens) are pure hex yet are real credentials.  The entropy heuristic
421        // cannot catch them — hex alphabet maxes at log2(16) = 4.0 bits/char,
422        // which is always below ENTROPY_THRESHOLD (4.5).  A credential-shaped
423        // hex token (32 / 40 / 64 / 128 chars) near a trigger word is always
424        // flagged.  Credential triggers dominate: adding "sha" or "hash" to
425        // the window does not rescue the token — a caller controlling the prose
426        // could trivially bypass the gate with one extra word.  Safe git SHAs
427        // and content-hash digests do not appear near credential trigger words
428        // and are already allowed via the `!near_trigger && is_pure_hex` path.
429        const HEX_CREDENTIAL_LENGTHS: &[usize] = &[32, 40, 64, 128];
430        if near_trigger && is_pure_hex(token) && HEX_CREDENTIAL_LENGTHS.contains(&token.len()) {
431            return Some(build_match("hex-credential-token", token));
432        }
433
434        let entropy = shannon_entropy(token.as_bytes());
435        if entropy < ENTROPY_THRESHOLD {
436            continue;
437        }
438
439        // High-entropy token in trigger context — flag it.
440        if near_trigger {
441            return Some(build_match("high-entropy-token", token));
442        }
443    }
444    None
445}
446
/// Reports whether `low_window` contains `token` as a word of its own.
///
/// An occurrence counts only when the characters directly before and after it
/// (if any) are neither alphanumeric nor `_`.  Compound identifiers such as
/// `tokenizer`, `token_count`, or `next_token` therefore do not count.
/// The caller is expected to pass already-lowercased text.
fn has_standalone_token(low_window: &str) -> bool {
    let is_word_char = |c: char| c.is_alphanumeric() || c == '_';

    low_window.match_indices("token").any(|(at, word)| {
        let before = &low_window[..at];
        let after = &low_window[at + word.len()..];
        let joined_before = before.chars().next_back().is_some_and(is_word_char);
        let joined_after = after.chars().next().is_some_and(is_word_char);
        !joined_before && !joined_after
    })
}
473
/// Returns `true` when `low_window` contains the assignment form `token=` or
/// `token:` where the `token` identifier has a word boundary BEFORE it.
///
/// Quotes (`"`, `'`, `` ` ``) and ASCII whitespace between `token` and the
/// `=` / `:` are skipped, so quoted JSON keys and spaced assignments match.
/// The check is boundary-aware: compound identifiers like `next_token:` or
/// `pagination_token=` do NOT trigger.
///
/// Examples that return `true`:  `token=<value>`, `token: <value>`,
///   `token = <value>`, `"token": "<value>"` (JSON key-value pairs).
/// Examples that return `false`: `next_token: <value>`,
///   `pagination_token=<value>`, `token_count: <value>`.
///
/// The caller is expected to pass already-lowercased text.
fn has_token_assignment(low_window: &str) -> bool {
    let is_word_char = |c: char| c.is_alphanumeric() || c == '_';

    low_window.match_indices("token").any(|(at, word)| {
        // Require a word boundary BEFORE `token`.
        let joined_before = low_window[..at]
            .chars()
            .next_back()
            .is_some_and(is_word_char);
        // Skip a closing quote and padding, then require `=` or `:`.
        let after = low_window[at + word.len()..]
            .trim_start_matches(|c: char| matches!(c, '"' | '\'' | '`') || c.is_ascii_whitespace());
        !joined_before && after.starts_with(['=', ':'])
    })
}
508
509// ─── Allowlist helpers ───────────────────────────────────────────────────────
510
/// Reports whether `token` consists solely of hexadecimal digits.
///
/// An optional leading `0x` / `0X` is ignored.  The remaining digits must
/// number between 8 and 128 (inclusive) and may be upper or lower case.
/// Typical matches are git SHAs, checksum digests, and hyphen-less UUIDs.
///
/// This is a pure shape test; whether a hex token is allowed or flagged
/// depends on the surrounding trigger context and is decided by the caller.
fn is_pure_hex(token: &str) -> bool {
    let digits = ["0x", "0X"]
        .iter()
        .find_map(|&prefix| token.strip_prefix(prefix))
        .unwrap_or(token);
    (8..=128).contains(&digits.len()) && digits.bytes().all(|b| b.is_ascii_hexdigit())
}
524
/// Reports whether `token` is a `sha<N>-`-prefixed base64 content hash
/// (SRI hash, npm lockfile integrity value).
///
/// All of the following must hold:
/// - the token does not start with a known vendor-token prefix;
/// - it starts with `sha`, one or more ASCII digits, and a `-`;
/// - the body after that prefix carries at most two trailing `=` padding
///   characters;
/// - the unpadded body is 43, 64, or 86–88 bytes long;
/// - every unpadded byte belongs to the standard or URL-safe base64 alphabet.
///
/// Bare base64 without the `sha<N>-` prefix is never accepted: at these
/// lengths it cannot be told apart from a real API token.
fn is_base64_content_hash(token: &str) -> bool {
    // Vendor-issued credentials are never allowlisted, whatever their alphabet.
    const VENDOR_PREFIXES: [&str; 16] = [
        "sk-",
        "rk_live_",
        "fm2_",
        "vercel_",
        "xoxb-",
        "xoxa-",
        "xoxp-",
        "xoxr-",
        "xoxs-",
        "ghp_",
        "gho_",
        "github_pat_",
        "AKIA",
        "ASIA",
        "AGE-SECRET-KEY-",
        "FlyV1",
    ];
    if VENDOR_PREFIXES.iter().any(|&vendor| token.starts_with(vendor)) {
        return false;
    }

    // Split `sha<digits>-<body>` at the first dash.
    let Some(after_sha) = token.strip_prefix("sha") else {
        return false;
    };
    let Some((bits, body)) = after_sha.split_once('-') else {
        return false;
    };
    if bits.is_empty() || !bits.bytes().all(|b| b.is_ascii_digit()) {
        return false;
    }

    // Tolerate up to two `=` of padding.
    let unpadded = body.trim_end_matches('=');
    if body.len() - unpadded.len() > 2 {
        return false;
    }

    let sha_family_len = matches!(unpadded.len(), 43 | 64 | 86..=88);
    sha_family_len
        && unpadded
            .bytes()
            .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'/' | b'-' | b'_'))
}
594
/// Reports whether `s` is a UUID in canonical hyphenated form:
/// `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`.
///
/// The string must be exactly 36 bytes: five groups of ASCII hex digits
/// (8-4-4-4-12, either case) separated by single hyphens.
fn is_uuid_canonical(s: &str) -> bool {
    const GROUP_LENS: [usize; 5] = [8, 4, 4, 4, 12];

    if s.len() != 36 {
        return false;
    }
    let mut groups = s.split('-');
    let groups_ok = GROUP_LENS.iter().all(|&expected| {
        groups
            .next()
            .is_some_and(|group| group.len() == expected && group.bytes().all(|b| b.is_ascii_hexdigit()))
    });
    groups_ok && groups.next().is_none()
}
611
/// Trim wrapping punctuation from both ends of `s`.
///
/// The characters removed are `"`, `'`, `` ` ``, `:`, `=`, `,`, and `;`.
/// Only leading and trailing runs are removed; interior occurrences stay.
fn strip_delimiters(s: &str) -> &str {
    const WRAPPERS: &[char] = &['"', '\'', '`', ':', '=', ',', ';'];
    s.trim_matches(WRAPPERS)
}
616
617// ─── Utilities ───────────────────────────────────────────────────────────────
618
/// Return the leading run of non-whitespace characters of `s`.
///
/// The result is empty when `s` is empty or starts with whitespace, and is
/// all of `s` when it contains no whitespace at all.
fn extract_token(s: &str) -> &str {
    s.find(char::is_whitespace).map_or(s, |end| &s[..end])
}
626
/// Shannon entropy of `bytes`, in bits per byte.
///
/// Computes `H = -∑ p_i · log2(p_i)` over the frequency of each distinct byte
/// value.  An empty slice has entropy `0.0`.
fn shannon_entropy(bytes: &[u8]) -> f64 {
    if bytes.is_empty() {
        return 0.0;
    }

    // Histogram of byte values.
    let mut histogram = [0u32; 256];
    bytes
        .iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);

    let total = bytes.len() as f64;
    histogram
        .iter()
        .copied()
        .filter(|&occurrences| occurrences != 0)
        .map(|occurrences| {
            let p = f64::from(occurrences) / total;
            -p * p.log2()
        })
        .sum()
}
648
649/// Build a `SecretMatch` from a detector name and the candidate string.
650///
651/// The masked excerpt is: first 6 chars + "..." + total length.
652/// Never includes more than 6 chars of the actual value.
653fn build_match(detector: &'static str, candidate: &str) -> SecretMatch {
654    let chars: Vec<char> = candidate.chars().collect();
655    let preview: String = chars.iter().take(6).collect();
656    let masked = format!("{}...{}chars", preview, chars.len());
657    SecretMatch { detector, masked }
658}
659
660// ─── Tests ───────────────────────────────────────────────────────────────────
661
662#[cfg(test)]
663mod tests {
664    use super::*;
665
666    // ── Catch suite ──────────────────────────────────────────────────────────
667
668    #[test]
669    fn blocks_aws_akia() {
670        // FAKE key: prefix is real shape, 16-char suffix invented.
671        let fake = "AKIAFAKEKEY1234567890";
672        assert!(scan(fake).is_some(), "AKIA must be caught");
673        let m = scan(fake).unwrap();
674        assert_eq!(m.detector, "aws-access-key-id");
675        // Masked excerpt must not echo the full key.
676        assert!(
677            !m.masked.contains("FAKEKEY1234567890"),
678            "must not echo the secret: {}",
679            m.masked
680        );
681    }
682
683    #[test]
684    fn blocks_aws_asia() {
685        let fake = "ASIAFAKEKEY00000000000";
686        let m = scan(fake);
687        assert!(m.is_some(), "ASIA must be caught");
688        assert_eq!(m.unwrap().detector, "aws-access-key-id");
689    }
690
691    #[test]
692    fn blocks_github_ghp() {
693        // 36 chars total to pass min_len.
694        let fake = "ghp_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
695        assert!(scan(fake).is_some(), "ghp_ must be caught");
696    }
697
698    #[test]
699    fn blocks_github_gho() {
700        let fake = "gho_BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB";
701        assert!(scan(fake).is_some(), "gho_ must be caught");
702    }
703
704    #[test]
705    fn blocks_github_pat() {
706        let fake = "github_pat_AAAAAABBBBBBCCCCCC";
707        assert!(scan(fake).is_some(), "github_pat_ must be caught");
708    }
709
710    #[test]
711    fn blocks_openai_sk() {
712        let fake = "sk-aaaaaabbbbbbccccccddddddeeeeeeffffgg";
713        assert!(scan(fake).is_some(), "sk- must be caught");
714    }
715
716    #[test]
717    fn blocks_anthropic_sk_ant() {
718        let fake = "sk-ant-api03-AAAAAAAAAAAAAAA";
719        assert!(scan(fake).is_some(), "sk-ant- must be caught");
720        assert_eq!(scan(fake).unwrap().detector, "anthropic-api-key");
721    }
722
723    #[test]
724    fn blocks_stripe_live() {
725        let fake = "sk_live_FAKESTRIPE0000000000000"; // gitleaks:allow
726        assert!(scan(fake).is_some(), "sk_live_ must be caught");
727        assert_eq!(scan(fake).unwrap().detector, "stripe-secret-key");
728    }
729
730    #[test]
731    fn blocks_stripe_restricted() {
732        let fake = "rk_live_FAKESTRIPE0000000000000"; // gitleaks:allow
733        assert!(scan(fake).is_some(), "rk_live_ must be caught");
734        assert_eq!(scan(fake).unwrap().detector, "stripe-restricted-key");
735    }
736
737    #[test]
738    fn blocks_fly_flyv1() {
739        let fake = "FlyV1 FAKEFLYTOKEN000000000000000000";
740        assert!(scan(fake).is_some(), "FlyV1 must be caught");
741        assert_eq!(scan(fake).unwrap().detector, "fly-token");
742    }
743
744    #[test]
745    fn blocks_fly_fm2() {
746        let fake = "fm2_FAKEFLYTOKEN00000000000000000";
747        assert!(scan(fake).is_some(), "fm2_ must be caught");
748        assert_eq!(scan(fake).unwrap().detector, "fly-token");
749    }
750
751    #[test]
752    fn blocks_vercel_token() {
753        let fake = "vercel_FAKETOKEN00000000000000000";
754        assert!(scan(fake).is_some(), "vercel_ must be caught");
755        assert_eq!(scan(fake).unwrap().detector, "vercel-token");
756    }
757
758    #[test]
759    fn blocks_slack_xoxb() {
760        let fake = "xoxb-FAKE-SLACKTOKEN-000000000000000000000000";
761        assert!(scan(fake).is_some(), "xoxb- must be caught");
762        assert_eq!(scan(fake).unwrap().detector, "slack-token");
763    }
764
765    #[test]
766    fn blocks_pem_private_key() {
767        // Split the header so the literal detector-trigger string is not present
768        // verbatim in source — pre-commit's detect-private-key hook would fire.
769        // The gate detects it at runtime because scan() sees the assembled string.
770        let header = ["-----BEGIN RSA", " PRIVATE KEY-----"].concat(); // gitleaks:allow
771        let fake = format!("{}\nMIIEo\u{2026}\n-----END RSA PRIVATE KEY-----", header);
772        assert!(scan(&fake).is_some(), "PEM private key must be caught");
773        assert_eq!(scan(&fake).unwrap().detector, "pem-private-key");
774    }
775
776    #[test]
777    fn blocks_pem_ec_private_key() {
778        let header = ["-----BEGIN EC", " PRIVATE KEY-----"].concat(); // gitleaks:allow
779        let fake = format!("{}\nMHQCAQEE\u{2026}\n-----END EC PRIVATE KEY-----", header);
780        assert!(scan(&fake).is_some(), "EC PEM must be caught");
781    }
782
783    #[test]
784    fn blocks_age_secret_key() {
785        // AGE-SECRET-KEY- followed by 59 base32 chars (Bech32m body).
786        let fake = "AGE-SECRET-KEY-1QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ";
787        assert!(scan(fake).is_some(), "AGE-SECRET-KEY- must be caught");
788        assert_eq!(scan(fake).unwrap().detector, "age-secret-key");
789    }
790
791    #[test]
792    fn blocks_jwt_triple() {
793        // Synthetic JWT structure: header.payload.signature (no real key).
794        let fake = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.FAKE_SIG_XXXXXXXXXXXX"; // gitleaks:allow
795        assert!(scan(fake).is_some(), "JWT triple must be caught");
796        assert_eq!(scan(fake).unwrap().detector, "jwt");
797    }
798
799    #[test]
800    fn blocks_url_userinfo() {
801        let fake = "postgresql://dbuser:S3cr3tP4ss@db.example.com:5432/mydb";
802        assert!(scan(fake).is_some(), "URL userinfo must be caught");
803        assert_eq!(scan(fake).unwrap().detector, "url-userinfo");
804    }
805
806    #[test]
807    fn blocks_high_entropy_near_bearer_word() {
808        // 32 random-looking base64 chars adjacent to the word "bearer".
809        let fake = "Bearer token: Xk9mZ2vQpLrT8nJwYuAeHfBsDcGiONvM"; // gitleaks:allow
810        assert!(
811            scan(fake).is_some(),
812            "high-entropy value near 'bearer' must be caught"
813        );
814        assert_eq!(scan(fake).unwrap().detector, "high-entropy-token");
815    }
816
817    #[test]
818    fn blocks_high_entropy_near_secret_word() {
819        let fake = "secret=Xk9mZ2vQpLrT8nJwYuAeHfBsDcGiONvM"; // gitleaks:allow
820        assert!(
821            scan(fake).is_some(),
822            "high-entropy value near 'secret' must be caught"
823        );
824    }
825
826    #[test]
827    fn error_message_masks_secret() {
828        let fake = "ghp_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
829        let m = scan(fake).unwrap();
830        // Masked form: first 6 chars + "...N chars".
831        // Must NOT contain the full suffix.
832        let masked = &m.masked;
833        assert!(
834            !masked.contains("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"),
835            "mask must not echo the full secret value; got: {masked}"
836        );
837        // Must start with "ghp_AA" (first 6 chars of the token).
838        assert!(
839            masked.starts_with("ghp_AA"),
840            "mask must show first 6 chars; got: {masked}"
841        );
842    }
843
844    // ── False-positive suite ─────────────────────────────────────────────────
845
846    #[test]
847    fn allows_sha256_hex() {
848        // 64-char lowercase hex — typical sha256 digest.
849        let sha = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";
850        assert!(
851            scan(sha).is_none(),
852            "sha256 hex must pass (allowlisted); fired: {:?}",
853            scan(sha)
854        );
855    }
856
857    #[test]
858    fn allows_uuid() {
859        let uuid = "550e8400-e29b-41d4-a716-446655440000";
860        assert!(
861            scan(uuid).is_none(),
862            "UUID must pass; fired: {:?}",
863            scan(uuid)
864        );
865    }
866
867    #[test]
868    fn allows_git_sha() {
869        // 40-char lowercase git SHA.
870        let sha = "d362950a3c9b1a4cb47d97f1623e38f1a1e6bcdf";
871        assert!(
872            scan(sha).is_none(),
873            "git SHA must pass; fired: {:?}",
874            scan(sha)
875        );
876    }
877
878    #[test]
879    fn allows_normal_prose() {
880        let prose =
881            "The FlashAttention paper introduces IO-aware tiling for transformer self-attention.";
882        assert!(scan(prose).is_none(), "normal prose must pass");
883    }
884
885    #[test]
886    fn allows_code_snippet() {
887        let code = r#"fn create_entity(name: &str, kind: &str) -> RuntimeResult<Entity> {
888    self.validate_entity_kind(kind)?;
889    Ok(Entity::new("local", kind, name))
890}"#;
891        assert!(
892            scan(code).is_none(),
893            "code snippet must pass; fired: {:?}",
894            scan(code)
895        );
896    }
897
898    #[test]
899    fn allows_long_url_without_credentials() {
900        let url = "https://docs.example.com/api/v2/entities?kind=concept&limit=100";
901        assert!(scan(url).is_none(), "URL without userinfo must pass");
902    }
903
904    #[test]
905    fn allows_base64_image_stub() {
906        // Realistic short base64 data URI stub — no trigger words, below threshold length.
907        let b64 = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAAC0lEQVQI12NgAAIABQ";
908        assert!(
909            scan(b64).is_none(),
910            "base64 image stub without trigger word must pass; fired: {:?}",
911            scan(b64)
912        );
913    }
914
915    #[test]
916    fn allows_long_plain_url() {
917        let url = "https://api.github.com/repos/ohdearquant/khive/pulls/76/comments?per_page=100";
918        assert!(
919            scan(url).is_none(),
920            "plain URL must pass; fired: {:?}",
921            scan(url)
922        );
923    }
924
925    #[test]
926    fn allows_manifest_content_hash() {
927        // A string like what appears in Cargo.lock or npm lockfiles.
928        let line =
929            "checksum = \"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855\"";
930        assert!(
931            scan(line).is_none(),
932            "manifest content hash line must pass; fired: {:?}",
933            scan(line)
934        );
935    }
936
937    #[test]
938    fn masked_excerpt_format() {
939        let fake = "AKIAFAKEKEY1234567890";
940        let m = scan(fake).unwrap();
941        // Format: first6...Nchars
942        assert!(m.masked.contains("..."), "masked must contain '...'");
943        assert!(m.masked.ends_with("chars"), "masked must end with 'chars'");
944    }
945
946    // ── Gate function ────────────────────────────────────────────────────────
947
948    #[test]
949    fn check_returns_ok_for_safe_content() {
950        assert!(check("A normal memory note about LoRA.").is_ok());
951    }
952
953    #[test]
954    fn check_returns_err_for_secret() {
955        let fake = "AKIAFAKEKEY1234567890";
956        let result = check(fake);
957        assert!(result.is_err(), "check must fail for AKIA key");
958        let err = result.unwrap_err();
959        assert!(
960            matches!(err, RuntimeError::SecretDetected(_)),
961            "error variant must be SecretDetected"
962        );
963    }
964
965    // ── Entropy helpers ──────────────────────────────────────────────────────
966
967    #[test]
968    fn entropy_of_uniform_string_is_zero() {
969        let s = "aaaaaaaaaaaaaaaa";
970        assert!(shannon_entropy(s.as_bytes()) < 0.01);
971    }
972
973    #[test]
974    fn entropy_of_random_bytes_is_high() {
975        // A truly random-looking string should exceed 4.5 bits/char.
976        let s = b"X9kZ2vQpLrT8nJwYuAeHfBsDcGiONvM1"; // 32 mixed base64 chars
977        assert!(shannon_entropy(s) > 4.5, "entropy={}", shannon_entropy(s));
978    }
979
980    #[test]
981    fn allowlist_passes_sha256() {
982        // A plain sha256 hex digest passes via `is_pure_hex` (not `is_allowlisted`
983        // because hex is now context-dependent; this tests the primitive directly).
984        let sha = "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";
985        assert!(is_pure_hex(sha));
986    }
987
988    #[test]
989    fn allowlist_passes_uuid_canonical() {
990        assert!(is_uuid_canonical("550e8400-e29b-41d4-a716-446655440000"));
991    }
992
993    #[test]
994    fn allowlist_does_not_pass_mixed_token() {
995        // A token that starts with letters but mixes in non-hex chars.
996        assert!(!is_pure_hex("sk-aaaaaabbbbbbccccccddddddeeeeeeffffgg"));
997    }
998
999    // ── Structured-field gate helpers ────────────────────────────────────────
1000
1001    #[test]
1002    fn check_json_blocks_secret_in_object_value() {
1003        let props = serde_json::json!({ "api_key": "AKIAFAKEKEY1234567890" });
1004        assert!(
1005            check_json(&props).is_err(),
1006            "secret in properties object value must be blocked"
1007        );
1008    }
1009
1010    #[test]
1011    fn check_json_blocks_secret_in_nested_object() {
1012        let props = serde_json::json!({ "credentials": { "token": "sk-proj-FAKEKEY00000000000000000000000000000000" } }); // gitleaks:allow
1013        assert!(
1014            check_json(&props).is_err(),
1015            "secret in nested properties object must be blocked"
1016        );
1017    }
1018
1019    #[test]
1020    fn check_json_blocks_secret_in_array() {
1021        let props = serde_json::json!(["normal", "AKIAFAKEKEY1234567890"]);
1022        assert!(
1023            check_json(&props).is_err(),
1024            "secret in JSON array must be blocked"
1025        );
1026    }
1027
1028    #[test]
1029    fn check_json_passes_safe_properties() {
1030        let props = serde_json::json!({
1031            "domain": "attention",
1032            "status": "researched",
1033            "year": 2024
1034        });
1035        assert!(
1036            check_json(&props).is_ok(),
1037            "normal properties must pass; fired: {:?}",
1038            check_json(&props).err()
1039        );
1040    }
1041
1042    #[test]
1043    fn check_tags_blocks_credential_tag() {
1044        let tags = vec![
1045            "type:concept".to_string(),
1046            "AKIAFAKEKEY1234567890".to_string(),
1047        ];
1048        assert!(
1049            check_tags(&tags).is_err(),
1050            "credential-shaped tag must be blocked"
1051        );
1052    }
1053
1054    #[test]
1055    fn check_tags_passes_normal_tags() {
1056        let tags = vec!["type:concept".to_string(), "domain:attention".to_string()];
1057        assert!(
1058            check_tags(&tags).is_ok(),
1059            "normal tags must pass; fired: {:?}",
1060            check_tags(&tags).err()
1061        );
1062    }
1063
1064    // ── False-positive: sk-learn and scikit-learn slugs ──────────────────────
1065
1066    #[test]
1067    fn allows_sk_learn_prose() {
1068        // scikit-learn slug used as an entity name or knowledge atom.
1069        let texts = &[
1070            "sk-learn is a Python machine learning library",
1071            "sk-learn-compatible transformer pipeline reference",
1072            "sk-learn scikit-learn estimator interface",
1073        ];
1074        for t in texts {
1075            assert!(
1076                scan(t).is_none(),
1077                "sk-learn prose must pass; fired: {:?} on {:?}",
1078                scan(t),
1079                t
1080            );
1081        }
1082    }
1083
1084    #[test]
1085    fn blocks_openai_sk_proj_not_confused_with_sk_learn() {
1086        // Real OpenAI key shape must still be caught.
1087        let fake = "sk-proj-FAKEKEY00000000000000000000000000000000"; // gitleaks:allow
1088        assert!(
1089            scan(fake).is_some(),
1090            "sk-proj- key must still be caught after sk-learn exemption"
1091        );
1092    }
1093
1094    // ── False-positive: SRI / tokenizer hash metadata ────────────────────────
1095
1096    #[test]
1097    fn allows_sri_hash() {
1098        // SRI hash as used in HTML integrity attributes (sha384, base64-encoded).
1099        // Placed near the word "key" to test the entropy heuristic allowlist.
1100        let line = "integrity key: sha384-oqVuAfXRKap7fdgcCY5uykM6+R9GqQ8K/uxy9rx7HNQlGYl1kPzQho1wx4JwY8wC";
1101        assert!(
1102            scan(line).is_none(),
1103            "SRI hash must pass; fired: {:?}",
1104            scan(line)
1105        );
1106    }
1107
1108    #[test]
1109    fn allows_base64_tokenizer_hash_metadata() {
1110        // Tokenizer metadata containing a base64 hash near technical keywords.
1111        let line = "tokenizer_vocab_hash: Xk9mZ2vQpLrT8nJwYuAeHfBsDcGiONvM"; // gitleaks:allow
1112        assert!(
1113            scan(line).is_none(),
1114            "tokenizer hash metadata must pass; fired: {:?}",
1115            scan(line)
1116        );
1117    }
1118
1119    #[test]
1120    fn allows_npm_lockfile_integrity() {
1121        // npm lockfile integrity line with sha512 base64url hash (86 base64 chars + ==).
1122        // sha512 digest = 64 bytes → base64 = 88 chars (86 unpadded + ==).
1123        let body_86 = "Xk9mZ2vQpLrT8nJwYuAeHfBsDcGiONvM1234567890abcdefghijklmnopqrstuvwxABCDEFGHIJKLMNOPQRST";
1124        assert_eq!(body_86.len(), 86, "test body must be exactly 86 chars");
1125        let line = format!(
1126            "resolved: https://registry.npmjs.org/foo/-/foo-1.0.0.tgz\nintegrity: sha512-{body_86}=="
1127        );
1128        assert!(
1129            scan(&line).is_none(),
1130            "npm lockfile integrity must pass; fired: {:?}",
1131            scan(&line)
1132        );
1133    }
1134
1135    // ── False-positive: tokenizer vs token trigger word ─────────────────────
1136
1137    #[test]
1138    fn allows_tokenizer_vocab_hash_no_block() {
1139        // `tokenizer_vocab_hash` contains the substring "token" but NOT as a
1140        // standalone word (followed by 'i' which is alphanumeric), so the
1141        // standalone-token boundary check must not fire here.
1142        let line = "tokenizer_vocab_hash = Xk9mZ2vQpLrT8nJwYuAeHfBsDcGiONvM"; // gitleaks:allow
1143        assert!(
1144            scan(line).is_none(),
1145            "tokenizer_vocab_hash must pass; 'token' is only standalone-word matched; fired: {:?}",
1146            scan(line)
1147        );
1148    }
1149
1150    // ── True-positives: bare base64 at sha-lengths near trigger words ────────
1151
1152    #[test]
1153    fn blocks_bare_base64url_43chars_near_key() {
1154        // A 43-char base64url token (= sha256 body length) near the word "key".
1155        // Without a sha<N>- prefix this MUST be caught, not allowlisted.
1156        let token_43 = "wJalrXUtnFEMI-K7MDENGbPxRfiCYEXAMPLEKEYX123"; // gitleaks:allow
1157        assert_eq!(token_43.len(), 43, "test token must be exactly 43 chars");
1158        let line = format!("api key {token_43}");
1159        assert!(
1160            scan(&line).is_some(),
1161            "43-char base64url token near 'key' must be caught (no sha-prefix = not a hash); fired: {:?}",
1162            scan(&line)
1163        );
1164    }
1165
1166    #[test]
1167    fn blocks_bare_base64url_64chars_near_secret() {
1168        // A 64-char base64url token (= sha384 body length) near "secret".
1169        // Must be caught without sha<N>- prefix.
1170        let token_64 = "wJalrXUtnFEMI-K7MDENGbPxRfiCYEXAMPLEKEYX123wJalrXUtnFEMI-K7MDENa"; // gitleaks:allow
1171        assert_eq!(token_64.len(), 64, "test token must be exactly 64 chars");
1172        let line = format!("secret: {token_64}");
1173        assert!(
1174            scan(&line).is_some(),
1175            "64-char base64url token near 'secret' must be caught; got: {:?}",
1176            scan(&line)
1177        );
1178    }
1179
1180    #[test]
1181    fn blocks_bare_base64url_86chars_near_auth() {
1182        // An 86-char base64url token (= sha512 body length) near "auth".
1183        // Must be caught without sha<N>- prefix.
1184        let token_86 = "wJalrXUtnFEMI-K7MDENGbPxRfiCYEXAMPLEKEYX123wJalrXUtnFEMI-K7MDENwJalrXUtnFEMI-K7MDENabc"; // gitleaks:allow
1185        assert_eq!(token_86.len(), 86, "test token must be exactly 86 chars");
1186        let line = format!("auth header {token_86}");
1187        assert!(
1188            scan(&line).is_some(),
1189            "86-char base64url token near 'auth' must be caught; got: {:?}",
1190            scan(&line)
1191        );
1192    }
1193
1194    // ── True-positives: standalone `token` trigger ───────────────────────────
1195
1196    #[test]
1197    fn blocks_service_token_opaque_value() {
1198        // "service token <opaque-high-entropy>" — `token` as a standalone word
1199        // with a high-entropy value must be caught.
1200        let opaque = "Xk9mZ2vQpLrT8nJwYuAeHfBsDcGiONvMabcdef"; // gitleaks:allow
1201        assert!(
1202            opaque.len() >= 24,
1203            "opaque must be long enough for entropy check"
1204        );
1205        let line = format!("service token {opaque}");
1206        assert!(
1207            scan(&line).is_some(),
1208            "service token <opaque> must be caught by standalone 'token' check; got: {:?}",
1209            scan(&line)
1210        );
1211    }
1212
1213    #[test]
1214    fn blocks_token_equals_credential() {
1215        // `token=<high-entropy>` (assignment form) must be caught via has_token_assignment.
1216        let opaque = "Xk9mZ2vQpLrT8nJwYuAeHfBsDcGiONvMabcdef"; // gitleaks:allow
1217        let line = format!("token={opaque}");
1218        assert!(
1219            scan(&line).is_some(),
1220            "token=<value> must be caught via token= trigger; got: {:?}",
1221            scan(&line)
1222        );
1223    }
1224
1225    #[test]
1226    fn blocks_token_colon_credential() {
1227        // `token: <high-entropy>` (key-value form) must be caught via has_token_assignment.
1228        let opaque = "Xk9mZ2vQpLrT8nJwYuAeHfBsDcGiONvMabcdef"; // gitleaks:allow
1229        let line = format!("token: {opaque}");
1230        assert!(
1231            scan(&line).is_some(),
1232            "token: <value> must be caught via token: trigger; got: {:?}",
1233            scan(&line)
1234        );
1235    }
1236
1237    #[test]
1238    fn allows_next_token_technical_context() {
1239        // `next_token` is a technical term; the high-entropy value here has low
1240        // entropy anyway, so it must pass.
1241        let line = "next_token: cursor-page-2-abcdef12345678";
1242        assert!(
1243            scan(line).is_none(),
1244            "next_token technical context must not be blocked; fired: {:?}",
1245            scan(line)
1246        );
1247    }
1248
1249    // ── Finding 6: boundary-aware token= / token: — compound identifiers must pass ──
1250
1251    #[test]
1252    fn allows_next_token_high_entropy_cursor() {
1253        // `next_token:` with a realistic high-entropy pagination cursor must NOT be
1254        // blocked.  `next_token` has `_token` suffix — not a standalone assignment form.
1255        let cursor = "Xk9mZ2vQpLrT8nJwYuAeHfBsDcGiONvMabcdef"; // gitleaks:allow
1256        let line = format!("next_token: {cursor}");
1257        assert!(
1258            scan(&line).is_none(),
1259            "next_token with high-entropy cursor must pass (compound identifier); fired: {:?}",
1260            scan(&line)
1261        );
1262    }
1263
1264    #[test]
1265    fn allows_token_count_high_entropy() {
1266        // `token_count:` with a high-entropy value must NOT be blocked.
1267        // `token_count` has `token_` prefix — the word boundary after `token` is `_`,
1268        // which is excluded by has_token_assignment.
1269        let opaque = "Xk9mZ2vQpLrT8nJwYuAeHfBsDcGiONvMabcdef"; // gitleaks:allow
1270        let line = format!("token_count: {opaque}");
1271        assert!(
1272            scan(&line).is_none(),
1273            "token_count with high-entropy value must pass; fired: {:?}",
1274            scan(&line)
1275        );
1276    }
1277
    // ── Finding 5: hex allowlist is not applied when trigger context is present ─
    //
    // Pure hex strings have a theoretical maximum entropy of log2(16) = 4.0 bits/char,
    // which is below the ENTROPY_THRESHOLD of 4.5.  That means pure hex tokens cannot
    // reach the entropy threshold and will never be flagged by the entropy heuristic
    // alone.
    //
    // However, the hex allowlist was previously applied BEFORE the trigger window was
    // computed, meaning a future threshold reduction or edge case could silently
    // skip credential-context hex.  The fix: compute trigger context first; only
    // apply the hex allowlist when NOT near a trigger.  The tests below verify that
    // credential-shaped pure-hex tokens near a trigger are blocked (even when a
    // hash-context word such as "hash" or "sha" is also present), that pure hex
    // near only hash-context words or in a neutral context still passes, and that
    // non-pure-hex high-entropy tokens near triggers are caught.
1291
1292    #[test]
1293    fn hex_near_key_blocked_in_credential_context() {
1294        // A pure-hex 32-char token near "api key" is a credential-shaped hex
1295        // token in trigger context.  Entropy alone cannot flag it (hex max =
1296        // 4.0 < 4.5 threshold), but the explicit hex-credential-token path
1297        // must catch it.
1298        let hex32 = "4f9c2e8a1d3b5c7e9f0a2b4d6e8c0a2b";
1299        assert_eq!(hex32.len(), 32);
1300        let line = format!("api key {hex32}");
1301        assert!(
1302            scan(&line).is_some(),
1303            "32-char pure hex near 'api key' must be blocked; got None"
1304        );
1305    }
1306
1307    #[test]
1308    fn hex_credential_lengths_blocked_near_trigger() {
1309        // Verify all four credential-shaped lengths are caught near a trigger.
1310        let hex40 = "a3f5c2e9d1b8047e63a1f4c2d5b6e8f1a9c3d2e4";
1311        let hex64 = "1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b1c2d3e4f5a6b7c8d9e0f1a2b";
1312        let hex128 = format!("{hex64}{hex64}");
1313        assert_eq!(hex40.len(), 40);
1314        assert_eq!(hex64.len(), 64);
1315        assert_eq!(hex128.len(), 128);
1316
1317        for (label, hex) in &[
1318            ("hex40", hex40),
1319            ("hex64", hex64),
1320            ("hex128", hex128.as_str()),
1321        ] {
1322            let line = format!("secret key: {hex}");
1323            assert!(
1324                scan(&line).is_some(),
1325                "{label} near 'secret key' must be blocked; got None"
1326            );
1327        }
1328    }
1329
1330    #[test]
1331    fn hex_blocked_when_trigger_and_hash_word_coexist() {
1332        // Credential trigger dominates: adding "hash" or "sha" to the window does
1333        // not rescue a pure-hex token when a credential trigger is also present.
1334        // An attacker controlling the prose could otherwise bypass the gate with
1335        // one extra word, so the hash-word exception must NOT apply in trigger context.
1336        let hex32 = "4f9c2e8a1d3b5c7e9f0a2b4d6e8c0a2b";
1337        let key_hash_line = format!("api key hash {hex32}");
1338        let secret_sha_line = format!("secret sha {hex32}");
1339        assert!(
1340            scan(&key_hash_line).is_some(),
1341            "'api key hash <hex32>' must be blocked; got None"
1342        );
1343        assert!(
1344            scan(&secret_sha_line).is_some(),
1345            "'secret sha <hex32>' must be blocked; got None"
1346        );
1347    }
1348
1349    #[test]
1350    fn hex_near_sha_context_word_allowed() {
1351        // A 40-char hex with "sha" or "commit" in the window — but no credential
1352        // trigger — must be allowed (git SHA or content hash in normal prose).
1353        let hex40 = "da39a3ee5e6b4b0d3255bfef95601890afd80709";
1354        let sha_line = format!("sha1: {hex40}");
1355        let commit_line = format!("commit sha {hex40}");
1356        assert!(
1357            scan(&sha_line).is_none(),
1358            "hex40 near 'sha1' context must be allowed; fired: {:?}",
1359            scan(&sha_line)
1360        );
1361        assert!(
1362            scan(&commit_line).is_none(),
1363            "hex40 near 'commit sha' context must be allowed; fired: {:?}",
1364            scan(&commit_line)
1365        );
1366    }
1367
1368    #[test]
1369    fn hex64_near_hash_context_allowed() {
1370        // A 64-char hex near "sha256" or "hash" — with no credential trigger —
1371        // must be allowed (content digest in normal prose).
1372        let hex64 = "1a2b3c4d5e6f7a8b9c0d1e2f3a4b5c6d7e8f9a0b1c2d3e4f5a6b7c8d9e0f1a2b";
1373        let sha_line = format!("sha256: {hex64}");
1374        let hash_line = format!("hash value {hex64}");
1375        assert!(
1376            scan(&sha_line).is_none(),
1377            "hex64 near 'sha256' must be allowed; fired: {:?}",
1378            scan(&sha_line)
1379        );
1380        assert!(
1381            scan(&hash_line).is_none(),
1382            "hex64 near 'hash' must be allowed; fired: {:?}",
1383            scan(&hash_line)
1384        );
1385    }
1386
1387    #[test]
1388    fn blocks_high_entropy_hex_like_token_near_key() {
1389        // A token whose character set exceeds pure hex (contains mixed-case, digits,
1390        // and non-hex chars) that ALSO passes `is_pure_hex = false` AND has high
1391        // entropy AND appears near "key" MUST be caught.  This is the realistic
1392        // real-world case: hex-looking API tokens often mix case and non-hex chars.
1393        // Example: a 32-char mixed-charset token near "api key".
1394        let mixed = "Xk9mZ2vQpLrT8nJwYuAeHfBsDcGiONvM"; // gitleaks:allow — not pure hex
1395        assert!(!is_pure_hex(mixed), "test token must not be pure hex");
1396        let line = format!("api key {mixed}");
1397        assert!(
1398            scan(&line).is_some(),
1399            "mixed-charset high-entropy token near 'api key' must be caught; got: {:?}",
1400            scan(&line)
1401        );
1402    }
1403
1404    #[test]
1405    fn allows_hex40_without_trigger() {
1406        // 40-char hex string in a neutral context (no trigger word) must still pass —
1407        // it's likely a git commit SHA or content hash.
1408        let hex40 = "da39a3ee5e6b4b0d3255bfef95601890afd80709";
1409        let line = format!("commit: {hex40}");
1410        assert!(
1411            scan(&line).is_none(),
1412            "40-char hex without trigger word must pass; fired: {:?}",
1413            scan(&line)
1414        );
1415    }
1416
1417    // ── Finding 4: check_json scans object keys ───────────────────────────────
1418
1419    #[test]
1420    fn check_json_blocks_secret_in_object_key() {
1421        // A credential used as a JSON object key (not a value) must be caught.
1422        let props = serde_json::json!({ "ghp_FakeGitHubToken0000000000000000000": "redacted" }); // gitleaks:allow
1423        assert!(
1424            check_json(&props).is_err(),
1425            "credential as JSON object key must be blocked"
1426        );
1427    }
1428
1429    #[test]
1430    fn check_json_blocks_nested_secret_key() {
1431        // Nested credential key must be caught.
1432        let props = serde_json::json!({
1433            "metadata": {
1434                "AKIAFAKEKEY000000000": "value" // gitleaks:allow
1435            }
1436        });
1437        assert!(
1438            check_json(&props).is_err(),
1439            "nested credential as JSON object key must be blocked"
1440        );
1441    }
1442
1443    // ── PEM masking format ───────────────────────────────────────────────────
1444
1445    #[test]
1446    fn pem_masked_excerpt_reflects_block_length_not_rest_of_string() {
1447        let header = ["-----BEGIN RSA", " PRIVATE KEY-----"].concat(); // gitleaks:allow
1448        let fake = format!(
1449            "{}\nMIIEo\u{2026}\n-----END RSA PRIVATE KEY-----\nsome trailing text that is very long",
1450            header
1451        );
1452        let m = scan(&fake).unwrap();
1453        assert_eq!(m.detector, "pem-private-key");
1454        // The masked length should reflect only the key block, not the whole string.
1455        // "some trailing text that is very long" is ~37 chars; total string is much longer.
1456        // The block ends after "-----END RSA PRIVATE KEY-----\n".
1457        // We just verify it is shorter than the full string length.
1458        let full_len = fake.chars().count();
1459        let reported_len: usize = m
1460            .masked
1461            .trim_end_matches("chars")
1462            .rsplit("...")
1463            .next()
1464            .and_then(|s| s.parse().ok())
1465            .unwrap_or(full_len + 1);
1466        assert!(
1467            reported_len < full_len,
1468            "masked length ({reported_len}) should be less than full string length ({full_len})"
1469        );
1470    }
1471}
khive_runtime/secret_gate.rs

khive_runtime/
secret_gate.rs