keyhog-scanner 0.5.39

keyhog-scanner: high-performance SIMD-accelerated secret detection engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
//! Public suppression entry points. The scanner calls one of these entry
//! points per finding; they apply the path / shape pre-checks unique to each
//! call site, then delegate the rest to [`super::decision::should_suppress_inner`].

use super::decision::should_suppress_inner;
use super::path_filter::{looks_like_secret_scanner_source, looks_like_vendored_minified_path};
use super::shape::{
    contains_uuid_v4_substring, looks_like_credential_colliding_punctuation,
    looks_like_email_address, looks_like_pure_identifier, looks_like_regex_literal_tail,
    looks_like_scheme_prefixed_uri, looks_like_syntactic_punctuation_marker,
    looks_like_url_or_path_segment, looks_like_word_separated_identifier,
};
use crate::context;

/// Check if a credential should be suppressed (e.g., if it is a known example token).
///
/// Convenience wrapper: forwards `credential`, `path` and `context` unchanged
/// to [`should_suppress_known_example_credential_with_source`] with
/// `source_type` set to `None`, i.e. without any chunk-origin information.
///
/// Returns `true` when the finding should be dropped.
pub fn should_suppress_known_example_credential(
    credential: &str,
    path: Option<&str>,
    context: context::CodeContext,
) -> bool {
    should_suppress_known_example_credential_with_source(credential, path, context, None)
}

/// Variant of [`should_suppress_known_example_credential`] that also takes the
/// chunk's `source_type`. When the credential arrived through an
/// **adversarial-evasion decoder** (reverse, Caesar/ROT-N), the EXAMPLE-token
/// suppression is skipped - legitimate test fixtures don't typically reverse
/// or rotate their EXAMPLE markers; only attackers building evasions do, so
/// the marker becomes evidence FOR a real leak rather than against it.
///
/// Other decoders (base64, hex, URL) decode legitimate transport encodings
/// where EXAMPLE-suppression remains appropriate, so we don't blanket-bypass
/// the rule on every decoder origin.
///
/// This function performs no checks of its own: it delegates straight to
/// [`should_suppress_inner`] with both boolean flags `false` and no entropy
/// value (`None`), so the source-type handling described above lives in that
/// helper.
///
/// Returns `true` when the finding should be dropped.
pub fn should_suppress_known_example_credential_with_source(
    credential: &str,
    path: Option<&str>,
    context: context::CodeContext,
    source_type: Option<&str>,
) -> bool {
    should_suppress_inner(credential, path, context, source_type, false, false, None)
}

/// Entropy-aware variant for high-entropy generic/entropy fallbacks.
///
/// Identical to [`should_suppress_known_example_credential_with_source`]
/// except that the caller-supplied `entropy` is passed on to
/// [`should_suppress_inner`] as `Some(entropy)` instead of `None`. Both
/// boolean flags are still `false`.
///
/// Returns `true` when the finding should be dropped.
pub(crate) fn should_suppress_known_example_credential_with_source_and_entropy(
    credential: &str,
    path: Option<&str>,
    context: context::CodeContext,
    source_type: Option<&str>,
    entropy: f64,
) -> bool {
    should_suppress_inner(
        credential,
        path,
        context,
        source_type,
        false,
        false,
        Some(entropy),
    )
}

/// Variant for named-detector findings that have already matched a
/// service-specific anchor (e.g. `ALGOLIA_ADMIN_KEY=<32hex>`). For such
/// findings the shape-based gates (pure-hash-digest, UUID, b64-blob,
/// dashed-serial, hex-uniformity) are bypassed because the regex anchor IS
/// the positive evidence - a 32-hex value after `ALGOLIA_ADMIN_KEY=` is an
/// Algolia key, NOT an MD5. Use ONLY from detector paths whose regex requires
/// a service-keyword anchor in the alternation list.
///
/// This is the id-only wrapper: it forwards every argument to
/// [`should_suppress_named_detector_finding_weak`] with `weak_anchor` fixed
/// to `false`, so the only detectors treated as unanchored here are those
/// whose `detector_id` carries a `generic-` / `entropy-` prefix.
///
/// Returns `true` when the finding should be dropped.
pub fn should_suppress_named_detector_finding(
    credential: &str,
    path: Option<&str>,
    context: context::CodeContext,
    source_type: Option<&str>,
    detector_id: &str,
) -> bool {
    should_suppress_named_detector_finding_weak(
        credential,
        path,
        context,
        source_type,
        detector_id,
        false,
    )
}

/// Weak-anchor-aware variant of [`should_suppress_named_detector_finding`].
///
/// `weak_anchor` is the structural classification produced by
/// [`detector_weak_anchor`] at the scan call site (which has the full
/// [`keyhog_core::DetectorSpec`]). When `true`, the detector relies on a
/// generic keyword anchor with a broad / hash-shaped capture, so the
/// shape-suppression gates that protect the `generic-*` / `entropy-*`
/// fallbacks stay engaged instead of being bypassed. The id-only public
/// wrapper above passes `false` for callers that have not computed the
/// structural classification.
///
/// The checks run in the order written and the first one that matches wins:
/// it records a telemetry event (stage `"pipeline"`, with a reason label
/// naming the check) and returns `true`. If none of the local pre-checks
/// match, the decision is delegated to [`should_suppress_inner`].
///
/// Returns `true` when the finding should be dropped.
pub fn should_suppress_named_detector_finding_weak(
    credential: &str,
    path: Option<&str>,
    context: context::CodeContext,
    source_type: Option<&str>,
    detector_id: &str,
    weak_anchor: bool,
) -> bool {
    // Shape filters split into two tiers based on whether the shape
    // can legitimately appear as the body of a real service-anchored
    // credential.
    //
    // **Tier A - applies to ALL detectors.** Among the credential-shape
    // filters, the syntactic-punctuation-marker check stays universal -
    // `--api-secret`, `&password`, `Password:` are grammar / syntax
    // markers, never the body of a real credential regardless of which
    // detector matched. (The email, vendored-path, binary-strings,
    // scanner-source, raw-base64-file and regex-literal-tail checks
    // further down are likewise not guarded by `apply_tier_b` and so
    // also run for every detector.)
    //
    // **Tier B - generic-* / entropy-* only.** These shapes CAN appear
    // as legitimate credential bodies when paired with a service-
    // specific regex anchor. The anchor is positive evidence that the
    // value is a credential, so the shape filter would be wrong to drop
    // it. (Examples the contract corpus enforces:
    //   * `powerbi-credentials` - body IS a UUID
    //   * `mongodb-atlas-credentials` - body IS `mongodb://...` URI
    //   * `cockroachdb-api-key` - body has underscore-separated words
    //   * `avalanche-api-credentials` - body IS an RPC URL
    //   * `aws-secret-access-key` - body has `/+=` URL-segment chars
    // These all DROPPED when the Tier-B filters fired on named
    // detectors. The generic-* / entropy-* fallbacks have no anchor -
    // there the shape filter IS the only positive-evidence gate, so
    // it must stay.)
    //
    // The previous flow applied Tier B universally and dropped 400+
    // contract evasions. See task #41 + the 2026-05-27 audit.
    //
    // `apply_tier_b` is true for `generic-` / `entropy-` prefixed ids and
    // for any detector the caller flagged as `weak_anchor`.
    let apply_tier_b = is_generic_or_entropy(detector_id, weak_anchor);

    // Tier B: value looks like a plain identifier (telemetry label:
    // `pure_identifier_no_digit`).
    if apply_tier_b && looks_like_pure_identifier(credential) {
        crate::telemetry::record_example_suppression(
            "pipeline",
            path,
            credential,
            "pure_identifier_no_digit",
        );
        return true;
    }
    // Tier B: value looks like a word-separated identifier.
    if apply_tier_b && looks_like_word_separated_identifier(credential) {
        crate::telemetry::record_example_suppression(
            "pipeline",
            path,
            credential,
            "word_separated_identifier",
        );
        return true;
    }
    // Tier B: value looks like a scheme-prefixed URI.
    if apply_tier_b && looks_like_scheme_prefixed_uri(credential) {
        crate::telemetry::record_example_suppression(
            "pipeline",
            path,
            credential,
            "scheme_prefixed_uri",
        );
        return true;
    }
    // Tier A: pure syntactic markers (`--flag`, `&ptr`, `@attr`, `$var`,
    // `Label:`) are never a credential body - suppress for every detector.
    if looks_like_syntactic_punctuation_marker(credential) {
        crate::telemetry::record_example_suppression(
            "pipeline",
            path,
            credential,
            "syntactic_punctuation_marker",
        );
        return true;
    }
    // Tier B: `/`-led base64, `!`-led / `!`-trailed secrets look decorated but
    // are valid credential bodies. Only an FP signal for unanchored generic/
    // entropy matches; a named service-anchored detector has already proven
    // these bytes are the credential, so DON'T suppress there.
    if apply_tier_b && looks_like_credential_colliding_punctuation(credential) {
        crate::telemetry::record_example_suppression(
            "pipeline",
            path,
            credential,
            "credential_colliding_punctuation",
        );
        return true;
    }
    // Tier B: value looks like a URL or filesystem path segment.
    if apply_tier_b && looks_like_url_or_path_segment(credential) {
        crate::telemetry::record_example_suppression(
            "pipeline",
            path,
            credential,
            "url_or_path_segment",
        );
        return true;
    }
    // Captured value contains a UUID v4 / RFC-4122 substring anywhere.
    // Tier B because many real credentials are UUIDs (powerbi
    // client_id, opsgenie heartbeat, docusign integration key,
    // launchdarkly sdk-key, etc.) - only suppress in generic/entropy
    // paths where there's no service anchor.
    if apply_tier_b && contains_uuid_v4_substring(credential) {
        crate::telemetry::record_example_suppression(
            "pipeline",
            path,
            credential,
            "contains_uuid_v4",
        );
        return true;
    }
    // Email-address shape: `noreply@gogs.localhost` (gogs golden test
    // ini), `bob.norman@mail.example.com` (shopify test response).
    // Email addresses are public identifiers, not credentials.
    // Not guarded by `apply_tier_b`, so this runs for every detector.
    if looks_like_email_address(credential) {
        crate::telemetry::record_example_suppression("pipeline", path, credential, "email_address");
        return true;
    }
    // Vendored 3rd-party minified bundle path: applies to ALL detectors,
    // not just generic-*. A "secret-like" sequence in a minified
    // codemirror/pdfjs/jquery/etc. bundle is never a real leak.
    if looks_like_vendored_minified_path(path) {
        crate::telemetry::record_example_suppression(
            "pipeline",
            path,
            credential,
            "vendored_minified_path",
        );
        return true;
    }
    // Native-binary string extraction (`filesystem:binary-strings`,
    // `filesystem/archive-binary`): the file is an ELF / Mach-O / PE /
    // wasm / archived binary whose printable strings were extracted as
    // a fallback. Short-prefix detectors (openai `sk-`, stabilityai
    // `sk-`, helicone `sk-`/`pk-`/`eu-`, clickup `pk_`, AKIA / ASIA,
    // K00M, AIza, dn_, …) generate noise on random compiled-code byte
    // sequences that happen to start with the prefix. A real credential
    // embedded in a native binary is best caught via the optional
    // `binary` feature (Ghidra-based extraction with context), not via
    // brute-force strings. Skip every named-detector finding here so
    // we don't ship FPs from compiled apps' rodata.
    // The match is a substring test, so any `source_type` containing
    // either marker is suppressed.
    if source_type.is_some_and(|s| s.contains("binary-strings") || s.contains("archive-binary")) {
        crate::telemetry::record_example_suppression(
            "pipeline",
            path,
            credential,
            "native_binary_strings",
        );
        return true;
    }
    // The file at `path` is itself a secret scanner - every detector
    // routinely matches its own regex definitions inside the source.
    if looks_like_secret_scanner_source(path) {
        crate::telemetry::record_example_suppression(
            "pipeline",
            path,
            credential,
            "secret_scanner_source",
        );
        return true;
    }
    // Files explicitly marked as base64 (`.b64`, `.base64`, basename
    // starting with `base64_` / containing `base64_string`, or basename
    // exactly `base64.txt`) hold base64-encoded blobs - usually images or
    // binaries that the operator wants the base64 decoder to handle. Raw
    // text-mode hits inside the base64 stream (AIza, sk-, ASIA, etc.) are
    // alphabet coincidences, not credentials. The base64-decoder pass
    // produces a separate `filesystem/base64` chunk with the decoded
    // content; that chunk hits `has_binary_magic` if it's image/binary,
    // otherwise it's scanned normally.
    //
    // The suppression only fires when `source_type` is exactly
    // `"filesystem"`, i.e. for the raw chunk and not for decoded chunks.
    if path.is_some_and(|p| {
        // Case-insensitive checks over raw bytes - avoids the per-match
        // `p.to_ascii_lowercase()` allocation. Endswith checks are also
        // case-insensitive so `.B64` / `.BASE64` extensions still suppress.
        let bytes = p.as_bytes();
        if crate::ascii_ci::ends_with_ignore_ascii_case(bytes, b".b64")
            || crate::ascii_ci::ends_with_ignore_ascii_case(bytes, b".base64")
        {
            return true;
        }
        // Both `/` and `\` so Windows paths (`C:\foo\base64_x.txt`)
        // collapse to the same basename. Same rationale as the
        // fallback_entropy path-gate sibling.
        // With no separator present the whole path is the basename.
        let basename = bytes
            .iter()
            .rposition(|&b| b == b'/' || b == b'\\')
            .map(|i| &bytes[i + 1..])
            .unwrap_or(bytes);
        // `.get(..7)` yields `None` for basenames shorter than the
        // 7-byte `base64_` prefix, so short names cannot panic here.
        basename
            .get(..7)
            .is_some_and(|p| p.eq_ignore_ascii_case(b"base64_"))
            || crate::ascii_ci::ci_find(basename, b"base64_string")
            || basename.eq_ignore_ascii_case(b"base64.txt")
    }) && source_type.is_some_and(|s| s == "filesystem")
    {
        crate::telemetry::record_example_suppression(
            "pipeline",
            path,
            credential,
            "raw_base64_file",
        );
        return true;
    }
    // Regex-literal tail: applies to ALL detectors. A capture ending
    // in `)/g`, `)/g,`, `]+`, `})\\b`, etc. is a JS/Go/Python regex
    // pattern definition (often in another secret-scanner's own
    // source code), not a credential. claude-code's Feedback.tsx
    // has 1 `hot-aws_key` finding on its own AWS regex definition
    // `/AKIA[A-Z0-9]{16,17}/g,`.
    if looks_like_regex_literal_tail(credential) {
        crate::telemetry::record_example_suppression(
            "pipeline",
            path,
            credential,
            "regex_literal_tail",
        );
        return true;
    }
    // Generic detectors (generic-secret, generic-private-key, entropy-*)
    // never use this bypass - their anchor is keyword-class, not
    // service-specific, and shape gates are load-bearing for them.
    // Weakly anchored named detectors (e.g. datadog-api-key) also do not
    // bypass shape gates to prevent false positive traps from triggering.
    //
    // The first three conditions are the negation of `apply_tier_b`; the
    // extra `private-key` exclusion means that detector never bypasses the
    // shape gates even though it is not subject to the Tier-B pre-checks
    // above (unless flagged `weak_anchor`).
    let bypass_shape_gates = !detector_id.starts_with("generic-")
        && !detector_id.starts_with("entropy-")
        && !weak_anchor
        && detector_id != "private-key";
    // No entropy value is available on this path, hence the trailing `None`.
    should_suppress_inner(
        credential,
        path,
        context,
        source_type,
        false,
        bypass_shape_gates,
        None,
    )
}

/// Reports whether the firing detector lacks a service-specific anchor.
///
/// That is the case for the `generic-*` and `entropy-*` fallback families
/// (matched by id prefix) and for any named detector the caller has already
/// classified as weakly anchored (`weak_anchor == true`).
/// [`should_suppress_named_detector_finding_weak`] uses the answer to decide
/// whether the Tier-B shape filters apply; strongly anchored detectors carry
/// positive evidence in their regex that those filters would otherwise
/// destroy.
fn is_generic_or_entropy(detector_id: &str, weak_anchor: bool) -> bool {
    // Id prefixes of the unanchored fallback detector families.
    const UNANCHORED_PREFIXES: [&str; 2] = ["generic-", "entropy-"];

    if weak_anchor {
        return true;
    }
    UNANCHORED_PREFIXES
        .iter()
        .any(|prefix| detector_id.starts_with(*prefix))
}

/// Detectors that are weakly anchored but NOT caught by the structural
/// broad-identifier rule in [`detector_weak_anchor`], because their
/// capture is pure-hex (`[a-f0-9]{32}` / `{40}`) - structurally identical
/// to a legitimate hex API key such as `algolia-admin-api-key`, so shape
/// alone cannot tell them apart - or a high-minimum broad identifier
/// (`{20,}` / `{32}`). These were measured FP-prone on the SecretBench
/// mirror corpus and so remain an explicit, corpus-derived data set rather
/// than a structural derivation.
///
/// Entries are exact detector ids. [`detector_weak_anchor`] consults this
/// list with a plain `contains` lookup, so an id must match byte-for-byte
/// to be treated as weakly anchored.
const RESIDUAL_WEAK_ANCHORED: &[&str] = &[
    "aerisweather-api-credentials",
    "base-api-credentials",
    "flickr-api-key",
    "census-api-key",
    "workato-api-credentials",
    "adobe-api-key",
    "alchemy-api-key",
    "azure-openai-api-key",
    "datadog-api-key",
    "etherscan-api-key",
    "spotify-client-credentials",
    "bamboohr-api-key",
    "calendly-api-key",
    "crowdin-api-token",
    "github-oauth-secret",
    "sonarcloud-token",
    "activecampaign-api-key",
    "chef-automate-token",
    "foundation-api-key",
    "getresponse-api-key",
    "rudder-api-token",
];

/// Decide whether a named detector is *weakly anchored*: it hangs off a
/// generic keyword anchor (`api_key=`, `token=`, …) and its capture can
/// collide with non-secrets, so the shape-suppression gates have to stay
/// engaged (treat it like a `generic-*` fallback, not like a
/// service-fingerprinted detector).
///
/// The answer is derived at the scan call site from the detector's own
/// regex shape, so every present and future detector with that shape is
/// covered without a hand-maintained id allowlist drifting out of sync
/// with the detector corpus. The broad-identifier class (Category C of
/// `FP_AUDIT_REPORT.md`: a `[a-zA-Z0-9_-]`-style capture with a small
/// minimum length that matches any short identifier) is detected
/// structurally; the pure-hex class, which is shape-indistinguishable from
/// real hex keys, is looked up in [`RESIDUAL_WEAK_ANCHORED`].
///
/// Always `false` for:
/// * ids starting with `generic-` or `entropy-`, and the id `private-key`;
/// * any spec whose `min_confidence` is set.
///
/// Otherwise `true` when the id is listed in [`RESIDUAL_WEAK_ANCHORED`] or
/// when at least one of the spec's patterns has a broad identifier capture.
pub fn detector_weak_anchor(spec: &keyhog_core::DetectorSpec) -> bool {
    let detector_id = spec.id.as_str();

    // These families are never classified here; a spec that sets
    // `min_confidence` is also excluded.
    let is_excluded_family = detector_id.starts_with("generic-")
        || detector_id.starts_with("entropy-")
        || detector_id == "private-key";
    if is_excluded_family || spec.min_confidence.is_some() {
        return false;
    }

    // Corpus-derived ids first, then the structural regex-shape rule.
    if RESIDUAL_WEAK_ANCHORED.contains(&detector_id) {
        return true;
    }
    spec.patterns
        .iter()
        .any(|pattern| has_broad_identifier_capture(&pattern.regex))
}

/// Reports whether `regex` has a capture group of the exact form
/// `([class]<quant>)` where the class is a full-alphabet identifier class
/// (`[a-zA-Z0-9_-]` and close variants, NOT hex-only `[a-f0-9]`) and the
/// quantifier's minimum repeat is 0 or 1 (`+`, `*`, `{0,..}`, `{1,..}`,
/// `{1}`).
///
/// This is the broad-identifier false-positive shape from Category C of
/// `FP_AUDIT_REPORT.md`: with a minimum length of one the capture matches
/// ANY short identifier (function name, variable, kwarg default) that
/// follows the detector's keyword anchor. Higher minimums (e.g. `{8,}`,
/// `{16}`) describe real fixed-shape keys and are deliberately NOT flagged.
///
/// The scan is purely textual: it looks for the literal `([`, takes
/// everything up to the next `]` as the class body, and resumes right
/// after that `]` when the candidate does not qualify.
fn has_broad_identifier_capture(regex: &str) -> bool {
    let mut remaining = regex;
    loop {
        // Next place where a group opens directly onto a character class.
        let Some(group_at) = remaining.find("([") else {
            return false;
        };
        let after_bracket = &remaining[group_at + 2..];
        // An unterminated class ends the scan.
        let Some(class_len) = after_bracket.find(']') else {
            return false;
        };
        let class_body = &after_bracket[..class_len];
        let after_class = &after_bracket[class_len + 1..];

        let tiny_minimum = group_capture_min_len(after_class).is_some_and(|min_len| min_len <= 1);
        if tiny_minimum && is_full_alpha_identifier_class(class_body) {
            return true;
        }
        remaining = after_class;
    }
}

/// Reads the quantifier at the start of `after` - the text immediately
/// following a character class's closing `]` - and returns its minimum
/// repeat count, but only when the capture group's `)` comes directly
/// after the quantifier. In other words the result is `Some` only for a
/// group that is exactly `([class]<quant>)`.
///
/// * `+)` yields `Some(1)` and `*)` yields `Some(0)`.
/// * `{m})`, `{m,})` and `{m,n})` yield `Some(m)` when `m` parses as
///   `usize`.
/// * Everything else - no quantifier, `?`, lazy forms such as `+?)`, a
///   missing `}`, or any text between the quantifier and `)` - yields
///   `None`.
fn group_capture_min_len(after: &str) -> Option<usize> {
    if after.starts_with("+)") {
        return Some(1);
    }
    if after.starts_with("*)") {
        return Some(0);
    }

    // Counted repetition: `{min}`, `{min,}` or `{min,max}`.
    let counted = after.strip_prefix('{')?;
    let (bounds, tail) = counted.split_once('}')?;
    if !tail.starts_with(')') {
        return None;
    }
    // The minimum is whatever precedes the first comma (or the whole
    // bounds text when there is no comma).
    let minimum = bounds.split(',').next()?;
    minimum.parse::<usize>().ok()
}

/// Checks a regex character-class body (the text between `[` and `]`,
/// brackets excluded) against a fixed identifier vocabulary.
///
/// Returns `true` only when `body` is a concatenation of the tokens `a-z`,
/// `A-Z`, `0-9`, `\w`, `\d`, `_` and `-` AND at least one of them is a full
/// alphabetic token (`a-z`, `A-Z` or `\w`). Any other text - including the
/// hex range `a-f`, a leading `^`, or `.` - makes the result `false`, as
/// does an empty body.
///
/// NOTE(review): an escaped hyphen (`\-`) is not in the vocabulary, so a
/// class written as `[a-zA-Z0-9_\-]` is rejected here - confirm that is
/// intended.
fn is_full_alpha_identifier_class(body: &str) -> bool {
    // (token, counts as a full alphabetic range). No token is a prefix of
    // another, so at most one can match at any position.
    const VOCABULARY: &[(&str, bool)] = &[
        ("a-z", true),
        ("A-Z", true),
        ("0-9", false),
        ("\\w", true),
        ("\\d", false),
        ("_", false),
        ("-", false),
    ];

    let mut saw_alpha = false;
    let mut cursor = body;
    'scan: while !cursor.is_empty() {
        for &(token, is_alpha) in VOCABULARY.iter() {
            if let Some(tail) = cursor.strip_prefix(token) {
                saw_alpha |= is_alpha;
                cursor = tail;
                continue 'scan;
            }
        }
        // Nothing in the vocabulary matches here: not an identifier class.
        return false;
    }
    saw_alpha
}