Skip to main content

keyhog_scanner/confidence/
prefixes.rs

/// Well-known literal prefixes that service credentials start with.
///
/// Every consumer reads the prefix set from this one table. There are two:
///
/// 1. [`known_prefix_confidence_floor`] in this module, which gives any
///    credential beginning with one of these entries a 0.8 confidence floor.
/// 2. `context::inference::{is_sequential_placeholder, is_hex_sequential_placeholder}`,
///    which remove the prefix before running sequence detection. That way a
///    placeholder such as `ghp_aaaaaaaaaa` is judged by the all-same-char
///    suppression on its BODY rather than on the prefix.
///
/// History: before 2026-05-24 the list existed as three copies
/// (`confidence/prefixes.rs` plus two in `context/inference.rs`) that had
/// already diverged. This copy lacked `glcbt-`, `glrt-`, `xoxs-`, `vercel_`,
/// `sbp_`, `0x`, `rk_test_` and `sk-`; the inference copies lacked
/// `PRIVATE KEY`, `-----BEGIN` and `TESTKEY_`. They were merged into this
/// table (kimi-dedup audit rows #12-13).
///
/// NOTE(review): the specific `sk-proj-` / `sk-ant-` entries sit before the
/// generic `sk-`. Presumably a consumer that strips the first matching prefix
/// relies on that order - confirm before re-sorting.
pub const KNOWN_PREFIXES: &[&str] = &[
    // --- GitHub token families ---
    "ghp_",
    "gho_",
    "ghu_",
    "ghs_",
    "ghr_",
    "github_pat_",
    // --- Stripe: live and test variants of the sk / pk / rk key families ---
    "sk_live_",
    "sk_test_",
    "pk_live_",
    "pk_test_",
    "rk_live_",
    "rk_test_",
    // --- AWS access key IDs ---
    "AKIA",
    "ASIA",
    // --- Slack token families ---
    "xoxb-",
    "xoxp-",
    "xoxa-",
    "xoxr-",
    "xoxs-",
    // --- OpenAI, Anthropic, and the generic `sk-` shape ---
    "sk-proj-",
    "sk-ant-",
    "sk-",
    // --- Google API keys ---
    "AIza",
    // --- SendGrid ---
    "SG.",
    // --- HuggingFace ---
    "hf_",
    // --- npm ---
    "npm_",
    // --- PyPI ---
    "pypi-",
    // --- GitLab token variants ---
    "glpat-",
    "glcbt-",
    "glrt-",
    // --- DigitalOcean ---
    "dop_v1_",
    // --- JWT shape: base64url of a header like `{"alg":...}` ---
    "eyJ",
    // --- Vercel ---
    "vercel_",
    // --- Supabase project ---
    "sbp_",
    // --- `0x`-prefixed hex: Ethereum-style addresses plus a handful of API
    //     keys that are issued as 0x<hex> ---
    "0x",
    // --- Bare `PRIVATE KEY` keyword used as a credential. The upstream
    //     detector has already gated on the `PRIVATE KEY` substring, so the
    //     floor lifts captured bodies only, never arbitrary PEM blocks. ---
    "PRIVATE KEY",
    // --- PEM framing: blocks captured by the `private-key` detector open
    //     with `-----BEGIN` (e.g. `-----BEGIN RSA-PRIVATE-KEY-----`) ---
    "-----BEGIN",
    // --- Marker for test fixtures, used by the bundled suppression list ---
    "TESTKEY_",
];
81
82/// Return a minimum confidence floor for credentials with well-known literal prefixes.
83///
84/// Credentials carrying a placeholder word (`EXAMPLE`, `PLACEHOLDER`, `DUMMY`,
85/// `FAKE`, `SAMPLE`, `CHANGEME`) do NOT get the floor. A `ghp_EXAMPLE_…`
86/// or `sk_live_PLACEHOLDER_…` is a doc sample, not a credential - the
87/// placeholder penalty in `apply_post_ml_penalties` had already slammed
88/// these to ~0.05, but the unconditional `final_score.max(0.8)` in
89/// `scan_postprocess` then lifted them straight back. Mirror corpus
90/// 2026-05-29: 154 docs-example FPs across the GitHub PAT, AWS access
91/// key, Slack bot token, and Stripe secret key prefix families all
92/// surfaced through this exact path; this single guard kills them.
93///
94/// The same lift-back defeated the degenerate-repeat penalty: a known-prefix
95/// placeholder like `AKIAXXXXXXXXXXXXXXXX` (16-char `X` run) was crushed to
96/// ~0.08 by `apply_post_ml_penalties` and then floored back to 0.8 here. The
97/// `is_degenerate_repeat` skip (CredData dogfood 2026-06-03) closes that hole
98/// the same way - a 10+ identical-char run is never a real key body.
99#[must_use]
100pub fn known_prefix_confidence_floor(credential: &str) -> Option<f64> {
101    if super::penalties::contains_placeholder_word(credential)
102        || crate::decode_structure::decoded_contains_placeholder(credential)
103        || super::penalties::is_degenerate_repeat(credential)
104    {
105        return None;
106    }
107    if KNOWN_PREFIXES
108        .iter()
109        .any(|prefix| credential.starts_with(prefix))
110    {
111        Some(0.8)
112    } else {
113        None
114    }
115}