keyhog_scanner/confidence/prefixes.rs
/// Literal prefixes that identify credentials issued by well-known services.
///
/// This table is the one authoritative copy of the prefix set. It is read by
/// two consumers:
///
/// 1. [`known_prefix_confidence_floor`] in this module, which raises any
///    credential beginning with one of these prefixes to a 0.8 confidence
///    floor.
/// 2. `context::inference::{is_sequential_placeholder, is_hex_sequential_placeholder}`,
///    which remove the prefix before running sequence detection, so that a
///    placeholder such as `ghp_aaaaaaaaaa` is judged (and suppressed as
///    all-same-char) on its BODY rather than on its prefix.
///
/// History: before 2026-05-24 three separate copies of this list existed
/// (one in `confidence/prefixes.rs`, two in `context/inference.rs`) and they
/// had drifted apart. `KNOWN_PREFIXES` lacked `glcbt-`, `glrt-`, `xoxs-`,
/// `vercel_`, `sbp_`, `0x`, `rk_test_` and `sk-`; the inference copies lacked
/// `PRIVATE KEY`, `-----BEGIN` and `TESTKEY_`. The copies were merged here
/// (kimi-dedup audit rows #12-13).
///
/// Entry order is part of the table: more specific prefixes (`sk-proj-`,
/// `sk-ant-`) come before the generic `sk-`.
/// NOTE(review): whether any consumer relies on first-match order is not
/// visible from this module; keep the order unless that is confirmed.
pub const KNOWN_PREFIXES: &[&str] = &[
    // --- GitHub token families ---
    "ghp_",
    "gho_",
    "ghu_",
    "ghs_",
    "ghr_",
    "github_pat_",
    // --- Stripe key families, live and test modes ---
    "sk_live_",
    "sk_test_",
    "pk_live_",
    "pk_test_",
    "rk_live_",
    "rk_test_",
    // --- AWS access key IDs ---
    "AKIA",
    "ASIA",
    // --- Slack tokens ---
    "xoxb-",
    "xoxp-",
    "xoxa-",
    "xoxr-",
    "xoxs-",
    // --- OpenAI / Anthropic, followed by the generic `sk-` catch-all ---
    "sk-proj-",
    "sk-ant-",
    "sk-",
    // --- Google API keys ---
    "AIza",
    // --- SendGrid ---
    "SG.",
    // --- HuggingFace ---
    "hf_",
    // --- npm ---
    "npm_",
    // --- PyPI ---
    "pypi-",
    // --- GitLab tokens ---
    "glpat-",
    "glcbt-",
    "glrt-",
    // --- DigitalOcean ---
    "dop_v1_",
    // --- JWT shape: base64url encoding of a header starting `{"alg":...}` ---
    "eyJ",
    // --- Vercel ---
    "vercel_",
    // --- Supabase ---
    "sbp_",
    // --- Hex-prefixed values: Ethereum-style addresses and the handful of
    //     API keys that are distributed as 0x<hex> ---
    "0x",
    // --- Bare keyword captured as a credential. The upstream detector has
    //     already gated on the `PRIVATE KEY` substring, so the floor only
    //     applies to captured bodies and not to arbitrary PEM blocks ---
    "PRIVATE KEY",
    // --- PEM-framed private-key blocks: what the `private-key` detector
    //     captures begins with `-----BEGIN`
    //     (e.g. `-----BEGIN RSA-PRIVATE-KEY-----`) ---
    "-----BEGIN",
    // --- Test-fixture marker referenced by the bundled suppression list ---
    "TESTKEY_",
];
81
82/// Return a minimum confidence floor for credentials with well-known literal prefixes.
83///
84/// Credentials carrying a placeholder word (`EXAMPLE`, `PLACEHOLDER`, `DUMMY`,
85/// `FAKE`, `SAMPLE`, `CHANGEME`) do NOT get the floor. A `ghp_EXAMPLE_…`
86/// or `sk_live_PLACEHOLDER_…` is a doc sample, not a credential - the
87/// placeholder penalty in `apply_post_ml_penalties` had already slammed
88/// these to ~0.05, but the unconditional `final_score.max(0.8)` in
89/// `scan_postprocess` then lifted them straight back. Mirror corpus
90/// 2026-05-29: 154 docs-example FPs across the GitHub PAT, AWS access
91/// key, Slack bot token, and Stripe secret key prefix families all
92/// surfaced through this exact path; this single guard kills them.
93///
94/// The same lift-back defeated the degenerate-repeat penalty: a known-prefix
95/// placeholder like `AKIAXXXXXXXXXXXXXXXX` (16-char `X` run) was crushed to
96/// ~0.08 by `apply_post_ml_penalties` and then floored back to 0.8 here. The
97/// `is_degenerate_repeat` skip (CredData dogfood 2026-06-03) closes that hole
98/// the same way - a 10+ identical-char run is never a real key body.
99#[must_use]
100pub fn known_prefix_confidence_floor(credential: &str) -> Option<f64> {
101 if super::penalties::contains_placeholder_word(credential)
102 || crate::decode_structure::decoded_contains_placeholder(credential)
103 || super::penalties::is_degenerate_repeat(credential)
104 {
105 return None;
106 }
107 if KNOWN_PREFIXES
108 .iter()
109 .any(|prefix| credential.starts_with(prefix))
110 {
111 Some(0.8)
112 } else {
113 None
114 }
115}