keyhog-core 0.5.38

keyhog-core: shared data model and detector specifications for the KeyHog secret scanner
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
//! Detector specification: TOML-based pattern definitions with regex, keywords,
//! verification endpoints, and companion patterns.

// Debt bucket: 55 public items, each landed before the crate floor raised
// `missing_docs` to `warn`. Each is part of the public TOML schema and would
// benefit from a doc line; remove this allow once they all carry one.
#![allow(missing_docs)]

mod load;
mod validate;

use serde::{Deserialize, Serialize};
use thiserror::Error;

pub use load::{
    load_detector_cache, load_detectors, load_detectors_from_str, load_detectors_with_gate,
    save_detector_cache,
};
pub use validate::{validate_detector, QualityIssue};

/// Metadata field specification for verification results.
///
/// Each entry names one value to pull out of a verification response and the
/// key it is stored under. Used by `VerifySpec::metadata` and
/// `StepSpec::extract`. Unknown TOML keys are rejected
/// (`deny_unknown_fields`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct MetadataSpec {
    /// Field name in the finding metadata map.
    pub name: String,
    /// GJSON path to extract from the verification response body.
    pub json_path: String,
}

/// A complete detector definition loaded from a TOML file.
///
/// Unknown keys are rejected at deserialization time (`deny_unknown_fields`),
/// so a misspelled field name surfaces as a load error instead of being
/// silently ignored.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(deny_unknown_fields)]
pub struct DetectorSpec {
    /// Unique stable identifier (e.g. `aws-access-key`).
    pub id: String,
    /// Human-readable name.
    pub name: String,
    /// Target service (e.g. `aws`, `stripe`).
    pub service: String,
    /// Default severity for findings.
    pub severity: Severity,
    /// List of regex patterns to match.
    pub patterns: Vec<PatternSpec>,
    /// Secondary patterns required to confirm a match. Empty when omitted.
    #[serde(default)]
    pub companions: Vec<CompanionSpec>,
    /// Live verification configuration. `None` when the detector declares no
    /// `verify` table.
    pub verify: Option<VerifySpec>,
    /// High-performance pre-filtering keywords. Empty when omitted.
    #[serde(default)]
    pub keywords: Vec<String>,
    /// Self-declared per-detector confidence floor, in `[0.0, 1.0]`.
    ///
    /// When set, findings from THIS detector use this floor instead of the
    /// global `--min-confidence` / `[scan] min_confidence`. A detector with a
    /// distinctive vendor prefix (e.g. sourcegraph `sgp_<40hex>`, cursor
    /// `key_<64hex>`) is high-confidence by virtue of the prefix even when the
    /// body is low-entropy hex that the generic confidence model scores below
    /// the global floor; the detector author declares that here so the
    /// detector ships working out of the box. Costs nothing at scan time:
    /// it is a single O(1) map lookup at the post-scan floor gate, on an
    /// already-compiled corpus. An operator `.keyhog.toml`
    /// `[detector.<id>] min_confidence` still overrides this self-declared
    /// default. `None` (the default) means "use the global floor".
    ///
    /// NOTE(review): the `[0.0, 1.0]` range is not enforced by this type;
    /// presumably checked during validation - confirm.
    #[serde(default)]
    pub min_confidence: Option<f64>,
    /// Inline self-test fixtures (`[[detector.tests]]`, Tier-B data): each entry
    /// carries a positive example the detector MUST fire on and/or a negative
    /// example it MUST NOT. Consumed by the contract/self-validate harness;
    /// ignored at scan time. Modeled here (rather than silently dropped) so the
    /// schema's `deny_unknown_fields` typo-guard covers the whole detector file.
    #[serde(default)]
    pub tests: Vec<DetectorTestSpec>,
}

/// One inline detector self-test fixture (`[[detector.tests]]`).
///
/// Both fields are optional and default to `None`, so a fixture may carry a
/// positive example, a negative example, or both.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
#[serde(deny_unknown_fields)]
pub struct DetectorTestSpec {
    /// Text this detector MUST fire on.
    #[serde(default)]
    pub test_positive: Option<String>,
    /// Text this detector MUST NOT fire on.
    #[serde(default)]
    pub test_negative: Option<String>,
}

/// A regex pattern with optional capture group and description.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
#[serde(deny_unknown_fields)]
pub struct PatternSpec {
    /// Regular expression string (Rust flavor).
    ///
    /// Case sensitivity: keyhog compiles every regex `case_insensitive(true)`,
    /// so to make a single pattern case-SENSITIVE (AWS `AKIA` is uppercase,
    /// GCP/Snowflake ids are lowercase) prefix its regex with the inline flag
    /// `(?-i)` in the TOML - no schema field needed.
    pub regex: String,
    /// Optional context description.
    pub description: Option<String>,
    /// Optional capture group index containing the secret.
    pub group: Option<usize>,
    /// When true, a match against THIS pattern downgrades the
    /// finding to `Severity::ClientSafe` (regardless of the detector's
    /// nominal severity). Defaults to `false` when omitted. Used by services
    /// that intentionally ship public-facing keys in client bundles:
    ///
    ///   - Sentry DSN (the `https://<key>@` URL is meant for the browser)
    ///   - Stripe `pk_live_` / `pk_test_` (publishable, sk_ is secret)
    ///   - Mapbox `pk.` (public, `sk.` is secret)
    ///   - Firebase Web API key, Google Maps browser key
    ///   - PostHog / Mixpanel / Algolia search / Datadog browser RUM
    ///
    /// Per-pattern (not per-detector) so detectors that fire on both
    /// the public *and* the secret prefix can tag only the public one.
    #[serde(default)]
    pub client_safe: bool,
}

/// Secondary pattern used to confirm a primary match or provide extra context.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct CompanionSpec {
    /// Field name used in verification templates (e.g. `{{companion.secret_key}}`).
    pub name: String,
    /// Regex to find the companion value nearby.
    pub regex: String,
    /// Maximum line distance from the primary match.
    pub within_lines: usize,
    /// Whether this companion must be found to report the finding.
    /// Defaults to `false` when omitted.
    #[serde(default)]
    pub required: bool,
}

/// Live verification configuration for a detector.
///
/// Every field is optional in the TOML: `Option` fields deserialize to `None`
/// and the `#[serde(default)]` collections to empty when omitted.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct VerifySpec {
    /// Target service identifier (defaults to detector's service if omitted).
    ///
    /// Deserializes to an empty string when omitted. NOTE(review): the
    /// fallback to the detector's service is presumably applied by the loader
    /// or verifier; it is not visible in this file - confirm.
    #[serde(default)]
    pub service: String,
    /// HTTP method (default: GET).
    pub method: Option<HttpMethod>,
    /// Endpoint URL with optional `{{match}}` or `{{companion.<name>}}` placeholders.
    pub url: Option<String>,
    /// Authentication scheme.
    pub auth: Option<AuthSpec>,
    /// Custom HTTP headers.
    #[serde(default)]
    pub headers: Vec<HeaderSpec>,
    /// Optional request body template.
    pub body: Option<String>,
    /// Criteria for a successful verification.
    pub success: Option<SuccessSpec>,
    /// Metadata to extract from the response.
    #[serde(default)]
    pub metadata: Vec<MetadataSpec>,
    /// Optional request timeout override, in milliseconds (per the field name).
    pub timeout_ms: Option<u64>,
    /// Multi-step verification flow.
    #[serde(default)]
    pub steps: Vec<StepSpec>,
    /// Domain allowlist for the verify URL after interpolation. If non-empty,
    /// the resolved host of the (interpolated) URL - and of every step's URL -
    /// MUST equal one of these entries (or be a subdomain of one). When empty,
    /// the verifier falls back to a hardcoded service allowlist if the
    /// `service` field maps to a known provider; otherwise the verifier
    /// REFUSES to send the request. This blocks malicious detector TOMLs
    /// that set `url = "{{match}}"` (or interpolate an attacker-controlled
    /// companion) from exfiltrating credentials. See kimi-wave1 audit
    /// finding 4.1 + wave3 §1.
    #[serde(default)]
    pub allowed_domains: Vec<String>,
    /// Optional out-of-band verification probe. When set, the verifier mints a
    /// per-finding correlation URL via the configured interactsh server,
    /// substitutes `{{interactsh}}` (and `{{interactsh.host}}` /
    /// `{{interactsh.url}}`) into the request template, and waits for the
    /// service to call back. OOB verification proves a leaked credential is
    /// **exfil-capable**, not just live: a webhook URL that returns 200 OK to
    /// every probe still has to actually fetch our collector to confirm it
    /// will deliver attacker-controlled traffic.
    ///
    /// Gated behind the runtime `--verify-oob` flag - never default. When the
    /// flag is off, `oob` is ignored and verification falls back to the
    /// HTTP success criteria alone.
    pub oob: Option<OobSpec>,
}

/// Out-of-band callback verification configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct OobSpec {
    /// Callback protocol the verifier waits for. The service may also touch
    /// other protocols on the same correlation id; only the listed ones count
    /// toward `Verified`.
    pub protocol: OobProtocol,
    /// How long to wait for the callback after the HTTP request returns.
    /// Defaults to 30 seconds when omitted; capped at the engine's
    /// `oob_timeout_max` to bound scan time.
    ///
    /// The field itself deserializes to `None` when omitted; the 30-second
    /// default and the cap are applied by the consumer, not by this type.
    #[serde(default)]
    pub timeout_secs: Option<u64>,
    /// Verification policy (TOML values are the snake_case variant names):
    /// - `OobAndHttp` / `oob_and_http` (default): both HTTP success criteria
    ///   *and* OOB callback must hold. This is the strict mode for
    ///   webhook-style detectors where 200 OK is necessary but not sufficient.
    /// - `OobOnly` / `oob_only`: ignore HTTP success, trust the OOB callback.
    ///   For detectors where the API has no useful HTTP response shape but
    ///   provably triggers an outbound request (e.g., one-way push tokens).
    /// - `OobOptional` / `oob_optional`: HTTP success alone verifies; OOB just
    ///   enriches metadata with `oob_observed=true|false` for the report.
    #[serde(default)]
    pub policy: OobPolicy,
}

/// Out-of-band callback protocol expected from a successful exfil.
///
/// Serialized in lowercase: `dns`, `http`, `smtp`, `any`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum OobProtocol {
    /// Any DNS resolution against `{{interactsh.host}}`. Cheapest signal -
    /// many services resolve a webhook URL even before fetching it.
    Dns,
    /// HTTP or HTTPS request to the interactsh URL. The strongest signal;
    /// proves the service made an outbound HTTP request with the credential.
    Http,
    /// SMTP delivery attempt to `<random>@{{interactsh.host}}`. For mail
    /// detectors (Mailgun, SendGrid, …) where exfil = sending mail.
    Smtp,
    /// Any of the above. Use sparingly - a chatty CDN doing DNS prefetch
    /// can cause false positives.
    Any,
}

/// How OOB observation combines with HTTP success criteria.
///
/// Serialized in snake_case: `oob_and_http`, `oob_only`, `oob_optional`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum OobPolicy {
    /// Strict mode and the default: both the HTTP success criteria *and* the
    /// OOB callback must hold.
    #[default]
    OobAndHttp,
    /// Ignore HTTP success and trust the OOB callback alone.
    OobOnly,
    /// HTTP success alone verifies; the OOB observation only enriches the
    /// report metadata (`oob_observed=true|false`).
    OobOptional,
}

/// A single step in a multi-step verification flow.
///
/// Unlike the top-level `VerifySpec`, a step must always declare `method`,
/// `url`, `auth` and `success`; only `headers`, `body` and `extract` may be
/// omitted.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct StepSpec {
    /// Name of this step.
    pub name: String,
    /// HTTP method used for this step's request.
    pub method: HttpMethod,
    /// Request URL for this step. Its host is subject to the same
    /// `VerifySpec::allowed_domains` check as the top-level URL.
    /// NOTE(review): presumably accepts the same `{{match}}` /
    /// `{{companion.<name>}}` placeholders as `VerifySpec::url` - confirm.
    pub url: String,
    /// Authentication scheme applied to this step's request.
    pub auth: AuthSpec,
    /// Custom HTTP headers for this step. Empty when omitted.
    #[serde(default)]
    pub headers: Vec<HeaderSpec>,
    /// Optional request body for this step.
    pub body: Option<String>,
    /// Criteria this step's response must meet.
    pub success: SuccessSpec,
    /// Values to extract from this step's response. Empty when omitted.
    /// NOTE(review): whether extracted values feed later steps or only the
    /// finding metadata is not visible in this file - confirm.
    #[serde(default)]
    pub extract: Vec<MetadataSpec>,
}

/// Custom HTTP header specification.
///
/// Used by `VerifySpec::headers` and `StepSpec::headers`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct HeaderSpec {
    /// Header name.
    pub name: String,
    /// Header value. NOTE(review): presumably supports the same template
    /// placeholders as the verify URL - confirm against the verifier.
    pub value: String,
}

/// Authentication scheme for verification requests.
///
/// Internally tagged: the `type` key selects the variant using its
/// snake_case name (`none`, `bearer`, `basic`, `header`, `query`, `aws_v4`,
/// `script`); the remaining keys are that variant's fields.
///
/// NOTE(review): the string fields look like value references or templates
/// that the verifier resolves (e.g. to the match or a companion). How they
/// are interpreted is not visible in this file - confirm before relying on
/// the per-field descriptions below.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum AuthSpec {
    /// No authentication.
    None,
    /// Bearer-token authentication.
    Bearer {
        /// Identifies the value sent as the bearer token.
        field: String,
    },
    /// HTTP Basic authentication.
    Basic {
        /// Username component.
        username: String,
        /// Password component.
        password: String,
    },
    /// Credential carried in a custom request header.
    Header {
        /// Header name.
        name: String,
        /// Template for the header value.
        template: String,
    },
    /// Credential carried in a URL query parameter.
    Query {
        /// Query parameter name.
        param: String,
        /// Identifies the value sent in that parameter.
        field: String,
    },
    /// AWS Signature Version 4 style credentials.
    #[serde(rename = "aws_v4")]
    AwsV4 {
        /// Access key identifier.
        access_key: String,
        /// Secret access key.
        secret_key: String,
        /// Region.
        region: String,
        /// Service name; this is the value returned by `service_name`.
        service: String,
        /// Optional session token.
        session_token: Option<String>,
    },
    /// Script-driven authentication.
    /// NOTE(review): supported engines and the script contract are not
    /// visible in this file.
    Script {
        /// Name of the script engine.
        engine: String,
        /// Script source code.
        code: String,
    },
}

impl AuthSpec {
    pub fn service_name(&self) -> Option<&str> {
        match self {
            AuthSpec::AwsV4 { service, .. } => Some(service),
            _ => None,
        }
    }
}

/// Criteria for a successful verification response.
///
/// Every criterion is optional and defaults to `None`.
/// NOTE(review): how multiple criteria combine (presumably all set criteria
/// must hold) is decided by the verifier and is not visible in this file.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(deny_unknown_fields)]
pub struct SuccessSpec {
    /// Required HTTP status code.
    #[serde(default)]
    pub status: Option<u16>,
    /// Reject if this status code is returned.
    #[serde(default)]
    pub status_not: Option<u16>,
    /// Response body must contain this substring.
    #[serde(default)]
    pub body_contains: Option<String>,
    /// Response body must NOT contain this substring.
    #[serde(default)]
    pub body_not_contains: Option<String>,
    /// GJSON path to check in response body.
    #[serde(default)]
    pub json_path: Option<String>,
    /// Expected value at `json_path`.
    #[serde(default)]
    pub equals: Option<String>,
}

/// Severity level for a finding.
///
/// `ClientSafe` is the bug-bounty tier for keys that are public by
/// design and shipped in client bundles: Sentry DSNs, Stripe `pk_*`
/// publishable keys, Mapbox `pk.` public tokens, PostHog project keys,
/// Firebase Web API keys, Google Maps browser keys, Algolia search
/// keys, Datadog browser RUM tokens, Mixpanel project tokens. The
/// detector still fires (a token grep is a token grep) but the
/// finding is rendered below `Low` and gated by `--hide-client-safe`
/// so a hunter running `keyhog scan --hide-client-safe target/` only
/// sees credentials that an attacker could actually exfiltrate
/// server-side.
///
/// Variant order is significant: the derived `PartialOrd`/`Ord` compare by
/// declaration order, so `Info < ClientSafe < Low < Medium < High < Critical`.
/// Do not reorder variants.
///
/// Wire form is kebab-case (`info`, `client-safe`, `low`, `medium`, `high`,
/// `critical`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum Severity {
    /// Lowest tier and the `Default` value.
    #[default]
    Info,
    /// Public-by-design key (see the type-level docs). Also accepted as
    /// `client_safe` on input.
    #[serde(alias = "client_safe")]
    ClientSafe,
    /// Low severity.
    Low,
    /// Medium severity.
    Medium,
    /// High severity.
    High,
    /// Highest tier.
    Critical,
}

impl Severity {
    pub fn to_severity(&self) -> Self {
        *self
    }

    /// Step the severity down one tier (Critical → High, High → Medium, …).
    /// `Info` stays at `Info` (no lower bucket).
    ///
    /// Used by diff-aware scoring: a credential that only appears in non-HEAD
    /// git history is still a leak (commit history is public if the repo is)
    /// but is meaningfully less urgent than a credential live in HEAD that an
    /// attacker can grep right now. One tier of downgrade communicates that
    /// without hiding the finding entirely.
    pub fn downgrade_one(self) -> Self {
        match self {
            Severity::Critical => Severity::High,
            Severity::High => Severity::Medium,
            Severity::Medium => Severity::Low,
            Severity::Low => Severity::ClientSafe,
            Severity::ClientSafe => Severity::Info,
            Severity::Info => Severity::Info,
        }
    }

    /// Canonical lowercase string for this severity, matching the serde
    /// `kebab-case` wire form (`client-safe`, not `clientsafe`). This is the
    /// single source of truth for rendering a severity as text; reporters and
    /// any other surface should go through `Display`/`as_str` rather than
    /// reaching for `format!("{:?}")`, which diverges for `ClientSafe`.
    pub fn as_str(&self) -> &'static str {
        match self {
            Severity::Info => "info",
            Severity::ClientSafe => "client-safe",
            Severity::Low => "low",
            Severity::Medium => "medium",
            Severity::High => "high",
            Severity::Critical => "critical",
        }
    }
}

/// Renders the severity using its canonical `as_str` name.
///
/// Uses `Formatter::pad` rather than `write_str` so that width, fill and
/// alignment flags are honored (e.g. `format!("{:<8}", severity)` in a
/// column-aligned report). Output for a plain `{}` is unchanged.
impl std::fmt::Display for Severity {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.pad(self.as_str())
    }
}

/// HTTP method for verification requests.
///
/// Each variant is explicitly renamed to its uppercase verb, so the TOML
/// value must be written in uppercase (e.g. `method = "POST"`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum HttpMethod {
    /// `GET`
    #[serde(rename = "GET")]
    Get,
    /// `POST`
    #[serde(rename = "POST")]
    Post,
    /// `PUT`
    #[serde(rename = "PUT")]
    Put,
    /// `DELETE`
    #[serde(rename = "DELETE")]
    Delete,
    /// `PATCH`
    #[serde(rename = "PATCH")]
    Patch,
    /// `HEAD`
    #[serde(rename = "HEAD")]
    Head,
}

/// Wrapping struct for a detector TOML file.
///
/// Maps the file's top-level `[detector]` table onto a [`DetectorSpec`].
/// NOTE(review): unlike the other schema types this wrapper does not set
/// `deny_unknown_fields`, so extra top-level keys are not rejected here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetectorFile {
    /// The detector definition held in the `[detector]` table.
    pub detector: DetectorSpec,
}

/// Errors returned while loading or validating detector specifications.
///
/// Each variant carries the offending path plus the underlying error in a
/// field named `source`; the `Display` text embeds both, followed by a
/// suggested fix.
#[derive(Debug, Error)]
#[allow(clippy::result_large_err)] // SpecError variants include 128-byte toml::de::Error; boxing would be a breaking API change.
pub enum SpecError {
    /// The detector file could not be read.
    #[error(
        "failed to read detector file {path}: {source}. Fix: check the detector path exists and that the file is readable TOML"
    )]
    ReadFile {
        /// Path of the file that failed to read.
        /// NOTE(review): a `String` here but a `PathBuf` in `InvalidToml`.
        path: String,
        /// Underlying I/O error.
        source: std::io::Error,
    },
    /// The detector file was read but is not valid TOML for the schema.
    #[error("invalid TOML in detector {path}: {source}. Fix: repair the TOML syntax in the detector file")]
    InvalidToml {
        /// Path of the file that failed to parse.
        path: std::path::PathBuf,
        /// Underlying TOML deserialization error.
        source: toml::de::Error,
    },
}