Skip to main content

keyhog_core/
spec.rs

1//! Detector specification: TOML-based pattern definitions with regex, keywords,
2//! verification endpoints, and companion patterns.
3
4// Debt bucket: 55 public items, each landed before the crate floor raised
5// `missing_docs` to `warn`. Each is part of the public TOML schema and would
6// benefit from a doc line; remove this allow once they all carry one.
7#![allow(missing_docs)]
8
9mod load;
10mod validate;
11
12use serde::{Deserialize, Serialize};
13use thiserror::Error;
14
15pub use load::{
16    load_detector_cache, load_detectors, load_detectors_from_str, load_detectors_with_gate,
17    save_detector_cache,
18};
19pub use validate::{validate_detector, QualityIssue};
20
21/// Metadata field specification for verification results.
22#[derive(Debug, Clone, Serialize, Deserialize)]
23#[serde(deny_unknown_fields)]
24pub struct MetadataSpec {
25    /// Field name in the finding metadata map.
26    pub name: String,
27    /// GJSON path to extract from the verification response body.
28    pub json_path: String,
29}
30
31/// A complete detector definition loaded from a TOML file.
32#[derive(Debug, Clone, Serialize, Deserialize, Default)]
33#[serde(deny_unknown_fields)]
34pub struct DetectorSpec {
35    /// Unique stable identifier (e.g. \`aws-access-key\`).
36    pub id: String,
37    /// Human-readable name.
38    pub name: String,
39    /// Target service (e.g. \`aws\`, \`stripe\`).
40    pub service: String,
41    /// Default severity for findings.
42    pub severity: Severity,
43    /// List of regex patterns to match.
44    pub patterns: Vec<PatternSpec>,
45    /// Secondary patterns required to confirm a match.
46    #[serde(default)]
47    pub companions: Vec<CompanionSpec>,
48    /// Live verification configuration.
49    pub verify: Option<VerifySpec>,
50    /// High-performance pre-filtering keywords.
51    #[serde(default)]
52    pub keywords: Vec<String>,
53    /// Self-declared per-detector confidence floor, in `[0.0, 1.0]`.
54    ///
55    /// When set, findings from THIS detector use this floor instead of the
56    /// global `--min-confidence` / `[scan] min_confidence`. A detector with a
57    /// distinctive vendor prefix (e.g. sourcegraph `sgp_<40hex>`, cursor
58    /// `key_<64hex>`) is high-confidence by virtue of the prefix even when the
59    /// body is low-entropy hex that the generic confidence model scores below
60    /// the global floor; the detector author declares that here so the
61    /// detector ships working out of the box. Costs nothing at scan time —
62    /// it is a single O(1) map lookup at the post-scan floor gate, on an
63    /// already-compiled corpus. An operator `.keyhog.toml`
64    /// `[detector.<id>] min_confidence` still overrides this self-declared
65    /// default. `None` (the default) means "use the global floor".
66    #[serde(default)]
67    pub min_confidence: Option<f64>,
68    /// Inline self-test fixtures (`[[detector.tests]]`, Tier-B data): each entry
69    /// carries a positive example the detector MUST fire on and/or a negative
70    /// example it MUST NOT. Consumed by the contract/self-validate harness;
71    /// ignored at scan time. Modeled here (rather than silently dropped) so the
72    /// schema's `deny_unknown_fields` typo-guard covers the whole detector file.
73    #[serde(default)]
74    pub tests: Vec<DetectorTestSpec>,
75}
76
77/// One inline detector self-test fixture (`[[detector.tests]]`).
78#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
79#[serde(deny_unknown_fields)]
80pub struct DetectorTestSpec {
81    /// Text this detector MUST fire on.
82    #[serde(default)]
83    pub test_positive: Option<String>,
84    /// Text this detector MUST NOT fire on.
85    #[serde(default)]
86    pub test_negative: Option<String>,
87}
88
89/// A regex pattern with optional capture group and description.
90#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
91#[serde(deny_unknown_fields)]
92pub struct PatternSpec {
93    /// Regular expression string (Rust flavor).
94    pub regex: String,
95    /// Optional context description.
96    pub description: Option<String>,
97    /// Optional capture group index containing the secret.
98    pub group: Option<usize>,
99    /// When true, a match against THIS pattern downgrades the
100    /// finding to `Severity::ClientSafe` (regardless of the detector's
101    /// nominal severity). Used by services that intentionally ship
102    /// public-facing keys in client bundles:
103    ///
104    ///   - Sentry DSN (the `https://<key>@` URL is meant for the browser)
105    ///   - Stripe `pk_live_` / `pk_test_` (publishable, sk_ is secret)
106    ///   - Mapbox `pk.` (public, `sk.` is secret)
107    ///   - Firebase Web API key, Google Maps browser key
108    ///   - PostHog / Mixpanel / Algolia search / Datadog browser RUM
109    ///
110    /// Per-pattern (not per-detector) so detectors that fire on both
111    /// the public *and* the secret prefix can tag only the public one.
112    ///
113    /// Case sensitivity: keyhog compiles every regex `case_insensitive(true)`,
114    /// so to make a single pattern case-SENSITIVE (AWS `AKIA` is uppercase,
115    /// GCP/Snowflake ids are lowercase) prefix its regex with the inline flag
116    /// `(?-i)` in the TOML - no schema field needed.
117    #[serde(default)]
118    pub client_safe: bool,
119}
120
121/// Secondary pattern used to confirm a primary match or provide extra context.
122#[derive(Debug, Clone, Serialize, Deserialize)]
123#[serde(deny_unknown_fields)]
124pub struct CompanionSpec {
125    /// Field name used in verification templates (e.g. \`{{companion.secret_key}}\`).
126    pub name: String,
127    /// Regex to find the companion value nearby.
128    pub regex: String,
129    /// Maximum line distance from the primary match.
130    pub within_lines: usize,
131    /// Whether this companion must be found to report the finding.
132    #[serde(default)]
133    pub required: bool,
134}
135
136/// Live verification configuration for a detector.
137#[derive(Debug, Clone, Default, Serialize, Deserialize)]
138#[serde(deny_unknown_fields)]
139pub struct VerifySpec {
140    /// Target service identifier (defaults to detector's service if omitted).
141    #[serde(default)]
142    pub service: String,
143    /// HTTP method (default: GET).
144    pub method: Option<HttpMethod>,
145    /// Endpoint URL with optional \`{{match}}\` or \`{{companion.<name>}}\` placeholders.
146    pub url: Option<String>,
147    /// Authentication scheme.
148    pub auth: Option<AuthSpec>,
149    /// Custom HTTP headers.
150    #[serde(default)]
151    pub headers: Vec<HeaderSpec>,
152    /// Optional request body template.
153    pub body: Option<String>,
154    /// Criteria for a successful verification.
155    pub success: Option<SuccessSpec>,
156    /// Metadata to extract from the response.
157    #[serde(default)]
158    pub metadata: Vec<MetadataSpec>,
159    /// Optional request timeout override.
160    pub timeout_ms: Option<u64>,
161    /// Multi-step verification flow.
162    #[serde(default)]
163    pub steps: Vec<StepSpec>,
164    /// Domain allowlist for the verify URL after interpolation. If non-empty,
165    /// the resolved host of the (interpolated) URL - and of every step's URL -
166    /// MUST equal one of these entries (or be a subdomain of one). When empty,
167    /// the verifier falls back to a hardcoded service allowlist if the
168    /// `service` field maps to a known provider; otherwise the verifier
169    /// REFUSES to send the request. This blocks malicious detector TOMLs
170    /// that set `url = "{{match}}"` (or interpolate an attacker-controlled
171    /// companion) from exfiltrating credentials. See kimi-wave1 audit
172    /// finding 4.1 + wave3 §1.
173    #[serde(default)]
174    pub allowed_domains: Vec<String>,
175    /// Optional out-of-band verification probe. When set, the verifier mints a
176    /// per-finding correlation URL via the configured interactsh server,
177    /// substitutes `{{interactsh}}` (and `{{interactsh.host}}` /
178    /// `{{interactsh.url}}`) into the request template, and waits for the
179    /// service to call back. OOB verification proves a leaked credential is
180    /// **exfil-capable**, not just live: a webhook URL that returns 200 OK to
181    /// every probe still has to actually fetch our collector to confirm it
182    /// will deliver attacker-controlled traffic.
183    ///
184    /// Gated behind the runtime `--verify-oob` flag - never default. When the
185    /// flag is off, `oob` is ignored and verification falls back to the
186    /// HTTP success criteria alone.
187    pub oob: Option<OobSpec>,
188}
189
190/// Out-of-band callback verification configuration.
191#[derive(Debug, Clone, Serialize, Deserialize)]
192#[serde(deny_unknown_fields)]
193pub struct OobSpec {
194    /// Callback protocol the verifier waits for. The service may also touch
195    /// other protocols on the same correlation id; only the listed ones count
196    /// toward `Verified`.
197    pub protocol: OobProtocol,
198    /// How long to wait for the callback after the HTTP request returns.
199    /// Defaults to 30 seconds when omitted; capped at the engine's
200    /// `oob_timeout_max` to bound scan time.
201    #[serde(default)]
202    pub timeout_secs: Option<u64>,
203    /// Verification policy:
204    /// - `OobAndHttp` (default): both HTTP success criteria *and* OOB
205    ///   callback must hold. This is the strict mode for webhook-style
206    ///   detectors where 200 OK is necessary but not sufficient.
207    /// - `OobOnly`: ignore HTTP success, trust the OOB callback. For
208    ///   detectors where the API has no useful HTTP response shape but
209    ///   provably triggers an outbound request (e.g., one-way push tokens).
210    /// - `OobOptional`: HTTP success alone verifies; OOB just enriches
211    ///   metadata with `oob_observed=true|false` for the report.
212    #[serde(default)]
213    pub policy: OobPolicy,
214}
215
216/// Out-of-band callback protocol expected from a successful exfil.
217#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
218#[serde(rename_all = "lowercase")]
219pub enum OobProtocol {
220    /// Any DNS resolution against `{{interactsh}}.host`. Cheapest signal -
221    /// many services resolve a webhook URL even before fetching it.
222    Dns,
223    /// HTTP or HTTPS request to the interactsh URL. The strongest signal;
224    /// proves the service made an outbound HTTP request with the credential.
225    Http,
226    /// SMTP delivery attempt to `<random>@{{interactsh.host}}`. For mail
227    /// detectors (Mailgun, SendGrid, …) where exfil = sending mail.
228    Smtp,
229    /// Any of the above. Use sparingly - a chatty CDN doing DNS prefetch
230    /// can cause false positives.
231    Any,
232}
233
234/// How OOB observation combines with HTTP success criteria.
235#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
236#[serde(rename_all = "snake_case")]
237pub enum OobPolicy {
238    #[default]
239    OobAndHttp,
240    OobOnly,
241    OobOptional,
242}
243
244/// A single step in a multi-step verification flow.
245#[derive(Debug, Clone, Serialize, Deserialize)]
246#[serde(deny_unknown_fields)]
247pub struct StepSpec {
248    pub name: String,
249    pub method: HttpMethod,
250    pub url: String,
251    pub auth: AuthSpec,
252    #[serde(default)]
253    pub headers: Vec<HeaderSpec>,
254    pub body: Option<String>,
255    pub success: SuccessSpec,
256    #[serde(default)]
257    pub extract: Vec<MetadataSpec>,
258}
259
260/// Custom HTTP header specification.
261#[derive(Debug, Clone, Serialize, Deserialize)]
262#[serde(deny_unknown_fields)]
263pub struct HeaderSpec {
264    pub name: String,
265    pub value: String,
266}
267
268/// Authentication scheme for verification requests.
269#[derive(Debug, Clone, Serialize, Deserialize)]
270#[serde(tag = "type", rename_all = "snake_case")]
271pub enum AuthSpec {
272    None,
273    Bearer {
274        field: String,
275    },
276    Basic {
277        username: String,
278        password: String,
279    },
280    Header {
281        name: String,
282        template: String,
283    },
284    Query {
285        param: String,
286        field: String,
287    },
288    #[serde(rename = "aws_v4")]
289    AwsV4 {
290        access_key: String,
291        secret_key: String,
292        region: String,
293        service: String,
294        session_token: Option<String>,
295    },
296    Script {
297        engine: String,
298        code: String,
299    },
300}
301
302impl AuthSpec {
303    pub fn service_name(&self) -> Option<&str> {
304        match self {
305            AuthSpec::AwsV4 { service, .. } => Some(service),
306            _ => None,
307        }
308    }
309}
310
311/// Criteria for a successful verification response.
312#[derive(Debug, Clone, Serialize, Deserialize, Default)]
313#[serde(deny_unknown_fields)]
314pub struct SuccessSpec {
315    #[serde(default)]
316    /// Required HTTP status code.
317    pub status: Option<u16>,
318    #[serde(default)]
319    /// Reject if this status code is returned.
320    pub status_not: Option<u16>,
321    #[serde(default)]
322    /// Response body must contain this substring.
323    pub body_contains: Option<String>,
324    #[serde(default)]
325    /// Response body must NOT contain this substring.
326    pub body_not_contains: Option<String>,
327    #[serde(default)]
328    /// GJSON path to check in response body.
329    pub json_path: Option<String>,
330    #[serde(default)]
331    /// Expected value at \`json_path\`.
332    pub equals: Option<String>,
333}
334
335/// Severity level for a finding.
336///
337/// `ClientSafe` is the bug-bounty tier for keys that are public by
338/// design and shipped in client bundles: Sentry DSNs, Stripe `pk_*`
339/// publishable keys, Mapbox `pk.` public tokens, PostHog project keys,
340/// Firebase Web API keys, Google Maps browser keys, Algolia search
341/// keys, Datadog browser RUM tokens, Mixpanel project tokens. The
342/// detector still fires (a token grep is a token grep) but the
343/// finding is rendered below `Low` and gated by `--hide-client-safe`
344/// so a hunter running `keyhog scan --hide-client-safe target/` only
345/// sees credentials that an attacker could actually exfiltrate
346/// server-side.
347#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Default)]
348#[serde(rename_all = "kebab-case")]
349pub enum Severity {
350    #[default]
351    Info,
352    #[serde(alias = "client_safe")]
353    ClientSafe,
354    Low,
355    Medium,
356    High,
357    Critical,
358}
359
360impl Severity {
361    pub fn to_severity(&self) -> Self {
362        *self
363    }
364
365    /// Step the severity down one tier (Critical → High, High → Medium, …).
366    /// `Info` stays at `Info` (no lower bucket).
367    ///
368    /// Used by diff-aware scoring: a credential that only appears in non-HEAD
369    /// git history is still a leak (commit history is public if the repo is)
370    /// but is meaningfully less urgent than a credential live in HEAD that an
371    /// attacker can grep right now. One tier of downgrade communicates that
372    /// without hiding the finding entirely.
373    pub fn downgrade_one(self) -> Self {
374        match self {
375            Severity::Critical => Severity::High,
376            Severity::High => Severity::Medium,
377            Severity::Medium => Severity::Low,
378            Severity::Low => Severity::ClientSafe,
379            Severity::ClientSafe => Severity::Info,
380            Severity::Info => Severity::Info,
381        }
382    }
383
384    /// Canonical lowercase string for this severity, matching the serde
385    /// `kebab-case` wire form (`client-safe`, not `clientsafe`). This is the
386    /// single source of truth for rendering a severity as text; reporters and
387    /// any other surface should go through `Display`/`as_str` rather than
388    /// reaching for `format!("{:?}")`, which diverges for `ClientSafe`.
389    pub fn as_str(&self) -> &'static str {
390        match self {
391            Severity::Info => "info",
392            Severity::ClientSafe => "client-safe",
393            Severity::Low => "low",
394            Severity::Medium => "medium",
395            Severity::High => "high",
396            Severity::Critical => "critical",
397        }
398    }
399}
400
401impl std::fmt::Display for Severity {
402    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
403        f.write_str(self.as_str())
404    }
405}
406
407/// HTTP method for verification requests.
408#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
409pub enum HttpMethod {
410    #[serde(rename = "GET")]
411    Get,
412    #[serde(rename = "POST")]
413    Post,
414    #[serde(rename = "PUT")]
415    Put,
416    #[serde(rename = "DELETE")]
417    Delete,
418    #[serde(rename = "PATCH")]
419    Patch,
420    #[serde(rename = "HEAD")]
421    Head,
422}
423
424/// Wrapping struct for a detector TOML file.
425#[derive(Debug, Clone, Serialize, Deserialize)]
426pub struct DetectorFile {
427    pub detector: DetectorSpec,
428}
429
430/// Errors returned while loading or validating detector specifications.
431#[derive(Debug, Error)]
432#[allow(clippy::result_large_err)] // SpecError variants include 128-byte toml::de::Error; boxing would be a breaking API change.
433pub enum SpecError {
434    #[error(
435        "failed to read detector file {path}: {source}. Fix: check the detector path exists and that the file is readable TOML"
436    )]
437    ReadFile {
438        path: String,
439        source: std::io::Error,
440    },
441    #[error("invalid TOML in detector {path}: {source}. Fix: repair the TOML syntax in the detector file")]
442    InvalidToml {
443        path: std::path::PathBuf,
444        source: toml::de::Error,
445    },
446}