keyhog_core/
spec.rs

1//! Detector specification: TOML-based pattern definitions with regex, keywords,
2//! verification endpoints, and companion patterns.
3
4// Debt bucket: 55 public items, each landed before the crate floor raised
5// `missing_docs` to `warn`. Each is part of the public TOML schema and would
6// benefit from a doc line; remove this allow once they all carry one.
7#![allow(missing_docs)]
8
9mod load;
10mod validate;
11
12use serde::{Deserialize, Serialize};
13use thiserror::Error;
14
15pub use load::{
16    load_detector_cache, load_detectors, load_detectors_from_str, load_detectors_with_gate,
17    save_detector_cache,
18};
19pub use validate::{validate_detector, QualityIssue};
20
21/// Metadata field specification for verification results.
22#[derive(Debug, Clone, Serialize, Deserialize)]
23pub struct MetadataSpec {
24    /// Field name in the finding metadata map.
25    pub name: String,
26    /// GJSON path to extract from the verification response body.
27    pub json_path: String,
28}
29
30/// A complete detector definition loaded from a TOML file.
31#[derive(Debug, Clone, Serialize, Deserialize, Default)]
32pub struct DetectorSpec {
33    /// Unique stable identifier (e.g. \`aws-access-key\`).
34    pub id: String,
35    /// Human-readable name.
36    pub name: String,
37    /// Target service (e.g. \`aws\`, \`stripe\`).
38    pub service: String,
39    /// Default severity for findings.
40    pub severity: Severity,
41    /// List of regex patterns to match.
42    pub patterns: Vec<PatternSpec>,
43    /// Secondary patterns required to confirm a match.
44    #[serde(default)]
45    pub companions: Vec<CompanionSpec>,
46    /// Live verification configuration.
47    pub verify: Option<VerifySpec>,
48    /// High-performance pre-filtering keywords.
49    #[serde(default)]
50    pub keywords: Vec<String>,
51}
52
53/// A regex pattern with optional capture group and description.
54#[derive(Debug, Clone, Serialize, Deserialize, Default)]
55pub struct PatternSpec {
56    /// Regular expression string (Rust flavor).
57    pub regex: String,
58    /// Optional context description.
59    pub description: Option<String>,
60    /// Optional capture group index containing the secret.
61    pub group: Option<usize>,
62    /// When true, a match against THIS pattern downgrades the
63    /// finding to `Severity::ClientSafe` (regardless of the detector's
64    /// nominal severity). Used by services that intentionally ship
65    /// public-facing keys in client bundles:
66    ///
67    ///   - Sentry DSN (the `https://<key>@` URL is meant for the browser)
68    ///   - Stripe `pk_live_` / `pk_test_` (publishable, sk_ is secret)
69    ///   - Mapbox `pk.` (public, `sk.` is secret)
70    ///   - Firebase Web API key, Google Maps browser key
71    ///   - PostHog / Mixpanel / Algolia search / Datadog browser RUM
72    ///
73    /// Per-pattern (not per-detector) so detectors that fire on both
74    /// the public *and* the secret prefix can tag only the public one.
75    ///
76    /// Case sensitivity: keyhog compiles every regex `case_insensitive(true)`,
77    /// so to make a single pattern case-SENSITIVE (AWS `AKIA` is uppercase,
78    /// GCP/Snowflake ids are lowercase) prefix its regex with the inline flag
79    /// `(?-i)` in the TOML - no schema field needed.
80    #[serde(default)]
81    pub client_safe: bool,
82}
83
84/// Secondary pattern used to confirm a primary match or provide extra context.
85#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct CompanionSpec {
87    /// Field name used in verification templates (e.g. \`{{companion.secret_key}}\`).
88    pub name: String,
89    /// Regex to find the companion value nearby.
90    pub regex: String,
91    /// Maximum line distance from the primary match.
92    pub within_lines: usize,
93    /// Whether this companion must be found to report the finding.
94    #[serde(default)]
95    pub required: bool,
96}
97
98/// Live verification configuration for a detector.
99#[derive(Debug, Clone, Default, Serialize, Deserialize)]
100pub struct VerifySpec {
101    /// Target service identifier (defaults to detector's service if omitted).
102    #[serde(default)]
103    pub service: String,
104    /// HTTP method (default: GET).
105    pub method: Option<HttpMethod>,
106    /// Endpoint URL with optional \`{{match}}\` or \`{{companion.<name>}}\` placeholders.
107    pub url: Option<String>,
108    /// Authentication scheme.
109    pub auth: Option<AuthSpec>,
110    /// Custom HTTP headers.
111    #[serde(default)]
112    pub headers: Vec<HeaderSpec>,
113    /// Optional request body template.
114    pub body: Option<String>,
115    /// Criteria for a successful verification.
116    pub success: Option<SuccessSpec>,
117    /// Metadata to extract from the response.
118    #[serde(default)]
119    pub metadata: Vec<MetadataSpec>,
120    /// Optional request timeout override.
121    pub timeout_ms: Option<u64>,
122    /// Multi-step verification flow.
123    #[serde(default)]
124    pub steps: Vec<StepSpec>,
125    /// Domain allowlist for the verify URL after interpolation. If non-empty,
126    /// the resolved host of the (interpolated) URL - and of every step's URL -
127    /// MUST equal one of these entries (or be a subdomain of one). When empty,
128    /// the verifier falls back to a hardcoded service allowlist if the
129    /// `service` field maps to a known provider; otherwise the verifier
130    /// REFUSES to send the request. This blocks malicious detector TOMLs
131    /// that set `url = "{{match}}"` (or interpolate an attacker-controlled
132    /// companion) from exfiltrating credentials. See kimi-wave1 audit
133    /// finding 4.1 + wave3 §1.
134    #[serde(default)]
135    pub allowed_domains: Vec<String>,
136    /// Optional out-of-band verification probe. When set, the verifier mints a
137    /// per-finding correlation URL via the configured interactsh server,
138    /// substitutes `{{interactsh}}` (and `{{interactsh.host}}` /
139    /// `{{interactsh.url}}`) into the request template, and waits for the
140    /// service to call back. OOB verification proves a leaked credential is
141    /// **exfil-capable**, not just live: a webhook URL that returns 200 OK to
142    /// every probe still has to actually fetch our collector to confirm it
143    /// will deliver attacker-controlled traffic.
144    ///
145    /// Gated behind the runtime `--verify-oob` flag - never default. When the
146    /// flag is off, `oob` is ignored and verification falls back to the
147    /// HTTP success criteria alone.
148    pub oob: Option<OobSpec>,
149}
150
151/// Out-of-band callback verification configuration.
152#[derive(Debug, Clone, Serialize, Deserialize)]
153pub struct OobSpec {
154    /// Callback protocol the verifier waits for. The service may also touch
155    /// other protocols on the same correlation id; only the listed ones count
156    /// toward `Verified`.
157    pub protocol: OobProtocol,
158    /// How long to wait for the callback after the HTTP request returns.
159    /// Defaults to 30 seconds when omitted; capped at the engine's
160    /// `oob_timeout_max` to bound scan time.
161    #[serde(default)]
162    pub timeout_secs: Option<u64>,
163    /// Verification policy:
164    /// - `OobAndHttp` (default): both HTTP success criteria *and* OOB
165    ///   callback must hold. This is the strict mode for webhook-style
166    ///   detectors where 200 OK is necessary but not sufficient.
167    /// - `OobOnly`: ignore HTTP success, trust the OOB callback. For
168    ///   detectors where the API has no useful HTTP response shape but
169    ///   provably triggers an outbound request (e.g., one-way push tokens).
170    /// - `OobOptional`: HTTP success alone verifies; OOB just enriches
171    ///   metadata with `oob_observed=true|false` for the report.
172    #[serde(default)]
173    pub policy: OobPolicy,
174}
175
176/// Out-of-band callback protocol expected from a successful exfil.
177#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
178#[serde(rename_all = "lowercase")]
179pub enum OobProtocol {
180    /// Any DNS resolution against `{{interactsh}}.host`. Cheapest signal -
181    /// many services resolve a webhook URL even before fetching it.
182    Dns,
183    /// HTTP or HTTPS request to the interactsh URL. The strongest signal;
184    /// proves the service made an outbound HTTP request with the credential.
185    Http,
186    /// SMTP delivery attempt to `<random>@{{interactsh.host}}`. For mail
187    /// detectors (Mailgun, SendGrid, …) where exfil = sending mail.
188    Smtp,
189    /// Any of the above. Use sparingly - a chatty CDN doing DNS prefetch
190    /// can cause false positives.
191    Any,
192}
193
194/// How OOB observation combines with HTTP success criteria.
195#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
196#[serde(rename_all = "snake_case")]
197pub enum OobPolicy {
198    #[default]
199    OobAndHttp,
200    OobOnly,
201    OobOptional,
202}
203
204/// A single step in a multi-step verification flow.
205#[derive(Debug, Clone, Serialize, Deserialize)]
206pub struct StepSpec {
207    pub name: String,
208    pub method: HttpMethod,
209    pub url: String,
210    pub auth: AuthSpec,
211    #[serde(default)]
212    pub headers: Vec<HeaderSpec>,
213    pub body: Option<String>,
214    pub success: SuccessSpec,
215    #[serde(default)]
216    pub extract: Vec<MetadataSpec>,
217}
218
219/// Custom HTTP header specification.
220#[derive(Debug, Clone, Serialize, Deserialize)]
221pub struct HeaderSpec {
222    pub name: String,
223    pub value: String,
224}
225
226/// Authentication scheme for verification requests.
227#[derive(Debug, Clone, Serialize, Deserialize)]
228#[serde(tag = "type", rename_all = "snake_case")]
229pub enum AuthSpec {
230    None,
231    Bearer {
232        field: String,
233    },
234    Basic {
235        username: String,
236        password: String,
237    },
238    Header {
239        name: String,
240        template: String,
241    },
242    Query {
243        param: String,
244        field: String,
245    },
246    #[serde(rename = "aws_v4")]
247    AwsV4 {
248        access_key: String,
249        secret_key: String,
250        region: String,
251        service: String,
252        session_token: Option<String>,
253    },
254    Script {
255        engine: String,
256        code: String,
257    },
258}
259
260impl AuthSpec {
261    pub fn service_name(&self) -> Option<&str> {
262        match self {
263            AuthSpec::AwsV4 { service, .. } => Some(service),
264            _ => None,
265        }
266    }
267}
268
269/// Criteria for a successful verification response.
270#[derive(Debug, Clone, Serialize, Deserialize, Default)]
271pub struct SuccessSpec {
272    #[serde(default)]
273    /// Required HTTP status code.
274    pub status: Option<u16>,
275    #[serde(default)]
276    /// Reject if this status code is returned.
277    pub status_not: Option<u16>,
278    #[serde(default)]
279    /// Response body must contain this substring.
280    pub body_contains: Option<String>,
281    #[serde(default)]
282    /// Response body must NOT contain this substring.
283    pub body_not_contains: Option<String>,
284    #[serde(default)]
285    /// GJSON path to check in response body.
286    pub json_path: Option<String>,
287    #[serde(default)]
288    /// Expected value at \`json_path\`.
289    pub equals: Option<String>,
290}
291
292/// Severity level for a finding.
293///
294/// `ClientSafe` is the bug-bounty tier for keys that are public by
295/// design and shipped in client bundles: Sentry DSNs, Stripe `pk_*`
296/// publishable keys, Mapbox `pk.` public tokens, PostHog project keys,
297/// Firebase Web API keys, Google Maps browser keys, Algolia search
298/// keys, Datadog browser RUM tokens, Mixpanel project tokens. The
299/// detector still fires (a token grep is a token grep) but the
300/// finding is rendered below `Low` and gated by `--hide-client-safe`
301/// so a hunter running `keyhog scan --hide-client-safe target/` only
302/// sees credentials that an attacker could actually exfiltrate
303/// server-side.
304#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Default)]
305#[serde(rename_all = "kebab-case")]
306pub enum Severity {
307    #[default]
308    Info,
309    #[serde(alias = "client_safe")]
310    ClientSafe,
311    Low,
312    Medium,
313    High,
314    Critical,
315}
316
317impl Severity {
318    pub fn to_severity(&self) -> Self {
319        *self
320    }
321
322    /// Step the severity down one tier (Critical → High, High → Medium, …).
323    /// `Info` stays at `Info` (no lower bucket).
324    ///
325    /// Used by diff-aware scoring: a credential that only appears in non-HEAD
326    /// git history is still a leak (commit history is public if the repo is)
327    /// but is meaningfully less urgent than a credential live in HEAD that an
328    /// attacker can grep right now. One tier of downgrade communicates that
329    /// without hiding the finding entirely.
330    pub fn downgrade_one(self) -> Self {
331        match self {
332            Severity::Critical => Severity::High,
333            Severity::High => Severity::Medium,
334            Severity::Medium => Severity::Low,
335            Severity::Low => Severity::ClientSafe,
336            Severity::ClientSafe => Severity::Info,
337            Severity::Info => Severity::Info,
338        }
339    }
340}
341
342/// HTTP method for verification requests.
343#[derive(Debug, Clone, Serialize, Deserialize)]
344pub enum HttpMethod {
345    #[serde(rename = "GET")]
346    Get,
347    #[serde(rename = "POST")]
348    Post,
349    #[serde(rename = "PUT")]
350    Put,
351    #[serde(rename = "DELETE")]
352    Delete,
353    #[serde(rename = "PATCH")]
354    Patch,
355    #[serde(rename = "HEAD")]
356    Head,
357}
358
359/// Wrapping struct for a detector TOML file.
360#[derive(Debug, Clone, Serialize, Deserialize)]
361pub struct DetectorFile {
362    pub detector: DetectorSpec,
363}
364
365/// Errors returned while loading or validating detector specifications.
366#[derive(Debug, Error)]
367#[allow(clippy::result_large_err)] // SpecError variants include 128-byte toml::de::Error; boxing would be a breaking API change.
368pub enum SpecError {
369    #[error(
370        "failed to read detector file {path}: {source}. Fix: check the detector path exists and that the file is readable TOML"
371    )]
372    ReadFile {
373        path: String,
374        source: std::io::Error,
375    },
376    #[error("invalid TOML in detector {path}: {source}. Fix: repair the TOML syntax in the detector file")]
377    InvalidToml {
378        path: std::path::PathBuf,
379        source: toml::de::Error,
380    },
381}
keyhog_core/spec.rs

keyhog_core/
spec.rs