keyhog_core/spec.rs
1//! Detector specification: TOML-based pattern definitions with regex, keywords,
2//! verification endpoints, and companion patterns.
3
4// Debt bucket: 55 public items, each landed before the crate floor raised
5// `missing_docs` to `warn`. Each is part of the public TOML schema and would
6// benefit from a doc line; remove this allow once they all carry one.
7#![allow(missing_docs)]
8
9mod load;
10mod validate;
11
12use serde::{Deserialize, Serialize};
13use thiserror::Error;
14
15pub use load::{
16 load_detector_cache, load_detectors, load_detectors_from_str, load_detectors_with_gate,
17 save_detector_cache,
18};
19pub use validate::{validate_detector, QualityIssue};
20
/// Metadata field specification for verification results.
///
/// Each entry pairs the key a value is stored under with the path used to
/// find that value in a verification response. Unknown TOML keys are rejected
/// at deserialization time (`deny_unknown_fields`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct MetadataSpec {
    /// Field name in the finding metadata map.
    pub name: String,
    /// GJSON path to extract from the verification response body.
    pub json_path: String,
}
30
/// A complete detector definition loaded from a TOML file.
///
/// Unknown keys are rejected at deserialization time (`deny_unknown_fields`),
/// so a misspelled field name is an error rather than being silently dropped.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(deny_unknown_fields)]
pub struct DetectorSpec {
    /// Unique stable identifier (e.g. `aws-access-key`).
    pub id: String,
    /// Human-readable name.
    pub name: String,
    /// Target service (e.g. `aws`, `stripe`).
    pub service: String,
    /// Default severity for findings.
    pub severity: Severity,
    /// List of regex patterns to match.
    pub patterns: Vec<PatternSpec>,
    /// Secondary patterns required to confirm a match. Empty when omitted.
    #[serde(default)]
    pub companions: Vec<CompanionSpec>,
    /// Optional live verification configuration.
    pub verify: Option<VerifySpec>,
    /// High-performance pre-filtering keywords. Empty when omitted.
    #[serde(default)]
    pub keywords: Vec<String>,
    /// Self-declared per-detector confidence floor, in `[0.0, 1.0]`.
    ///
    /// When set, findings from THIS detector use this floor instead of the
    /// global `--min-confidence` / `[scan] min_confidence`. A detector with a
    /// distinctive vendor prefix (e.g. sourcegraph `sgp_<40hex>`, cursor
    /// `key_<64hex>`) is high-confidence by virtue of the prefix even when the
    /// body is low-entropy hex that the generic confidence model scores below
    /// the global floor; the detector author declares that here so the
    /// detector ships working out of the box. Costs nothing at scan time —
    /// it is a single O(1) map lookup at the post-scan floor gate, on an
    /// already-compiled corpus. An operator `.keyhog.toml`
    /// `[detector.<id>] min_confidence` still overrides this self-declared
    /// default. `None` (the default) means "use the global floor".
    #[serde(default)]
    pub min_confidence: Option<f64>,
    /// Inline self-test fixtures (`[[detector.tests]]`, Tier-B data): each entry
    /// carries a positive example the detector MUST fire on and/or a negative
    /// example it MUST NOT. Consumed by the contract/self-validate harness;
    /// ignored at scan time. Modeled here (rather than silently dropped) so the
    /// schema's `deny_unknown_fields` typo-guard covers the whole detector file.
    #[serde(default)]
    pub tests: Vec<DetectorTestSpec>,
}
76
/// One inline detector self-test fixture (`[[detector.tests]]`).
///
/// Both fields are optional, so a fixture may carry a positive example, a
/// negative example, or both.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
#[serde(deny_unknown_fields)]
pub struct DetectorTestSpec {
    /// Text this detector MUST fire on. `None` when omitted.
    #[serde(default)]
    pub test_positive: Option<String>,
    /// Text this detector MUST NOT fire on. `None` when omitted.
    #[serde(default)]
    pub test_negative: Option<String>,
}
88
/// A regex pattern with optional capture group and description.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
#[serde(deny_unknown_fields)]
pub struct PatternSpec {
    /// Regular expression string (Rust flavor).
    ///
    /// Case sensitivity: keyhog compiles every regex `case_insensitive(true)`,
    /// so to make a single pattern case-SENSITIVE (AWS `AKIA` is uppercase,
    /// GCP/Snowflake ids are lowercase) prefix its regex with the inline flag
    /// `(?-i)` in the TOML - no schema field needed.
    pub regex: String,
    /// Optional context description.
    pub description: Option<String>,
    /// Optional capture group index containing the secret.
    pub group: Option<usize>,
    /// When true, a match against THIS pattern downgrades the
    /// finding to `Severity::ClientSafe` (regardless of the detector's
    /// nominal severity). Used by services that intentionally ship
    /// public-facing keys in client bundles:
    ///
    /// - Sentry DSN (the `https://<key>@` URL is meant for the browser)
    /// - Stripe `pk_live_` / `pk_test_` (publishable, sk_ is secret)
    /// - Mapbox `pk.` (public, `sk.` is secret)
    /// - Firebase Web API key, Google Maps browser key
    /// - PostHog / Mixpanel / Algolia search / Datadog browser RUM
    ///
    /// Per-pattern (not per-detector) so detectors that fire on both
    /// the public *and* the secret prefix can tag only the public one.
    ///
    /// Defaults to `false` when omitted.
    #[serde(default)]
    pub client_safe: bool,
}
120
/// Secondary pattern used to confirm a primary match or provide extra context.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct CompanionSpec {
    /// Field name used in verification templates (e.g. `{{companion.secret_key}}`).
    pub name: String,
    /// Regex to find the companion value nearby.
    pub regex: String,
    /// Maximum line distance from the primary match.
    pub within_lines: usize,
    /// Whether this companion must be found to report the finding.
    /// Defaults to `false` when omitted.
    #[serde(default)]
    pub required: bool,
}
135
/// Live verification configuration for a detector.
///
/// Every field is optional in the TOML: each is either an `Option` or carries
/// `#[serde(default)]`. Unknown keys are rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct VerifySpec {
    /// Target service identifier (defaults to detector's service if omitted).
    #[serde(default)]
    pub service: String,
    /// HTTP method (default: GET).
    pub method: Option<HttpMethod>,
    /// Endpoint URL with optional `{{match}}` or `{{companion.<name>}}` placeholders.
    pub url: Option<String>,
    /// Authentication scheme.
    pub auth: Option<AuthSpec>,
    /// Custom HTTP headers.
    #[serde(default)]
    pub headers: Vec<HeaderSpec>,
    /// Optional request body template.
    pub body: Option<String>,
    /// Criteria for a successful verification.
    pub success: Option<SuccessSpec>,
    /// Metadata to extract from the response.
    #[serde(default)]
    pub metadata: Vec<MetadataSpec>,
    /// Optional request timeout override, in milliseconds.
    pub timeout_ms: Option<u64>,
    /// Multi-step verification flow.
    #[serde(default)]
    pub steps: Vec<StepSpec>,
    /// Domain allowlist for the verify URL after interpolation. If non-empty,
    /// the resolved host of the (interpolated) URL - and of every step's URL -
    /// MUST equal one of these entries (or be a subdomain of one). When empty,
    /// the verifier falls back to a hardcoded service allowlist if the
    /// `service` field maps to a known provider; otherwise the verifier
    /// REFUSES to send the request. This blocks malicious detector TOMLs
    /// that set `url = "{{match}}"` (or interpolate an attacker-controlled
    /// companion) from exfiltrating credentials. See kimi-wave1 audit
    /// finding 4.1 + wave3 §1.
    #[serde(default)]
    pub allowed_domains: Vec<String>,
    /// Optional out-of-band verification probe. When set, the verifier mints a
    /// per-finding correlation URL via the configured interactsh server,
    /// substitutes `{{interactsh}}` (and `{{interactsh.host}}` /
    /// `{{interactsh.url}}`) into the request template, and waits for the
    /// service to call back. OOB verification proves a leaked credential is
    /// **exfil-capable**, not just live: a webhook URL that returns 200 OK to
    /// every probe still has to actually fetch our collector to confirm it
    /// will deliver attacker-controlled traffic.
    ///
    /// Gated behind the runtime `--verify-oob` flag - never default. When the
    /// flag is off, `oob` is ignored and verification falls back to the
    /// HTTP success criteria alone.
    pub oob: Option<OobSpec>,
}
189
/// Out-of-band callback verification configuration.
///
/// Only `protocol` is required; `timeout_secs` and `policy` fall back to their
/// defaults when omitted.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct OobSpec {
    /// Callback protocol the verifier waits for. The service may also touch
    /// other protocols on the same correlation id; only the listed ones count
    /// toward `Verified`.
    pub protocol: OobProtocol,
    /// How long to wait for the callback after the HTTP request returns.
    /// Defaults to 30 seconds when omitted; capped at the engine's
    /// `oob_timeout_max` to bound scan time.
    #[serde(default)]
    pub timeout_secs: Option<u64>,
    /// Verification policy:
    /// - `OobAndHttp` (default): both HTTP success criteria *and* OOB
    ///   callback must hold. This is the strict mode for webhook-style
    ///   detectors where 200 OK is necessary but not sufficient.
    /// - `OobOnly`: ignore HTTP success, trust the OOB callback. For
    ///   detectors where the API has no useful HTTP response shape but
    ///   provably triggers an outbound request (e.g., one-way push tokens).
    /// - `OobOptional`: HTTP success alone verifies; OOB just enriches
    ///   metadata with `oob_observed=true|false` for the report.
    #[serde(default)]
    pub policy: OobPolicy,
}
215
/// Out-of-band callback protocol expected from a successful exfil.
///
/// Serialized in lowercase: `dns`, `http`, `smtp`, `any`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum OobProtocol {
    /// Any DNS resolution against `{{interactsh.host}}`. Cheapest signal -
    /// many services resolve a webhook URL even before fetching it.
    Dns,
    /// HTTP or HTTPS request to the interactsh URL. The strongest signal;
    /// proves the service made an outbound HTTP request with the credential.
    Http,
    /// SMTP delivery attempt to `<random>@{{interactsh.host}}`. For mail
    /// detectors (Mailgun, SendGrid, …) where exfil = sending mail.
    Smtp,
    /// Any of the above. Use sparingly - a chatty CDN doing DNS prefetch
    /// can cause false positives.
    Any,
}
233
/// How OOB observation combines with HTTP success criteria.
///
/// Serialized in snake_case: `oob_and_http`, `oob_only`, `oob_optional`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum OobPolicy {
    /// Default. Both the HTTP success criteria *and* the OOB callback must
    /// hold: the strict mode for webhook-style detectors where 200 OK is
    /// necessary but not sufficient.
    #[default]
    OobAndHttp,
    /// Ignore HTTP success and trust the OOB callback. For detectors where the
    /// API has no useful HTTP response shape but provably triggers an outbound
    /// request (e.g., one-way push tokens).
    OobOnly,
    /// HTTP success alone verifies; OOB just enriches metadata with
    /// `oob_observed=true|false` for the report.
    OobOptional,
}
243
/// A single step in a multi-step verification flow.
///
/// `name`, `method`, `url`, `auth` and `success` are required here: none of
/// them is an `Option` or carries `#[serde(default)]`. Unknown keys are
/// rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct StepSpec {
    /// Name of this step.
    // NOTE(review): presumably used to label the step or to reference its
    // extracted values from later steps - confirm against the verifier.
    pub name: String,
    /// HTTP method for this step's request.
    pub method: HttpMethod,
    /// Endpoint URL for this step. Per `VerifySpec::allowed_domains`, the host
    /// of every step URL is checked against the domain allowlist.
    pub url: String,
    /// Authentication scheme for this step's request.
    pub auth: AuthSpec,
    /// Custom HTTP headers for this step. Empty when omitted.
    #[serde(default)]
    pub headers: Vec<HeaderSpec>,
    /// Optional request body for this step.
    pub body: Option<String>,
    /// Criteria this step's response is checked against.
    pub success: SuccessSpec,
    /// Name / GJSON-path pairs to extract for this step. Empty when omitted.
    // NOTE(review): presumably read from this step's response and made
    // available to later steps - confirm against the verifier.
    #[serde(default)]
    pub extract: Vec<MetadataSpec>,
}
259
/// Custom HTTP header specification.
///
/// Both fields are required; unknown keys are rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct HeaderSpec {
    /// Header name.
    pub name: String,
    /// Header value.
    // NOTE(review): presumably accepts the same `{{match}}` /
    // `{{companion.<name>}}` placeholders as verify URLs - confirm.
    pub value: String,
}
267
/// Authentication scheme for verification requests.
///
/// Internally tagged on `type` with snake_case variant names, so a TOML table
/// picks a variant with e.g. `type = "bearer"` followed by that variant's
/// fields.
// NOTE(review): unlike the surrounding spec structs this enum does not set
// `deny_unknown_fields` - confirm whether that is intentional.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum AuthSpec {
    /// `type = "none"`: carries no authentication fields.
    None,
    /// `type = "bearer"`.
    // NOTE(review): presumably sent as an `Authorization: Bearer` header - confirm.
    Bearer {
        /// Source of the token value.
        // NOTE(review): presumably a field name or `{{...}}` template - confirm.
        field: String,
    },
    /// `type = "basic"`: username / password pair.
    Basic {
        /// Username component.
        username: String,
        /// Password component.
        password: String,
    },
    /// `type = "header"`: credential carried in a named header.
    Header {
        /// Header name.
        name: String,
        /// Template for the header value.
        template: String,
    },
    /// `type = "query"`: credential carried in a query parameter.
    Query {
        /// Query parameter name.
        param: String,
        /// Source of the parameter value.
        // NOTE(review): presumably a field name or `{{...}}` template - confirm.
        field: String,
    },
    /// `type = "aws_v4"`: AWS Signature Version 4 inputs.
    #[serde(rename = "aws_v4")]
    AwsV4 {
        /// Access key component.
        access_key: String,
        /// Secret key component.
        secret_key: String,
        /// AWS region.
        region: String,
        /// AWS service name; this is the value `AuthSpec::service_name` returns.
        service: String,
        /// Optional session token.
        session_token: Option<String>,
    },
    /// `type = "script"`: script-defined authentication.
    // NOTE(review): how `engine` / `code` are evaluated is not visible here - confirm.
    Script {
        /// Script engine identifier.
        engine: String,
        /// Script source.
        code: String,
    },
}
301
302impl AuthSpec {
303 pub fn service_name(&self) -> Option<&str> {
304 match self {
305 AuthSpec::AwsV4 { service, .. } => Some(service),
306 _ => None,
307 }
308 }
309}
310
/// Criteria for a successful verification response.
///
/// Every criterion is optional and is `None` when omitted; unknown keys are
/// rejected (`deny_unknown_fields`).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
#[serde(deny_unknown_fields)]
pub struct SuccessSpec {
    /// Required HTTP status code.
    #[serde(default)]
    pub status: Option<u16>,
    /// Reject if this status code is returned.
    #[serde(default)]
    pub status_not: Option<u16>,
    /// Response body must contain this substring.
    #[serde(default)]
    pub body_contains: Option<String>,
    /// Response body must NOT contain this substring.
    #[serde(default)]
    pub body_not_contains: Option<String>,
    /// GJSON path to check in response body.
    #[serde(default)]
    pub json_path: Option<String>,
    /// Expected value at `json_path`.
    #[serde(default)]
    pub equals: Option<String>,
}
334
/// Severity level for a finding.
///
/// `ClientSafe` is the bug-bounty tier for keys that are public by
/// design and shipped in client bundles: Sentry DSNs, Stripe `pk_*`
/// publishable keys, Mapbox `pk.` public tokens, PostHog project keys,
/// Firebase Web API keys, Google Maps browser keys, Algolia search
/// keys, Datadog browser RUM tokens, Mixpanel project tokens. The
/// detector still fires (a token grep is a token grep) but the
/// finding is rendered below `Low` and gated by `--hide-client-safe`
/// so a hunter running `keyhog scan --hide-client-safe target/` only
/// sees credentials that an attacker could actually exfiltrate
/// server-side.
///
/// Variants are declared in ascending order, so the derived `Ord` gives
/// `Info < ClientSafe < Low < Medium < High < Critical`. The wire form is
/// kebab-case (`info`, `client-safe`, `low`, ...).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum Severity {
    /// Lowest tier; also the `Default` value.
    #[default]
    Info,
    /// Public-by-design client key. Written as `client-safe`; `client_safe`
    /// is also accepted when deserializing.
    #[serde(alias = "client_safe")]
    ClientSafe,
    /// Low severity.
    Low,
    /// Medium severity.
    Medium,
    /// High severity.
    High,
    /// Highest tier.
    Critical,
}
359
360impl Severity {
361 pub fn to_severity(&self) -> Self {
362 *self
363 }
364
365 /// Step the severity down one tier (Critical → High, High → Medium, …).
366 /// `Info` stays at `Info` (no lower bucket).
367 ///
368 /// Used by diff-aware scoring: a credential that only appears in non-HEAD
369 /// git history is still a leak (commit history is public if the repo is)
370 /// but is meaningfully less urgent than a credential live in HEAD that an
371 /// attacker can grep right now. One tier of downgrade communicates that
372 /// without hiding the finding entirely.
373 pub fn downgrade_one(self) -> Self {
374 match self {
375 Severity::Critical => Severity::High,
376 Severity::High => Severity::Medium,
377 Severity::Medium => Severity::Low,
378 Severity::Low => Severity::ClientSafe,
379 Severity::ClientSafe => Severity::Info,
380 Severity::Info => Severity::Info,
381 }
382 }
383
384 /// Canonical lowercase string for this severity, matching the serde
385 /// `kebab-case` wire form (`client-safe`, not `clientsafe`). This is the
386 /// single source of truth for rendering a severity as text; reporters and
387 /// any other surface should go through `Display`/`as_str` rather than
388 /// reaching for `format!("{:?}")`, which diverges for `ClientSafe`.
389 pub fn as_str(&self) -> &'static str {
390 match self {
391 Severity::Info => "info",
392 Severity::ClientSafe => "client-safe",
393 Severity::Low => "low",
394 Severity::Medium => "medium",
395 Severity::High => "high",
396 Severity::Critical => "critical",
397 }
398 }
399}
400
401impl std::fmt::Display for Severity {
402 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
403 f.write_str(self.as_str())
404 }
405}
406
/// HTTP method for verification requests.
///
/// Each variant is renamed to its uppercase method name on the wire, so the
/// TOML value is `"GET"`, `"POST"`, and so on.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum HttpMethod {
    /// `GET`.
    #[serde(rename = "GET")]
    Get,
    /// `POST`.
    #[serde(rename = "POST")]
    Post,
    /// `PUT`.
    #[serde(rename = "PUT")]
    Put,
    /// `DELETE`.
    #[serde(rename = "DELETE")]
    Delete,
    /// `PATCH`.
    #[serde(rename = "PATCH")]
    Patch,
    /// `HEAD`.
    #[serde(rename = "HEAD")]
    Head,
}
423
/// Wrapping struct for a detector TOML file.
///
/// The file's top-level `detector` table deserializes into the `detector`
/// field.
// NOTE(review): this wrapper does not set `deny_unknown_fields`, so stray
// top-level keys beside `detector` are presumably ignored - confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetectorFile {
    /// The detector definition held in the `detector` table.
    pub detector: DetectorSpec,
}
429
/// Errors returned while loading or validating detector specifications.
///
/// Each variant's message names the offending path, includes the underlying
/// error, and ends with a `Fix:` hint.
#[derive(Debug, Error)]
#[allow(clippy::result_large_err)] // SpecError variants include 128-byte toml::de::Error; boxing would be a breaking API change.
pub enum SpecError {
    /// A detector file could not be read.
    #[error(
        "failed to read detector file {path}: {source}. Fix: check the detector path exists and that the file is readable TOML"
    )]
    ReadFile {
        /// Path of the file that failed to read, stored as a `String`.
        path: String,
        /// Underlying I/O error.
        source: std::io::Error,
    },
    /// A detector file was read but is not valid TOML for the schema.
    #[error("invalid TOML in detector {path}: {source}. Fix: repair the TOML syntax in the detector file")]
    InvalidToml {
        /// Path of the detector file that failed to parse.
        path: std::path::PathBuf,
        /// Underlying TOML deserialization error.
        source: toml::de::Error,
    },
}
446}