Skip to main content

keyhog_core/
finding.rs

1//! Scanner findings: the output type for detected secrets with location,
2//! confidence, detector metadata, and optional verification status.
3
4use serde::{Deserialize, Serialize};
5use std::borrow::Cow;
6use std::collections::HashMap;
7use std::sync::Arc;
8
9use crate::Severity;
10
11/// A raw pattern match before verification or deduplication.
12///
13/// `entropy` and `confidence` are stored as `f64` but are guaranteed never to
14/// be `NaN` (sanitized at construction time). This keeps the manual `Eq` impl
15/// reflexive, which downstream code relies on for `HashMap`/`BTreeMap` keys.
16///
17/// Manual `Debug` impl redacts the `credential` field — the previous
18/// derive-`Debug` was a CRITICAL leak vector (any `{:?}` print, panic
19/// handler, or `tracing::error!(?match)` would expose plaintext). See
20/// audit kimi-wave1 finding 1.1.
21#[derive(Clone, Serialize, Deserialize)]
22pub struct RawMatch {
23    /// Stable detector identifier.
24    #[serde(with = "serde_arc_str")]
25    pub detector_id: Arc<str>,
26    /// Human-readable detector name.
27    #[serde(with = "serde_arc_str")]
28    pub detector_name: Arc<str>,
29    /// Service namespace associated with the detector.
30    #[serde(with = "serde_arc_str")]
31    pub service: Arc<str>,
32    /// Detector severity level.
33    pub severity: Severity,
34    /// Matched credential bytes before redaction.
35    #[serde(with = "serde_arc_str")]
36    pub credential: Arc<str>,
37    /// SHA-256 hash of the credential for allowlisting and deduplication.
38    pub credential_hash: String,
39    /// Companion credential or context value extracted nearby.
40    pub companions: std::collections::HashMap<String, String>,
41    /// Source location for the match.
42    pub location: MatchLocation,
43    /// Shannon entropy of the matched credential (0.0 - 8.0). NaN-sanitized.
44    #[serde(skip_serializing_if = "Option::is_none")]
45    pub entropy: Option<f64>,
46    /// Confidence score (0.0 - 1.0). NaN-sanitized at construction.
47    #[serde(skip_serializing_if = "Option::is_none")]
48    pub confidence: Option<f64>,
49}
50
51impl RawMatch {
52    /// Replace NaN floats with `None` so the manual `Eq` impl stays reflexive
53    /// and `HashMap`/`BTreeMap` lookups don't trap. Call this on any externally
54    /// constructed `RawMatch` (deserialized findings, scanner outputs).
55    pub fn sanitize_floats(mut self) -> Self {
56        if self.entropy.is_some_and(f64::is_nan) {
57            self.entropy = None;
58        }
59        if self.confidence.is_some_and(f64::is_nan) {
60            self.confidence = None;
61        }
62        self
63    }
64}
65
66impl PartialEq for RawMatch {
67    fn eq(&self, other: &Self) -> bool {
68        // Compare every field; for the f64 options use `total_cmp` semantics so
69        // NaN-vs-NaN compares equal. We additionally normalize NaN→None on
70        // construction (`sanitize_floats`), but the total-ordering comparison
71        // here keeps the impl sound even if a NaN slips through.
72        self.detector_id == other.detector_id
73            && self.detector_name == other.detector_name
74            && self.service == other.service
75            && self.severity == other.severity
76            && self.credential == other.credential
77            && self.credential_hash == other.credential_hash
78            && self.companions == other.companions
79            && self.location == other.location
80            && opt_f64_total_eq(self.entropy, other.entropy)
81            && opt_f64_total_eq(self.confidence, other.confidence)
82    }
83}
84
85impl Eq for RawMatch {}
86
87impl std::fmt::Debug for RawMatch {
88    /// Redacted Debug. Replaces `derive(Debug)` which would print the raw
89    /// `credential: Arc<str>` plaintext. See kimi-wave1 audit finding 1.1.
90    /// `credential_hash` is preserved because it's already a one-way SHA-256.
91    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
92        f.debug_struct("RawMatch")
93            .field("detector_id", &self.detector_id)
94            .field("detector_name", &self.detector_name)
95            .field("service", &self.service)
96            .field("severity", &self.severity)
97            .field(
98                "credential",
99                &format_args!("<redacted {} bytes>", self.credential.len()),
100            )
101            .field("credential_hash", &self.credential_hash)
102            .field(
103                "companions",
104                &format_args!("<{} redacted companions>", self.companions.len()),
105            )
106            .field("location", &self.location)
107            .field("entropy", &self.entropy)
108            .field("confidence", &self.confidence)
109            .finish()
110    }
111}
112
/// Equality for optional floats under the `f64::total_cmp` ordering.
///
/// Two `None`s are equal, a `None` never equals a `Some`, and two `Some`
/// values are equal exactly when `total_cmp` reports `Equal`. Unlike `==`,
/// that makes a `NaN` equal to an identical `NaN` and `0.0` differ from
/// `-0.0`.
#[inline]
fn opt_f64_total_eq(a: Option<f64>, b: Option<f64>) -> bool {
    match (a, b) {
        (Some(lhs), Some(rhs)) => lhs.total_cmp(&rhs).is_eq(),
        (None, None) => true,
        (Some(_), None) | (None, Some(_)) => false,
    }
}
121
122impl PartialOrd for RawMatch {
123    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
124        Some(self.cmp(other))
125    }
126}
127
128impl Ord for RawMatch {
129    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
130        // Higher confidence first
131        let self_conf = self.confidence.unwrap_or(0.0);
132        let other_conf = other.confidence.unwrap_or(0.0);
133
134        match other_conf.total_cmp(&self_conf) {
135            std::cmp::Ordering::Equal => {}
136            ord => return ord,
137        }
138
139        // Then higher severity first (Critical > High > Medium > Low > Info)
140        match other.severity.cmp(&self.severity) {
141            std::cmp::Ordering::Equal => {}
142            ord => return ord,
143        }
144
145        // Finally, deterministic sort by detector and credential
146        match self.detector_id.cmp(&other.detector_id) {
147            std::cmp::Ordering::Equal => self.credential.cmp(&other.credential),
148            ord => ord,
149        }
150    }
151}
152
153/// Where a credential was found: file path, line number, commit, and author.
154#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
155pub struct MatchLocation {
156    /// Logical source backend, such as `filesystem` or `git`.
157    #[serde(with = "serde_arc_str")]
158    pub source: Arc<str>,
159    /// File path, object key, or logical path when available.
160    ///
161    /// Paths stored here must be valid UTF-8. Source implementations that see
162    /// non-UTF-8 paths should encode them into a reversible escaped string
163    /// before constructing a [`MatchLocation`].
164    #[serde(with = "serde_arc_str_opt")]
165    pub file_path: Option<Arc<str>>,
166    /// One-based line number when known.
167    pub line: Option<usize>,
168    /// Byte offset from the start of the source chunk.
169    pub offset: usize,
170    /// Commit identifier for history-derived matches.
171    #[serde(with = "serde_arc_str_opt")]
172    pub commit: Option<Arc<str>>,
173    /// Commit author when available.
174    #[serde(with = "serde_arc_str_opt")]
175    pub author: Option<Arc<str>>,
176    /// Commit timestamp when available.
177    #[serde(with = "serde_arc_str_opt")]
178    pub date: Option<Arc<str>>,
179}
180
181/// A finding after verification — the final output.
182#[derive(Debug, Clone, Serialize, Deserialize)]
183pub struct VerifiedFinding {
184    /// Stable detector identifier.
185    #[serde(with = "serde_arc_str")]
186    pub detector_id: Arc<str>,
187    /// Human-readable detector name.
188    #[serde(with = "serde_arc_str")]
189    pub detector_name: Arc<str>,
190    /// Service namespace associated with the detector.
191    #[serde(with = "serde_arc_str")]
192    pub service: Arc<str>,
193    /// Detector severity level.
194    pub severity: Severity,
195    /// Redacted version of the credential for reporting.
196    pub credential_redacted: Cow<'static, str>,
197    /// SHA-256 hash of the original credential for internal correlation.
198    pub credential_hash: String,
199    /// Source location for the match.
200    pub location: MatchLocation,
201    /// Verification result.
202    pub verification: VerificationResult,
203    /// Additional provider-specific metadata (e.g. account ID, scope).
204    pub metadata: HashMap<String, String>,
205    /// Additional duplicate locations found for this credential.
206    pub additional_locations: Vec<MatchLocation>,
207    /// Confidence score (0.0 - 1.0) combining entropy, keyword proximity, file type, etc.
208    #[serde(skip_serializing_if = "Option::is_none")]
209    pub confidence: Option<f64>,
210}
211
212/// Result of live verification: whether the credential is active, revoked, or untested.
213#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
214#[serde(rename_all = "snake_case")]
215pub enum VerificationResult {
216    /// Credential is active and verified by the provider.
217    Live,
218    /// Credential is valid but has been explicitly revoked or disabled.
219    Revoked,
220    /// Credential was rejected by the provider (invalid password/token).
221    Dead,
222    /// Provider returned a rate-limit error (e.g. 429).
223    RateLimited,
224    /// Verification failed due to network error or timeout.
225    Error(String),
226    /// Detector does not support live verification.
227    Unverifiable,
228    /// Verification was not attempted (e.g. disabled via flag).
229    Skipped,
230}
231
232impl RawMatch {
233    /// Get unique key for deduplication.
234    pub fn deduplication_key(&self) -> (&str, &str) {
235        (&self.detector_id, &self.credential)
236    }
237
238    /// Convert into a serialization-safe DTO that never carries the plaintext
239    /// credential. Use this anywhere a `RawMatch` would otherwise be written
240    /// to disk, sent over the network, or rendered into a user-visible
241    /// report. See kimi-wave1 audit finding 2.1 (`scan_system.rs` JSON exfil).
242    pub fn to_redacted(&self) -> RedactedFinding {
243        RedactedFinding {
244            detector_id: self.detector_id.clone(),
245            detector_name: self.detector_name.clone(),
246            service: self.service.clone(),
247            severity: self.severity,
248            credential_redacted: crate::redact(&self.credential),
249            credential_hash: self.credential_hash.clone(),
250            companions_redacted: self
251                .companions
252                .iter()
253                .map(|(k, v)| (k.clone(), crate::redact(v).into_owned()))
254                .collect(),
255            location: self.location.clone(),
256            entropy: self.entropy,
257            confidence: self.confidence,
258        }
259    }
260}
261
262/// Redacted, disk-safe view of a `RawMatch`. Carries only the SHA-256 hash
263/// and a "first4...last4" preview, never the plaintext credential. This is
264/// the only finding shape that should ever leave keyhog's process boundary.
265#[derive(Debug, Clone, Serialize, Deserialize)]
266pub struct RedactedFinding {
267    #[serde(with = "serde_arc_str")]
268    pub detector_id: Arc<str>,
269    #[serde(with = "serde_arc_str")]
270    pub detector_name: Arc<str>,
271    #[serde(with = "serde_arc_str")]
272    pub service: Arc<str>,
273    pub severity: Severity,
274    pub credential_redacted: Cow<'static, str>,
275    pub credential_hash: String,
276    pub companions_redacted: HashMap<String, String>,
277    pub location: MatchLocation,
278    #[serde(skip_serializing_if = "Option::is_none")]
279    pub entropy: Option<f64>,
280    #[serde(skip_serializing_if = "Option::is_none")]
281    pub confidence: Option<f64>,
282}
283
284pub mod serde_arc_str {
285    use serde::{Deserialize, Deserializer, Serialize, Serializer};
286    use std::sync::Arc;
287
288    pub fn serialize<S>(val: &Arc<str>, serializer: S) -> Result<S::Ok, S::Error>
289    where
290        S: Serializer,
291    {
292        val.as_ref().serialize(serializer)
293    }
294
295    pub fn deserialize<'de, D>(deserializer: D) -> Result<Arc<str>, D::Error>
296    where
297        D: Deserializer<'de>,
298    {
299        String::deserialize(deserializer).map(Arc::from)
300    }
301}
302
303pub mod serde_arc_str_opt {
304    use serde::{Deserialize, Deserializer, Serialize, Serializer};
305    use std::sync::Arc;
306
307    pub fn serialize<S>(val: &Option<Arc<str>>, serializer: S) -> Result<S::Ok, S::Error>
308    where
309        S: Serializer,
310    {
311        val.as_ref().map(|s| s.as_ref()).serialize(serializer)
312    }
313
314    pub fn deserialize<'de, D>(deserializer: D) -> Result<Option<Arc<str>>, D::Error>
315    where
316        D: Deserializer<'de>,
317    {
318        Option::<String>::deserialize(deserializer).map(|opt| opt.map(Arc::from))
319    }
320}
321
#[cfg(test)]
mod hostile_metadata_tests {
    //! "Production-level robustness" coverage: a finding whose metadata
    //! contains NUL bytes, control characters, or other unusual content
    //! must not panic on JSON serialization, Display formatting, or any
    //! standard operation. These fields come from filesystem walks
    //! (PathBuf::display) and external source backends, so "hostile
    //! content" is realistic — operators scanning adversarial
    //! repositories or untrusted HTTP responses see it every day.

    use super::*;
    use std::collections::HashMap;
    use std::sync::Arc;

    /// Builds a minimal finding whose only interesting field is `path`.
    fn finding_with_hostile_path(path: &str) -> VerifiedFinding {
        let location = MatchLocation {
            source: Arc::from("filesystem"),
            file_path: Some(Arc::from(path)),
            line: Some(1),
            offset: 0,
            commit: None,
            author: None,
            date: None,
        };

        VerifiedFinding {
            detector_id: Arc::from("test-detector"),
            detector_name: Arc::from("Test Detector"),
            service: Arc::from("test"),
            severity: Severity::Medium,
            credential_redacted: Cow::Borrowed("****"),
            credential_hash: "deadbeef".into(),
            location,
            verification: VerificationResult::Skipped,
            metadata: HashMap::new(),
            additional_locations: Vec::new(),
            confidence: Some(0.5),
        }
    }

    /// Serializes a finding for `path` to JSON and parses that JSON back,
    /// returning both the text and the parsed value.
    fn json_round_trip(path: &str) -> (String, serde_json::Value) {
        let finding = finding_with_hostile_path(path);
        let text = serde_json::to_string(&finding).expect("serialize ok");
        let parsed: serde_json::Value = serde_json::from_str(&text).expect("parse ok");
        (text, parsed)
    }

    #[test]
    fn nul_bytes_in_path_serialize_to_valid_json() {
        // A path containing a NUL byte (e.g. crafted by a Source emitting
        // through the registry) must round-trip via JSON without panic and
        // without producing malformed output. serde_json escapes NUL as
        // `\u0000`.
        let (json, parsed) = json_round_trip("evil\0name.env");
        assert!(json.contains("\\u0000"), "NUL must be escaped in JSON");
        // And it must parse back cleanly.
        assert!(parsed.is_object());
    }

    #[test]
    fn control_chars_in_path_serialize_safely() {
        // Embedded \r, \n, \t, ESC, etc. are common in attacker-controlled
        // filenames. JSON must escape them rather than emit raw control
        // bytes (which would corrupt log scrapers / SARIF readers).
        let (json, _parsed) = json_round_trip("path\r\nwith\x1b[31mANSI\x1bcontrol\tchars");
        assert!(json.contains("\\r"));
        assert!(json.contains("\\n"));
        assert!(json.contains("\\t"));
        // ESC (0x1b) escapes to `\u001b` in JSON.
        assert!(json.contains("\\u001b"));
    }

    #[test]
    fn replacement_char_in_path_round_trips() {
        // Lossy UTF-8 paths from `Path::display()` contain U+FFFD for
        // invalid byte sequences. They must serialize and deserialize
        // cleanly — Rust strings are valid UTF-8 by construction, so U+FFFD
        // is a normal char.
        let (_json, parsed) = json_round_trip("name_\u{FFFD}_after");
        let recovered = parsed["location"]["file_path"].as_str().unwrap();
        assert!(recovered.contains('\u{FFFD}'));
    }

    #[test]
    fn extremely_long_path_does_not_panic() {
        // A 1 MiB path string. No panic, no truncation, no allocation
        // failure on a typical machine. Checks that no formatter has a
        // hidden length limit that would error.
        let long = "a".repeat(1024 * 1024);
        let (json, parsed) = json_round_trip(&long);
        assert!(json.len() > 1024 * 1024);
        assert_eq!(
            parsed["location"]["file_path"].as_str().unwrap().len(),
            long.len()
        );
    }
}
420}