Skip to main content

keyhog_core/
finding.rs

1//! Scanner findings: the output type for detected secrets with location,
2//! confidence, detector metadata, and optional verification status.
3
4// Debt bucket: 16 public items predating the crate floor raising `missing_docs`
5// to `warn`. Public output schema; remove once each carries a doc line.
6#![allow(missing_docs)]
7
8use serde::{Deserialize, Serialize};
9use std::borrow::Cow;
10use std::collections::HashMap;
11use std::sync::Arc;
12
13use crate::Severity;
14
15/// A raw pattern match before verification or deduplication.
16///
17/// `entropy` and `confidence` are stored as `f64` but are guaranteed never to
18/// be `NaN` (sanitized at construction time). This keeps the manual `Eq` impl
19/// reflexive, which downstream code relies on for `HashMap`/`BTreeMap` keys.
20///
21/// Manual `Debug` impl redacts the `credential` field - the previous
22/// derive-`Debug` was a CRITICAL leak vector (any `{:?}` print, panic
23/// handler, or `tracing::error!(?match)` would expose plaintext). See
24/// audit kimi-wave1 finding 1.1.
25#[derive(Clone, Serialize, Deserialize)]
26pub struct RawMatch {
27    /// Stable detector identifier.
28    #[serde(with = "serde_arc_str")]
29    pub detector_id: Arc<str>,
30    /// Human-readable detector name.
31    #[serde(with = "serde_arc_str")]
32    pub detector_name: Arc<str>,
33    /// Service namespace associated with the detector.
34    #[serde(with = "serde_arc_str")]
35    pub service: Arc<str>,
36    /// Detector severity level.
37    pub severity: Severity,
38    /// Matched credential bytes before redaction.
39    #[serde(with = "serde_arc_str")]
40    pub credential: Arc<str>,
41    /// SHA-256 digest of the credential for allowlisting and deduplication.
42    ///
43    /// Stored as the raw 32 inline bytes (matching the verifier `CacheKey`),
44    /// never the 64-char hex `String`: zero heap, half the per-finding
45    /// footprint, no per-match allocation on the pre-dedup hot path. Hex
46    /// encoding happens lazily at the serde/reporter boundary only.
47    #[serde(with = "serde_hash_hex")]
48    pub credential_hash: [u8; 32],
49    /// Companion credential or context value extracted nearby.
50    pub companions: std::collections::HashMap<String, String>,
51    /// Source location for the match.
52    pub location: MatchLocation,
53    /// Shannon entropy of the matched credential (0.0 - 8.0). NaN-sanitized.
54    #[serde(skip_serializing_if = "Option::is_none")]
55    pub entropy: Option<f64>,
56    /// Confidence score (0.0 - 1.0). NaN-sanitized at construction.
57    #[serde(skip_serializing_if = "Option::is_none")]
58    pub confidence: Option<f64>,
59}
60
61impl RawMatch {
62    /// Replace NaN floats with `None` so the manual `Eq` impl stays reflexive
63    /// and `HashMap`/`BTreeMap` lookups don't trap. Call this on any externally
64    /// constructed `RawMatch` (deserialized findings, scanner outputs).
65    pub fn sanitize_floats(mut self) -> Self {
66        if self.entropy.is_some_and(f64::is_nan) {
67            self.entropy = None;
68        }
69        if self.confidence.is_some_and(f64::is_nan) {
70            self.confidence = None;
71        }
72        self
73    }
74}
75
76impl PartialEq for RawMatch {
77    fn eq(&self, other: &Self) -> bool {
78        // Compare every field; for the f64 options use `total_cmp` semantics so
79        // NaN-vs-NaN compares equal. We additionally normalize NaN→None on
80        // construction (`sanitize_floats`), but the total-ordering comparison
81        // here keeps the impl sound even if a NaN slips through.
82        self.detector_id == other.detector_id
83            && self.detector_name == other.detector_name
84            && self.service == other.service
85            && self.severity == other.severity
86            && self.credential == other.credential
87            && self.credential_hash == other.credential_hash
88            && self.companions == other.companions
89            && self.location == other.location
90            && opt_f64_total_eq(self.entropy, other.entropy)
91            && opt_f64_total_eq(self.confidence, other.confidence)
92    }
93}
94
95impl Eq for RawMatch {}
96
97impl std::fmt::Debug for RawMatch {
98    /// Redacted Debug. Replaces `derive(Debug)` which would print the raw
99    /// `credential: Arc<str>` plaintext. See kimi-wave1 audit finding 1.1.
100    /// `credential_hash` is preserved because it's already a one-way SHA-256.
101    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
102        f.debug_struct("RawMatch")
103            .field("detector_id", &self.detector_id)
104            .field("detector_name", &self.detector_name)
105            .field("service", &self.service)
106            .field("severity", &self.severity)
107            .field(
108                "credential",
109                &format_args!("<redacted {} bytes>", self.credential.len()),
110            )
111            .field(
112                "credential_hash",
113                &format_args!("{}", hex_encode(&self.credential_hash)),
114            )
115            .field(
116                "companions",
117                &format_args!("<{} redacted companions>", self.companions.len()),
118            )
119            .field("location", &self.location)
120            .field("entropy", &self.entropy)
121            .field("confidence", &self.confidence)
122            .finish()
123    }
124}
125
/// Equality for optional floats using `f64::total_cmp`.
///
/// Two `None`s are equal; two `Some`s are equal exactly when `total_cmp`
/// reports `Equal` (so a `NaN` equals itself, while `0.0` and `-0.0` differ);
/// a `Some` never equals a `None`.
#[inline]
fn opt_f64_total_eq(a: Option<f64>, b: Option<f64>) -> bool {
    use std::cmp::Ordering;

    if let (Some(lhs), Some(rhs)) = (a, b) {
        return matches!(lhs.total_cmp(&rhs), Ordering::Equal);
    }
    // At least one side is `None`; equal only when both are.
    a.is_none() && b.is_none()
}
134
135impl PartialOrd for RawMatch {
136    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
137        Some(self.cmp(other))
138    }
139}
140
141impl Ord for RawMatch {
142    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
143        // Higher confidence first
144        let self_conf = self.confidence.unwrap_or(0.0);
145        let other_conf = other.confidence.unwrap_or(0.0);
146
147        match other_conf.total_cmp(&self_conf) {
148            std::cmp::Ordering::Equal => {}
149            ord => return ord,
150        }
151
152        // Then higher severity first (Critical > High > Medium > Low > Info)
153        match other.severity.cmp(&self.severity) {
154            std::cmp::Ordering::Equal => {}
155            ord => return ord,
156        }
157
158        // Finally, deterministic sort by detector and credential
159        match self.detector_id.cmp(&other.detector_id) {
160            std::cmp::Ordering::Equal => self.credential.cmp(&other.credential),
161            ord => ord,
162        }
163    }
164}
165
166/// Where a credential was found: file path, line number, commit, and author.
167#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
168pub struct MatchLocation {
169    /// Logical source backend, such as `filesystem` or `git`.
170    #[serde(with = "serde_arc_str")]
171    pub source: Arc<str>,
172    /// File path, object key, or logical path when available.
173    ///
174    /// Paths stored here must be valid UTF-8. Source implementations that see
175    /// non-UTF-8 paths should encode them into a reversible escaped string
176    /// before constructing a [`MatchLocation`].
177    #[serde(with = "serde_arc_str_opt")]
178    pub file_path: Option<Arc<str>>,
179    /// One-based line number when known.
180    pub line: Option<usize>,
181    /// Byte offset from the start of the source chunk.
182    pub offset: usize,
183    /// Commit identifier for history-derived matches.
184    #[serde(with = "serde_arc_str_opt")]
185    pub commit: Option<Arc<str>>,
186    /// Commit author when available.
187    #[serde(with = "serde_arc_str_opt")]
188    pub author: Option<Arc<str>>,
189    /// Commit timestamp when available.
190    #[serde(with = "serde_arc_str_opt")]
191    pub date: Option<Arc<str>>,
192}
193
194/// A finding after verification - the final output.
195#[derive(Debug, Clone, Serialize, Deserialize)]
196pub struct VerifiedFinding {
197    /// Stable detector identifier.
198    #[serde(with = "serde_arc_str")]
199    pub detector_id: Arc<str>,
200    /// Human-readable detector name.
201    #[serde(with = "serde_arc_str")]
202    pub detector_name: Arc<str>,
203    /// Service namespace associated with the detector.
204    #[serde(with = "serde_arc_str")]
205    pub service: Arc<str>,
206    /// Detector severity level.
207    pub severity: Severity,
208    /// Redacted version of the credential for reporting.
209    pub credential_redacted: Cow<'static, str>,
210    /// SHA-256 digest of the original credential for internal correlation.
211    /// Raw 32 inline bytes; hex-encoded lazily at the serde/reporter boundary.
212    #[serde(with = "serde_hash_hex")]
213    pub credential_hash: [u8; 32],
214    /// Source location for the match.
215    pub location: MatchLocation,
216    /// Verification result.
217    pub verification: VerificationResult,
218    /// Additional provider-specific metadata (e.g. account ID, scope).
219    pub metadata: HashMap<String, String>,
220    /// Additional duplicate locations found for this credential.
221    pub additional_locations: Vec<MatchLocation>,
222    /// Confidence score (0.0 - 1.0) combining entropy, keyword proximity, file type, etc.
223    #[serde(skip_serializing_if = "Option::is_none")]
224    pub confidence: Option<f64>,
225}
226
227/// Result of live verification: whether the credential is active, revoked, or untested.
228#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
229#[serde(rename_all = "snake_case")]
230pub enum VerificationResult {
231    /// Credential is active and verified by the provider.
232    Live,
233    /// Credential is valid but has been explicitly revoked or disabled.
234    Revoked,
235    /// Credential was rejected by the provider (invalid password/token).
236    Dead,
237    /// Provider returned a rate-limit error (e.g. 429).
238    RateLimited,
239    /// Verification failed due to network error or timeout.
240    Error(String),
241    /// Detector does not support live verification.
242    Unverifiable,
243    /// Verification was not attempted (e.g. disabled via flag).
244    Skipped,
245}
246
247impl RawMatch {
248    /// Get unique key for deduplication.
249    pub fn deduplication_key(&self) -> (&str, &str) {
250        (&self.detector_id, &self.credential)
251    }
252
253    /// Convert into a serialization-safe DTO that never carries the plaintext
254    /// credential. Use this anywhere a `RawMatch` would otherwise be written
255    /// to disk, sent over the network, or rendered into a user-visible
256    /// report. See kimi-wave1 audit finding 2.1 (`scan_system.rs` JSON exfil).
257    pub fn to_redacted(&self) -> RedactedFinding {
258        RedactedFinding {
259            detector_id: self.detector_id.clone(),
260            detector_name: self.detector_name.clone(),
261            service: self.service.clone(),
262            severity: self.severity,
263            credential_redacted: crate::redact(&self.credential),
264            credential_hash: self.credential_hash,
265            companions_redacted: self
266                .companions
267                .iter()
268                .map(|(k, v)| (k.clone(), crate::redact(v).into_owned()))
269                .collect(),
270            location: self.location.clone(),
271            entropy: self.entropy,
272            confidence: self.confidence,
273        }
274    }
275}
276
277/// Redacted, disk-safe view of a `RawMatch`. Carries only the SHA-256 hash
278/// and a "first4...last4" preview, never the plaintext credential. This is
279/// the only finding shape that should ever leave keyhog's process boundary.
280#[derive(Debug, Clone, Serialize, Deserialize)]
281pub struct RedactedFinding {
282    #[serde(with = "serde_arc_str")]
283    pub detector_id: Arc<str>,
284    #[serde(with = "serde_arc_str")]
285    pub detector_name: Arc<str>,
286    #[serde(with = "serde_arc_str")]
287    pub service: Arc<str>,
288    pub severity: Severity,
289    pub credential_redacted: Cow<'static, str>,
290    /// SHA-256 digest as raw 32 inline bytes; hex-encoded at the serde boundary.
291    #[serde(with = "serde_hash_hex")]
292    pub credential_hash: [u8; 32],
293    pub companions_redacted: HashMap<String, String>,
294    pub location: MatchLocation,
295    #[serde(skip_serializing_if = "Option::is_none")]
296    pub entropy: Option<f64>,
297    #[serde(skip_serializing_if = "Option::is_none")]
298    pub confidence: Option<f64>,
299}
300
301/// Lower-case hex of a 32-byte SHA-256 digest. The only place the hex string
302/// is materialized for a `[u8; 32]` `credential_hash` (reporters, Debug).
303#[inline]
304pub fn hex_encode(bytes: &[u8; 32]) -> String {
305    hex::encode(bytes)
306}
307
308/// Serde adapter keeping the on-wire shape of `credential_hash` a 64-char
309/// lower-case hex string while the in-memory field is raw `[u8; 32]`. This
310/// preserves the documented JSON/JSONL/baseline/SARIF format (`.credential_hash`
311/// consumers, `keyhogignore` `hash:` entries) with zero heap on the hot path.
312pub mod serde_hash_hex {
313    use serde::{Deserialize, Deserializer, Serializer};
314
315    pub fn serialize<S>(val: &[u8; 32], serializer: S) -> Result<S::Ok, S::Error>
316    where
317        S: Serializer,
318    {
319        serializer.serialize_str(&hex::encode(val))
320    }
321
322    pub fn deserialize<'de, D>(deserializer: D) -> Result<[u8; 32], D::Error>
323    where
324        D: Deserializer<'de>,
325    {
326        let s = String::deserialize(deserializer)?;
327        let bytes = hex::decode(&s).map_err(serde::de::Error::custom)?;
328        bytes
329            .try_into()
330            .map_err(|_| serde::de::Error::invalid_length(s.len() / 2, &"32-byte SHA-256 digest"))
331    }
332}
333
334pub mod serde_arc_str {
335    use serde::{Deserialize, Deserializer, Serialize, Serializer};
336    use std::sync::Arc;
337
338    pub fn serialize<S>(val: &Arc<str>, serializer: S) -> Result<S::Ok, S::Error>
339    where
340        S: Serializer,
341    {
342        val.as_ref().serialize(serializer)
343    }
344
345    pub fn deserialize<'de, D>(deserializer: D) -> Result<Arc<str>, D::Error>
346    where
347        D: Deserializer<'de>,
348    {
349        String::deserialize(deserializer).map(Arc::from)
350    }
351}
352
353pub mod serde_arc_str_opt {
354    use serde::{Deserialize, Deserializer, Serialize, Serializer};
355    use std::sync::Arc;
356
357    pub fn serialize<S>(val: &Option<Arc<str>>, serializer: S) -> Result<S::Ok, S::Error>
358    where
359        S: Serializer,
360    {
361        val.as_ref().map(|s| s.as_ref()).serialize(serializer)
362    }
363
364    pub fn deserialize<'de, D>(deserializer: D) -> Result<Option<Arc<str>>, D::Error>
365    where
366        D: Deserializer<'de>,
367    {
368        Option::<String>::deserialize(deserializer).map(|opt| opt.map(Arc::from))
369    }
370}