Skip to main content

keyhog_core/
finding.rs

1//! Scanner findings: the output type for detected secrets with location,
2//! confidence, detector metadata, and optional verification status.
3
4// Debt bucket: 16 public items predating the crate floor raising `missing_docs`
5// to `warn`. Public output schema; remove once each carries a doc line.
6#![allow(missing_docs)]
7
8use serde::{Deserialize, Serialize};
9use std::borrow::Cow;
10use std::collections::HashMap;
11use std::sync::Arc;
12
13use crate::Severity;
14
15/// A raw pattern match before verification or deduplication.
16///
17/// `entropy` and `confidence` are stored as `f64` but are guaranteed never to
18/// be `NaN` (sanitized at construction time). This keeps the manual `Eq` impl
19/// reflexive, which downstream code relies on for `HashMap`/`BTreeMap` keys.
20///
21/// Manual `Debug` impl redacts the `credential` field - the previous
22/// derive-`Debug` was a CRITICAL leak vector (any `{:?}` print, panic
23/// handler, or `tracing::error!(?match)` would expose plaintext). See
24/// audit kimi-wave1 finding 1.1.
25#[derive(Clone, Serialize, Deserialize)]
26pub struct RawMatch {
27    /// Stable detector identifier.
28    #[serde(with = "serde_arc_str")]
29    pub detector_id: Arc<str>,
30    /// Human-readable detector name.
31    #[serde(with = "serde_arc_str")]
32    pub detector_name: Arc<str>,
33    /// Service namespace associated with the detector.
34    #[serde(with = "serde_arc_str")]
35    pub service: Arc<str>,
36    /// Detector severity level.
37    pub severity: Severity,
38    /// Matched credential bytes before redaction.
39    #[serde(with = "serde_arc_str")]
40    pub credential: Arc<str>,
41    /// SHA-256 hash of the credential for allowlisting and deduplication.
42    pub credential_hash: String,
43    /// Companion credential or context value extracted nearby.
44    pub companions: std::collections::HashMap<String, String>,
45    /// Source location for the match.
46    pub location: MatchLocation,
47    /// Shannon entropy of the matched credential (0.0 - 8.0). NaN-sanitized.
48    #[serde(skip_serializing_if = "Option::is_none")]
49    pub entropy: Option<f64>,
50    /// Confidence score (0.0 - 1.0). NaN-sanitized at construction.
51    #[serde(skip_serializing_if = "Option::is_none")]
52    pub confidence: Option<f64>,
53}
54
55impl RawMatch {
56    /// Replace NaN floats with `None` so the manual `Eq` impl stays reflexive
57    /// and `HashMap`/`BTreeMap` lookups don't trap. Call this on any externally
58    /// constructed `RawMatch` (deserialized findings, scanner outputs).
59    pub fn sanitize_floats(mut self) -> Self {
60        if self.entropy.is_some_and(f64::is_nan) {
61            self.entropy = None;
62        }
63        if self.confidence.is_some_and(f64::is_nan) {
64            self.confidence = None;
65        }
66        self
67    }
68}
69
70impl PartialEq for RawMatch {
71    fn eq(&self, other: &Self) -> bool {
72        // Compare every field; for the f64 options use `total_cmp` semantics so
73        // NaN-vs-NaN compares equal. We additionally normalize NaN→None on
74        // construction (`sanitize_floats`), but the total-ordering comparison
75        // here keeps the impl sound even if a NaN slips through.
76        self.detector_id == other.detector_id
77            && self.detector_name == other.detector_name
78            && self.service == other.service
79            && self.severity == other.severity
80            && self.credential == other.credential
81            && self.credential_hash == other.credential_hash
82            && self.companions == other.companions
83            && self.location == other.location
84            && opt_f64_total_eq(self.entropy, other.entropy)
85            && opt_f64_total_eq(self.confidence, other.confidence)
86    }
87}
88
89impl Eq for RawMatch {}
90
91impl std::fmt::Debug for RawMatch {
92    /// Redacted Debug. Replaces `derive(Debug)` which would print the raw
93    /// `credential: Arc<str>` plaintext. See kimi-wave1 audit finding 1.1.
94    /// `credential_hash` is preserved because it's already a one-way SHA-256.
95    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
96        f.debug_struct("RawMatch")
97            .field("detector_id", &self.detector_id)
98            .field("detector_name", &self.detector_name)
99            .field("service", &self.service)
100            .field("severity", &self.severity)
101            .field(
102                "credential",
103                &format_args!("<redacted {} bytes>", self.credential.len()),
104            )
105            .field("credential_hash", &self.credential_hash)
106            .field(
107                "companions",
108                &format_args!("<{} redacted companions>", self.companions.len()),
109            )
110            .field("location", &self.location)
111            .field("entropy", &self.entropy)
112            .field("confidence", &self.confidence)
113            .finish()
114    }
115}
116
/// Equality for optional floats based on `f64::total_cmp`.
///
/// Returns `true` when both are `None`, or when both are `Some` and the
/// values are equal under the IEEE 754 total order. Under that order a `NaN`
/// equals a bit-identical `NaN`, and `0.0` differs from `-0.0`.
#[inline]
fn opt_f64_total_eq(a: Option<f64>, b: Option<f64>) -> bool {
    match a.zip(b) {
        // Both present: compare the values.
        Some((x, y)) => x.total_cmp(&y).is_eq(),
        // At least one absent: equal only if both are.
        None => a.is_none() && b.is_none(),
    }
}
125
126impl PartialOrd for RawMatch {
127    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
128        Some(self.cmp(other))
129    }
130}
131
132impl Ord for RawMatch {
133    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
134        // Higher confidence first
135        let self_conf = self.confidence.unwrap_or(0.0);
136        let other_conf = other.confidence.unwrap_or(0.0);
137
138        match other_conf.total_cmp(&self_conf) {
139            std::cmp::Ordering::Equal => {}
140            ord => return ord,
141        }
142
143        // Then higher severity first (Critical > High > Medium > Low > Info)
144        match other.severity.cmp(&self.severity) {
145            std::cmp::Ordering::Equal => {}
146            ord => return ord,
147        }
148
149        // Finally, deterministic sort by detector and credential
150        match self.detector_id.cmp(&other.detector_id) {
151            std::cmp::Ordering::Equal => self.credential.cmp(&other.credential),
152            ord => ord,
153        }
154    }
155}
156
157/// Where a credential was found: file path, line number, commit, and author.
158#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
159pub struct MatchLocation {
160    /// Logical source backend, such as `filesystem` or `git`.
161    #[serde(with = "serde_arc_str")]
162    pub source: Arc<str>,
163    /// File path, object key, or logical path when available.
164    ///
165    /// Paths stored here must be valid UTF-8. Source implementations that see
166    /// non-UTF-8 paths should encode them into a reversible escaped string
167    /// before constructing a [`MatchLocation`].
168    #[serde(with = "serde_arc_str_opt")]
169    pub file_path: Option<Arc<str>>,
170    /// One-based line number when known.
171    pub line: Option<usize>,
172    /// Byte offset from the start of the source chunk.
173    pub offset: usize,
174    /// Commit identifier for history-derived matches.
175    #[serde(with = "serde_arc_str_opt")]
176    pub commit: Option<Arc<str>>,
177    /// Commit author when available.
178    #[serde(with = "serde_arc_str_opt")]
179    pub author: Option<Arc<str>>,
180    /// Commit timestamp when available.
181    #[serde(with = "serde_arc_str_opt")]
182    pub date: Option<Arc<str>>,
183}
184
/// A finding after verification - the final output.
///
/// Unlike `RawMatch`, this type has no plaintext credential field: it holds
/// `credential_redacted` and `credential_hash` instead. `Debug` is derived,
/// so every field, including the `metadata` values, appears in `{:?}` output.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VerifiedFinding {
    /// Stable detector identifier.
    #[serde(with = "serde_arc_str")]
    pub detector_id: Arc<str>,
    /// Human-readable detector name.
    #[serde(with = "serde_arc_str")]
    pub detector_name: Arc<str>,
    /// Service namespace associated with the detector.
    #[serde(with = "serde_arc_str")]
    pub service: Arc<str>,
    /// Detector severity level.
    pub severity: Severity,
    /// Redacted version of the credential for reporting.
    pub credential_redacted: Cow<'static, str>,
    /// SHA-256 hash of the original credential for internal correlation.
    pub credential_hash: String,
    /// Source location for the match.
    pub location: MatchLocation,
    /// Verification result.
    pub verification: VerificationResult,
    /// Additional provider-specific metadata (e.g. account ID, scope).
    pub metadata: HashMap<String, String>,
    /// Additional duplicate locations found for this credential.
    pub additional_locations: Vec<MatchLocation>,
    /// Confidence score (0.0 - 1.0) combining entropy, keyword proximity, file type, etc.
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub confidence: Option<f64>,
}
215
/// Result of live verification: whether the credential is active, revoked, or untested.
///
/// Variant names are serialized in `snake_case` (for example `RateLimited`
/// becomes `rate_limited`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum VerificationResult {
    /// Credential is active and verified by the provider.
    Live,
    /// Credential is valid but has been explicitly revoked or disabled.
    Revoked,
    /// Credential was rejected by the provider (invalid password/token).
    Dead,
    /// Provider returned a rate-limit error (e.g. 429).
    RateLimited,
    /// Verification failed due to network error or timeout.
    /// The payload is a free-form description of the failure.
    Error(String),
    /// Detector does not support live verification.
    Unverifiable,
    /// Verification was not attempted (e.g. disabled via flag).
    Skipped,
}
235
236impl RawMatch {
237    /// Get unique key for deduplication.
238    pub fn deduplication_key(&self) -> (&str, &str) {
239        (&self.detector_id, &self.credential)
240    }
241
242    /// Convert into a serialization-safe DTO that never carries the plaintext
243    /// credential. Use this anywhere a `RawMatch` would otherwise be written
244    /// to disk, sent over the network, or rendered into a user-visible
245    /// report. See kimi-wave1 audit finding 2.1 (`scan_system.rs` JSON exfil).
246    pub fn to_redacted(&self) -> RedactedFinding {
247        RedactedFinding {
248            detector_id: self.detector_id.clone(),
249            detector_name: self.detector_name.clone(),
250            service: self.service.clone(),
251            severity: self.severity,
252            credential_redacted: crate::redact(&self.credential),
253            credential_hash: self.credential_hash.clone(),
254            companions_redacted: self
255                .companions
256                .iter()
257                .map(|(k, v)| (k.clone(), crate::redact(v).into_owned()))
258                .collect(),
259            location: self.location.clone(),
260            entropy: self.entropy,
261            confidence: self.confidence,
262        }
263    }
264}
265
/// Redacted, disk-safe view of a `RawMatch`. Carries only the SHA-256 hash
/// and a "first4...last4" preview, never the plaintext credential. This is
/// the only finding shape that should ever leave keyhog's process boundary.
///
/// Built by `RawMatch::to_redacted`; the preview strings are whatever
/// `crate::redact` returns for the credential and for each companion value.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RedactedFinding {
    /// Stable detector identifier, copied from the raw match.
    #[serde(with = "serde_arc_str")]
    pub detector_id: Arc<str>,
    /// Human-readable detector name, copied from the raw match.
    #[serde(with = "serde_arc_str")]
    pub detector_name: Arc<str>,
    /// Service namespace associated with the detector.
    #[serde(with = "serde_arc_str")]
    pub service: Arc<str>,
    /// Detector severity level.
    pub severity: Severity,
    /// Redacted preview of the credential (output of `crate::redact`).
    pub credential_redacted: Cow<'static, str>,
    /// SHA-256 hash of the original credential.
    pub credential_hash: String,
    /// Companion values after redaction; keys are kept as in the raw match.
    pub companions_redacted: HashMap<String, String>,
    /// Source location for the match.
    pub location: MatchLocation,
    /// Entropy score copied from the raw match; omitted from serialized
    /// output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub entropy: Option<f64>,
    /// Confidence score copied from the raw match; omitted from serialized
    /// output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub confidence: Option<f64>,
}
287
288pub mod serde_arc_str {
289    use serde::{Deserialize, Deserializer, Serialize, Serializer};
290    use std::sync::Arc;
291
292    pub fn serialize<S>(val: &Arc<str>, serializer: S) -> Result<S::Ok, S::Error>
293    where
294        S: Serializer,
295    {
296        val.as_ref().serialize(serializer)
297    }
298
299    pub fn deserialize<'de, D>(deserializer: D) -> Result<Arc<str>, D::Error>
300    where
301        D: Deserializer<'de>,
302    {
303        String::deserialize(deserializer).map(Arc::from)
304    }
305}
306
307pub mod serde_arc_str_opt {
308    use serde::{Deserialize, Deserializer, Serialize, Serializer};
309    use std::sync::Arc;
310
311    pub fn serialize<S>(val: &Option<Arc<str>>, serializer: S) -> Result<S::Ok, S::Error>
312    where
313        S: Serializer,
314    {
315        val.as_ref().map(|s| s.as_ref()).serialize(serializer)
316    }
317
318    pub fn deserialize<'de, D>(deserializer: D) -> Result<Option<Arc<str>>, D::Error>
319    where
320        D: Deserializer<'de>,
321    {
322        Option::<String>::deserialize(deserializer).map(|opt| opt.map(Arc::from))
323    }
324}