Skip to main content

keyhog_scanner/multiline/
fragment_cache.rs

1//! Cross-chunk fragment cache for virtual secret reassembly.
2//!
3//! This allows KeyHog to detect secrets split across different files or
4//! distant locations within a large file that exceed the chunk window.
5
6use lru::LruCache;
7use parking_lot::Mutex;
8use std::num::NonZeroUsize;
9use std::sync::Arc;
10use zeroize::Zeroizing;
11
12const SHARD_COUNT: usize = 64;
13const MAX_FRAGMENTS_PER_SCOPE: usize = 8;
14
15/// A potential fragment of a secret (variable assignment part).
16///
17/// `value` is wrapped in `Zeroizing<String>` so that fragment text gets
18/// scrubbed from the heap when an entry is evicted from the LRU or the
19/// cache is dropped. kimi-wave1 audit finding 1.HIGH: previously the
20/// credential text lived in a plain `String` for the lifetime of the
21/// scan, and reassembled candidates were materialized into a `Chunk`
22/// that re-embedded the secret in a `format!`-built dummy line. The
23/// `Debug` derive is also intentionally NOT wired through `value`
24/// - `Zeroizing<String>` prints redacted in `{:?}`.
25#[derive(Clone)]
26pub struct SecretFragment {
27    pub prefix: String,
28    pub var_name: String,
29    pub value: Zeroizing<String>,
30    pub line: usize,
31    pub path: Option<Arc<str>>,
32}
33
34impl std::fmt::Debug for SecretFragment {
35    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36        f.debug_struct("SecretFragment")
37            .field("prefix", &self.prefix)
38            .field("var_name", &self.var_name)
39            .field(
40                "value",
41                &format_args!("<redacted {} bytes>", self.value.len()),
42            )
43            .field("line", &self.line)
44            .field("path", &self.path)
45            .finish()
46    }
47}
48
49/// A reassembled credential candidate plus the provenance of the
50/// fragment that anchors it. The anchor is the *prefix* fragment of
51/// the join (`f1` below): reassembled findings must be stamped with the
52/// real contributing file's path and line, NOT the path of whatever
53/// chunk happened to be scanning when the join completed. Without this,
54/// a join that fires while processing sibling file B was attributed to
55/// B even though both fragments came from file A - the cross-file
56/// mis-attribution half of the precision bug.
57pub struct ReassembledCandidate {
58    /// Glued credential text. Zeroized on drop.
59    pub value: Zeroizing<String>,
60    /// Path of the anchoring (prefix) fragment, if known.
61    pub path: Option<Arc<str>>,
62    /// Source line of the anchoring (prefix) fragment.
63    pub line: usize,
64}
65
66impl std::fmt::Debug for ReassembledCandidate {
67    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
68        f.debug_struct("ReassembledCandidate")
69            .field(
70                "value",
71                &format_args!("<redacted {} bytes>", self.value.len()),
72            )
73            .field("path", &self.path)
74            .field("line", &self.line)
75            .finish()
76    }
77}
78
79/// Global cache for tracking fragmented secrets across the entire scan run.
80pub struct FragmentCache {
81    /// Maps normalized prefix (e.g. "aws_key") to a list of found fragments.
82    /// Sharded to avoid a single global mutex becoming a bottleneck under rayon.
83    shards: [Mutex<LruCache<String, Vec<SecretFragment>>>; SHARD_COUNT],
84}
85
86impl FragmentCache {
87    pub fn new(capacity: usize) -> Self {
88        let per_shard = (capacity / SHARD_COUNT).max(1);
89        let nz = NonZeroUsize::new(per_shard).unwrap_or(NonZeroUsize::MIN);
90        Self {
91            shards: std::array::from_fn(|_| Mutex::new(LruCache::new(nz))),
92        }
93    }
94
95    /// Record a fragment and return a list of "complete" candidates if any.
96    /// The returned `Zeroizing<String>` lets the caller scope the
97    /// reassembled credential's lifetime tightly - drop it (or pass it
98    /// to a scan that consumes by reference) and the heap copy is zeroed.
99    pub fn record_and_reassemble(&self, fragment: SecretFragment) -> Vec<Zeroizing<String>> {
100        let scope = fragment.path.as_deref().unwrap_or("");
101        // Shard index is derived from the (prefix, scope) bytes directly via
102        // `shard_index_of`, so the joined key `String` no longer has to be
103        // materialized purely to pick a shard. The owned key is built once,
104        // only where `LruCache::get_or_insert_mut` actually needs to take
105        // ownership of it.
106        let shard_idx = shard_index_of(&fragment.prefix, scope);
107        let mut lock = self.shards[shard_idx].lock();
108
109        let key = scoped_key(&fragment);
110        let cluster = lock.get_or_insert_mut(key, Vec::new);
111
112        // Don't add duplicate fragments (same path/line/value)
113        if !cluster.iter().any(|f| {
114            f.path == fragment.path && f.line == fragment.line && **f.value == **fragment.value
115        }) {
116            cluster.push(fragment);
117            if cluster.len() > MAX_FRAGMENTS_PER_SCOPE {
118                // LRU-style: drop the oldest. The Zeroizing<String> drop
119                // impl scrubs the bytes before the allocator gets them.
120                cluster.remove(0);
121            }
122        }
123
124        // Reassembly is SAME-FILE only. Cross-file joins (every AKIA/AIza
125        // assignment in dir X paired with every other matching assignment
126        // in dir X siblings) were observed to cannibalize the standalone
127        // findings: the cross-file `:reassembled` candidate replaces the
128        // legitimate singleton during downstream resolution, and the
129        // synthesized credential gets attributed to a sibling-file path.
130        // Investigator evidence (mirror-pos-0000091.yaml AKIA glued to a
131        // sibling klaviyo sk_) confirmed the singleton was lost.
132        //
133        // Real split-credential patterns (AWS_ACCESS_KEY in one .env paired
134        // with AWS_SECRET in another) are NOT in the corpus and the loss
135        // they cause is concrete: the standalone finding is dropped.
136        // Restrict reassembly to same-path fragments within 100 lines of
137        // each other; preserves the file-boundary split case (chunked
138        // 1MB+ files) without manufacturing cross-file glue.
139        if cluster.len() >= 2 {
140            let mut candidates = Vec::new();
141            for i in 0..cluster.len() {
142                for j in 0..cluster.len() {
143                    if i == j {
144                        continue;
145                    }
146                    let f1 = &cluster[i];
147                    let f2 = &cluster[j];
148
149                    let near =
150                        f1.path == f2.path && (f1.line as isize - f2.line as isize).abs() < 100;
151
152                    if near {
153                        let mut joined = Zeroizing::new(String::new());
154                        joined.push_str(f1.value.as_str());
155                        joined.push_str(f2.value.as_str());
156                        candidates.push(joined);
157                    }
158                }
159            }
160            // Determinism: the `cluster` Vec is ordered by fragment
161            // *arrival*, and under a parallel (rayon) scan the order in
162            // which sibling chunks win the per-shard mutex is a thread
163            // race - so the (i, j) pair enumeration above emits the same
164            // *set* of joins in a run-to-run-varying order. That order
165            // then leaks downstream (synthesized match order, and via the
166            // stamped variant the anchor line attached to each glue), so
167            // identical input produced non-deterministic scan output.
168            // Canonicalize on the glued bytes - content-derived, so it is
169            // independent of arrival order - before returning. Join
170            // semantics (which pairs are produced) are untouched.
171            candidates.sort_unstable_by(|a, b| a.as_bytes().cmp(b.as_bytes()));
172            candidates
173        } else {
174            Vec::new()
175        }
176    }
177
178    /// Path/line-stamped variant of [`record_and_reassemble`]. Returns
179    /// each glued candidate together with the provenance of its anchor
180    /// (prefix) fragment so the caller can attribute a reassembled
181    /// finding to the file that actually contributed the credential,
182    /// rather than to whatever chunk's metadata was in scope when the
183    /// join fired. The join semantics (same-path, within-100-line
184    /// window) are identical to `record_and_reassemble`; this just
185    /// preserves the anchor's `path`/`line` on the way out.
186    pub fn record_and_reassemble_stamped(
187        &self,
188        fragment: SecretFragment,
189    ) -> Vec<ReassembledCandidate> {
190        let scope = fragment.path.as_deref().unwrap_or("");
191        // Same as `record_and_reassemble`: shard from the (prefix, scope)
192        // bytes, allocate the owned key only at the insert point.
193        let shard_idx = shard_index_of(&fragment.prefix, scope);
194        let mut lock = self.shards[shard_idx].lock();
195
196        let key = scoped_key(&fragment);
197        let cluster = lock.get_or_insert_mut(key, Vec::new);
198
199        if !cluster.iter().any(|f| {
200            f.path == fragment.path && f.line == fragment.line && **f.value == **fragment.value
201        }) {
202            cluster.push(fragment);
203            if cluster.len() > MAX_FRAGMENTS_PER_SCOPE {
204                cluster.remove(0);
205            }
206        }
207
208        if cluster.len() >= 2 {
209            let mut candidates = Vec::new();
210            for i in 0..cluster.len() {
211                for j in 0..cluster.len() {
212                    if i == j {
213                        continue;
214                    }
215                    let f1 = &cluster[i];
216                    let f2 = &cluster[j];
217
218                    let near =
219                        f1.path == f2.path && (f1.line as isize - f2.line as isize).abs() < 100;
220
221                    if near {
222                        let mut joined = Zeroizing::new(String::new());
223                        joined.push_str(f1.value.as_str());
224                        joined.push_str(f2.value.as_str());
225                        candidates.push(ReassembledCandidate {
226                            value: joined,
227                            // f1 is the prefix fragment - the anchor we
228                            // stamp the finding with. Cross-file pairs
229                            // are already impossible here (same key +
230                            // f1.path == f2.path), so f1.path == f2.path
231                            // by construction.
232                            path: f1.path.clone(),
233                            line: f1.line,
234                        });
235                    }
236                }
237            }
238            // Determinism (see `record_and_reassemble`): `cluster` is in
239            // race-dependent arrival order under a parallel scan, so the
240            // anchor (`line`) stamped onto each glue - and the emission
241            // order - varied run to run for identical input. A symmetric
242            // pair yields two distinct candidates (`A+B` anchored at A's
243            // line, `B+A` at B's line), so the sort key must be the full
244            // (glued bytes, anchor line) tuple, not the bytes alone, to
245            // give a total content-derived order. Within one full-path
246            // key every fragment shares a `path`, so `path` need not enter
247            // the key.
248            candidates.sort_unstable_by(|a, b| {
249                a.value
250                    .as_bytes()
251                    .cmp(b.value.as_bytes())
252                    .then_with(|| a.line.cmp(&b.line))
253            });
254            candidates
255        } else {
256            Vec::new()
257        }
258    }
259
260    pub fn clear(&self) {
261        for shard in &self.shards {
262            shard.lock().clear();
263        }
264    }
265}
266
267fn scoped_key(fragment: &SecretFragment) -> String {
268    // Scope clusters by the FULL file path, not the parent directory.
269    // Coalesced scan batches several files under one rayon map and the
270    // per-chunk `chunk.metadata.path` is the only provenance the
271    // fragment carries; pooling by parent_dir let every AKIA assignment
272    // in dir X share a cluster with every sibling assignment in dir X.
273    // The `f1.path == f2.path` near-guard in `record_and_reassemble`
274    // was then the SOLE defense against cross-file glue - and it leaked
275    // whenever two fragments were recorded under one shared chunk path.
276    // Keying on the full path means fragments from different files never
277    // pool in the first place; same-file chunk-seam splits (a 1 MB+ file
278    // chunked into windows that all carry the identical path) still land
279    // in one cluster and reassemble. The near-guard stays as a redundant
280    // safety net.
281    let scope = fragment.path.as_deref().unwrap_or("");
282    format!("{}\0{}", fragment.prefix, scope)
283}
284
285/// Fold one more byte into the running shard hash. Single home for the
286/// hash recurrence so the joined-key and slice-pair paths can never drift.
287#[inline]
288fn shard_fold(h: usize, b: u8) -> usize {
289    h.wrapping_mul(31).wrapping_add(b as usize)
290}
291
292/// Shard index for a fragment without materializing the joined `prefix\0scope`
293/// key as a `String`. Folds `prefix`, the `\0` separator, then `scope` in the
294/// exact byte order `scoped_key` produces, so a fragment maps to the same shard
295/// whether it is sharded from slices (the hot record path) or from the joined
296/// key. Keeps the per-record heap allocation off the warm path.
297fn shard_index_of(prefix: &str, scope: &str) -> usize {
298    let mut h = 0usize;
299    for &b in prefix.as_bytes() {
300        h = shard_fold(h, b);
301    }
302    h = shard_fold(h, 0);
303    for &b in scope.as_bytes() {
304        h = shard_fold(h, b);
305    }
306    h % SHARD_COUNT
307}
308
309/// Test-only probe for the shard-hash drift invariant: returns
310/// `(slice_pair_shard, joined_key_shard)` for `(prefix, scope)`. The hot
311/// record path shards from slices via [`shard_index_of`] (no allocation); the
312/// joined-key path folds the materialized `prefix\0scope` string. If these two
313/// ever diverge, a fragment recorded under one shard would never be found under
314/// the other, silently breaking reassembly. Exposed (doc-hidden) so the
315/// equivalence test can live in `tests/unit/inline_migrated/` without leaking
316/// the private `shard_fold` / `SHARD_COUNT` internals.
317#[doc(hidden)]
318pub fn shard_index_drift_probe(prefix: &str, scope: &str) -> (usize, usize) {
319    let joined = format!("{prefix}\0{scope}");
320    let joined_key_shard = joined.bytes().fold(0usize, shard_fold) % SHARD_COUNT;
321    (shard_index_of(prefix, scope), joined_key_shard)
322}