keyhog_scanner/multiline/fragment_cache.rs
1//! Cross-chunk fragment cache for virtual secret reassembly.
2//!
3//! This allows KeyHog to detect secrets split across different files or
4//! distant locations within a large file that exceed the chunk window.
5
6use lru::LruCache;
7use parking_lot::Mutex;
8use std::num::NonZeroUsize;
9use std::sync::Arc;
10use zeroize::Zeroizing;
11
12const SHARD_COUNT: usize = 64;
13const MAX_FRAGMENTS_PER_SCOPE: usize = 8;
14
15/// A potential fragment of a secret (variable assignment part).
16///
17/// `value` is wrapped in `Zeroizing<String>` so that fragment text gets
18/// scrubbed from the heap when an entry is evicted from the LRU or the
19/// cache is dropped. kimi-wave1 audit finding 1.HIGH: previously the
20/// credential text lived in a plain `String` for the lifetime of the
21/// scan, and reassembled candidates were materialized into a `Chunk`
22/// that re-embedded the secret in a `format!`-built dummy line. The
23/// `Debug` derive is also intentionally NOT wired through `value`
24/// - `Zeroizing<String>` prints redacted in `{:?}`.
25#[derive(Clone)]
26pub struct SecretFragment {
27 pub prefix: String,
28 pub var_name: String,
29 pub value: Zeroizing<String>,
30 pub line: usize,
31 pub path: Option<Arc<str>>,
32}
33
34impl std::fmt::Debug for SecretFragment {
35 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
36 f.debug_struct("SecretFragment")
37 .field("prefix", &self.prefix)
38 .field("var_name", &self.var_name)
39 .field(
40 "value",
41 &format_args!("<redacted {} bytes>", self.value.len()),
42 )
43 .field("line", &self.line)
44 .field("path", &self.path)
45 .finish()
46 }
47}
48
49/// A reassembled credential candidate plus the provenance of the
50/// fragment that anchors it. The anchor is the *prefix* fragment of
51/// the join (`f1` below): reassembled findings must be stamped with the
52/// real contributing file's path and line, NOT the path of whatever
53/// chunk happened to be scanning when the join completed. Without this,
54/// a join that fires while processing sibling file B was attributed to
55/// B even though both fragments came from file A - the cross-file
56/// mis-attribution half of the precision bug.
57pub struct ReassembledCandidate {
58 /// Glued credential text. Zeroized on drop.
59 pub value: Zeroizing<String>,
60 /// Path of the anchoring (prefix) fragment, if known.
61 pub path: Option<Arc<str>>,
62 /// Source line of the anchoring (prefix) fragment.
63 pub line: usize,
64}
65
66impl std::fmt::Debug for ReassembledCandidate {
67 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
68 f.debug_struct("ReassembledCandidate")
69 .field(
70 "value",
71 &format_args!("<redacted {} bytes>", self.value.len()),
72 )
73 .field("path", &self.path)
74 .field("line", &self.line)
75 .finish()
76 }
77}
78
79/// Global cache for tracking fragmented secrets across the entire scan run.
80pub struct FragmentCache {
81 /// Maps normalized prefix (e.g. "aws_key") to a list of found fragments.
82 /// Sharded to avoid a single global mutex becoming a bottleneck under rayon.
83 shards: [Mutex<LruCache<String, Vec<SecretFragment>>>; SHARD_COUNT],
84}
85
86impl FragmentCache {
87 pub fn new(capacity: usize) -> Self {
88 let per_shard = (capacity / SHARD_COUNT).max(1);
89 let nz = NonZeroUsize::new(per_shard).unwrap_or(NonZeroUsize::MIN);
90 Self {
91 shards: std::array::from_fn(|_| Mutex::new(LruCache::new(nz))),
92 }
93 }
94
95 /// Record a fragment and return a list of "complete" candidates if any.
96 /// The returned `Zeroizing<String>` lets the caller scope the
97 /// reassembled credential's lifetime tightly - drop it (or pass it
98 /// to a scan that consumes by reference) and the heap copy is zeroed.
99 pub fn record_and_reassemble(&self, fragment: SecretFragment) -> Vec<Zeroizing<String>> {
100 let scope = fragment.path.as_deref().unwrap_or("");
101 // Shard index is derived from the (prefix, scope) bytes directly via
102 // `shard_index_of`, so the joined key `String` no longer has to be
103 // materialized purely to pick a shard. The owned key is built once,
104 // only where `LruCache::get_or_insert_mut` actually needs to take
105 // ownership of it.
106 let shard_idx = shard_index_of(&fragment.prefix, scope);
107 let mut lock = self.shards[shard_idx].lock();
108
109 let key = scoped_key(&fragment);
110 let cluster = lock.get_or_insert_mut(key, Vec::new);
111
112 // Don't add duplicate fragments (same path/line/value)
113 if !cluster.iter().any(|f| {
114 f.path == fragment.path && f.line == fragment.line && **f.value == **fragment.value
115 }) {
116 cluster.push(fragment);
117 if cluster.len() > MAX_FRAGMENTS_PER_SCOPE {
118 // LRU-style: drop the oldest. The Zeroizing<String> drop
119 // impl scrubs the bytes before the allocator gets them.
120 cluster.remove(0);
121 }
122 }
123
124 // Reassembly is SAME-FILE only. Cross-file joins (every AKIA/AIza
125 // assignment in dir X paired with every other matching assignment
126 // in dir X siblings) were observed to cannibalize the standalone
127 // findings: the cross-file `:reassembled` candidate replaces the
128 // legitimate singleton during downstream resolution, and the
129 // synthesized credential gets attributed to a sibling-file path.
130 // Investigator evidence (mirror-pos-0000091.yaml AKIA glued to a
131 // sibling klaviyo sk_) confirmed the singleton was lost.
132 //
133 // Real split-credential patterns (AWS_ACCESS_KEY in one .env paired
134 // with AWS_SECRET in another) are NOT in the corpus and the loss
135 // they cause is concrete: the standalone finding is dropped.
136 // Restrict reassembly to same-path fragments within 100 lines of
137 // each other; preserves the file-boundary split case (chunked
138 // 1MB+ files) without manufacturing cross-file glue.
139 if cluster.len() >= 2 {
140 let mut candidates = Vec::new();
141 for i in 0..cluster.len() {
142 for j in 0..cluster.len() {
143 if i == j {
144 continue;
145 }
146 let f1 = &cluster[i];
147 let f2 = &cluster[j];
148
149 let near =
150 f1.path == f2.path && (f1.line as isize - f2.line as isize).abs() < 100;
151
152 if near {
153 let mut joined = Zeroizing::new(String::new());
154 joined.push_str(f1.value.as_str());
155 joined.push_str(f2.value.as_str());
156 candidates.push(joined);
157 }
158 }
159 }
160 // Determinism: the `cluster` Vec is ordered by fragment
161 // *arrival*, and under a parallel (rayon) scan the order in
162 // which sibling chunks win the per-shard mutex is a thread
163 // race - so the (i, j) pair enumeration above emits the same
164 // *set* of joins in a run-to-run-varying order. That order
165 // then leaks downstream (synthesized match order, and via the
166 // stamped variant the anchor line attached to each glue), so
167 // identical input produced non-deterministic scan output.
168 // Canonicalize on the glued bytes - content-derived, so it is
169 // independent of arrival order - before returning. Join
170 // semantics (which pairs are produced) are untouched.
171 candidates.sort_unstable_by(|a, b| a.as_bytes().cmp(b.as_bytes()));
172 candidates
173 } else {
174 Vec::new()
175 }
176 }
177
178 /// Path/line-stamped variant of [`record_and_reassemble`]. Returns
179 /// each glued candidate together with the provenance of its anchor
180 /// (prefix) fragment so the caller can attribute a reassembled
181 /// finding to the file that actually contributed the credential,
182 /// rather than to whatever chunk's metadata was in scope when the
183 /// join fired. The join semantics (same-path, within-100-line
184 /// window) are identical to `record_and_reassemble`; this just
185 /// preserves the anchor's `path`/`line` on the way out.
186 pub fn record_and_reassemble_stamped(
187 &self,
188 fragment: SecretFragment,
189 ) -> Vec<ReassembledCandidate> {
190 let scope = fragment.path.as_deref().unwrap_or("");
191 // Same as `record_and_reassemble`: shard from the (prefix, scope)
192 // bytes, allocate the owned key only at the insert point.
193 let shard_idx = shard_index_of(&fragment.prefix, scope);
194 let mut lock = self.shards[shard_idx].lock();
195
196 let key = scoped_key(&fragment);
197 let cluster = lock.get_or_insert_mut(key, Vec::new);
198
199 if !cluster.iter().any(|f| {
200 f.path == fragment.path && f.line == fragment.line && **f.value == **fragment.value
201 }) {
202 cluster.push(fragment);
203 if cluster.len() > MAX_FRAGMENTS_PER_SCOPE {
204 cluster.remove(0);
205 }
206 }
207
208 if cluster.len() >= 2 {
209 let mut candidates = Vec::new();
210 for i in 0..cluster.len() {
211 for j in 0..cluster.len() {
212 if i == j {
213 continue;
214 }
215 let f1 = &cluster[i];
216 let f2 = &cluster[j];
217
218 let near =
219 f1.path == f2.path && (f1.line as isize - f2.line as isize).abs() < 100;
220
221 if near {
222 let mut joined = Zeroizing::new(String::new());
223 joined.push_str(f1.value.as_str());
224 joined.push_str(f2.value.as_str());
225 candidates.push(ReassembledCandidate {
226 value: joined,
227 // f1 is the prefix fragment - the anchor we
228 // stamp the finding with. Cross-file pairs
229 // are already impossible here (same key +
230 // f1.path == f2.path), so f1.path == f2.path
231 // by construction.
232 path: f1.path.clone(),
233 line: f1.line,
234 });
235 }
236 }
237 }
238 // Determinism (see `record_and_reassemble`): `cluster` is in
239 // race-dependent arrival order under a parallel scan, so the
240 // anchor (`line`) stamped onto each glue - and the emission
241 // order - varied run to run for identical input. A symmetric
242 // pair yields two distinct candidates (`A+B` anchored at A's
243 // line, `B+A` at B's line), so the sort key must be the full
244 // (glued bytes, anchor line) tuple, not the bytes alone, to
245 // give a total content-derived order. Within one full-path
246 // key every fragment shares a `path`, so `path` need not enter
247 // the key.
248 candidates.sort_unstable_by(|a, b| {
249 a.value
250 .as_bytes()
251 .cmp(b.value.as_bytes())
252 .then_with(|| a.line.cmp(&b.line))
253 });
254 candidates
255 } else {
256 Vec::new()
257 }
258 }
259
260 pub fn clear(&self) {
261 for shard in &self.shards {
262 shard.lock().clear();
263 }
264 }
265}
266
267fn scoped_key(fragment: &SecretFragment) -> String {
268 // Scope clusters by the FULL file path, not the parent directory.
269 // Coalesced scan batches several files under one rayon map and the
270 // per-chunk `chunk.metadata.path` is the only provenance the
271 // fragment carries; pooling by parent_dir let every AKIA assignment
272 // in dir X share a cluster with every sibling assignment in dir X.
273 // The `f1.path == f2.path` near-guard in `record_and_reassemble`
274 // was then the SOLE defense against cross-file glue - and it leaked
275 // whenever two fragments were recorded under one shared chunk path.
276 // Keying on the full path means fragments from different files never
277 // pool in the first place; same-file chunk-seam splits (a 1 MB+ file
278 // chunked into windows that all carry the identical path) still land
279 // in one cluster and reassemble. The near-guard stays as a redundant
280 // safety net.
281 let scope = fragment.path.as_deref().unwrap_or("");
282 format!("{}\0{}", fragment.prefix, scope)
283}
284
285/// Fold one more byte into the running shard hash. Single home for the
286/// hash recurrence so the joined-key and slice-pair paths can never drift.
287#[inline]
288fn shard_fold(h: usize, b: u8) -> usize {
289 h.wrapping_mul(31).wrapping_add(b as usize)
290}
291
292/// Shard index for a fragment without materializing the joined `prefix\0scope`
293/// key as a `String`. Folds `prefix`, the `\0` separator, then `scope` in the
294/// exact byte order `scoped_key` produces, so a fragment maps to the same shard
295/// whether it is sharded from slices (the hot record path) or from the joined
296/// key. Keeps the per-record heap allocation off the warm path.
297fn shard_index_of(prefix: &str, scope: &str) -> usize {
298 let mut h = 0usize;
299 for &b in prefix.as_bytes() {
300 h = shard_fold(h, b);
301 }
302 h = shard_fold(h, 0);
303 for &b in scope.as_bytes() {
304 h = shard_fold(h, b);
305 }
306 h % SHARD_COUNT
307}
308
309/// Test-only probe for the shard-hash drift invariant: returns
310/// `(slice_pair_shard, joined_key_shard)` for `(prefix, scope)`. The hot
311/// record path shards from slices via [`shard_index_of`] (no allocation); the
312/// joined-key path folds the materialized `prefix\0scope` string. If these two
313/// ever diverge, a fragment recorded under one shard would never be found under
314/// the other, silently breaking reassembly. Exposed (doc-hidden) so the
315/// equivalence test can live in `tests/unit/inline_migrated/` without leaking
316/// the private `shard_fold` / `SHARD_COUNT` internals.
317#[doc(hidden)]
318pub fn shard_index_drift_probe(prefix: &str, scope: &str) -> (usize, usize) {
319 let joined = format!("{prefix}\0{scope}");
320 let joined_key_shard = joined.bytes().fold(0usize, shard_fold) % SHARD_COUNT;
321 (shard_index_of(prefix, scope), joined_key_shard)
322}