1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
//! `process_match`: the per-match post-processing chain.
//!
//! Extracted from `scan.rs` to keep both files under the 500-line cap.
//! Runs the suppression chain, companion-required gate, entropy + camel-
//! shape filters for generic detectors, checksum validation, and finally
//! ML / heuristic scoring. Outputs either a `Final` finding into
//! `scan_state.matches` or queues an `MlPendingMatch` for the post-scan
//! ML batch.
use super::scan_filters::*;
use super::CompiledScanner;
use crate::context;
use crate::pipeline::*;
use crate::types::*;
use keyhog_core::{Chunk, DetectorSpec};
use std::collections::HashMap;
impl CompiledScanner {
/// Post-processes a single regex match and either records it as a finding,
/// queues it for ML scoring, or silently drops it.
///
/// The function is a chain of early-return gates, cheapest first; the first
/// gate that rejects the match ends processing. A match that survives every
/// gate is scored by `match_confidence` and then either pushed into
/// `scan_state` via `push_match` (`Final`) or appended to
/// `scan_state.ml_pending` (`Pending`, only with the `ml` feature).
///
/// Parameters, as used by this function:
/// - `entry`: compiled pattern that fired; used for companion lookup,
///   scoring, detector metadata lookup and the `client_safe` flag.
/// - `detector`: detector spec; its `id` selects detector-specific gates.
/// - `data`: text that `match_start` / `match_end` are passed alongside to
///   the hex-context helpers (byte offsets, since `is_hex_digest_fragment`
///   slices `data.as_bytes()` with them).
/// - `preprocessed`, `line_offsets`: inputs to `match_line_number`.
/// - `code_lines`, `documentation_lines`: inputs to the context checks.
/// - `chunk`: supplies `metadata.path` and `metadata.source_type`.
/// - `scan_state`: receives the finding or the pending ML match.
/// - `credential`: matched credential text (may be replaced, see below).
/// - `base_line`, `base_offset`: added to the local line / start offset when
///   the raw match is built.
/// - `keyword_nearby`, `sensitive_file`: forwarded to `match_confidence`.
// The per-match context is passed as individual arguments, which exceeds
// clippy's argument-count limit.
#[allow(clippy::too_many_arguments)]
pub(super) fn process_match(
&self,
entry: &CompiledPattern,
detector: &DetectorSpec,
data: &str,
preprocessed: &ScannerPreprocessedText<'_>,
line_offsets: &[usize],
code_lines: &[&str],
documentation_lines: &[bool],
chunk: &Chunk,
scan_state: &mut ScanState,
credential: &str,
match_start: usize,
match_end: usize,
base_line: usize,
base_offset: usize,
keyword_nearby: bool,
sensitive_file: bool,
) {
// Shadow `credential` and `match_end` with the values returned by the
// helper; every later step sees these, not the caller's originals.
// NOTE(review): presumably widens the match to the full token when it
// starts with a known prefix - confirm against `scan_filters`.
let (credential, match_end) =
extend_known_prefix_credential(data, credential, match_start, match_end);
// Detector-specific length gate: an `aws-access-key` match is dropped
// unless the credential is exactly 20 bytes long.
if detector.id == "aws-access-key" && credential.len() != 20 {
return;
}
// Detector-specific length gate: for `anthropic-api-key`, a credential
// carrying the `sk-ant-api03-` prefix must have 80..=120 bytes after the
// prefix. Credentials without that prefix are not length-checked here.
if detector.id == "anthropic-api-key" {
const LEGACY_PREFIX: &str = "sk-ant-api03-";
if let Some(body) = credential.strip_prefix(LEGACY_PREFIX) {
if !(80..=120).contains(&body.len()) {
return;
}
}
}
// Chunk-local line of the match start; `base_line` is added only when the
// raw match is built.
let line = match_line_number(preprocessed, line_offsets, match_start);
// Drop matches the helper reports as sitting inside a hex context.
if is_within_hex_context(data, match_start, match_end) {
return;
}
// Digest-fragment guard: a fixed-length hex credential (e.g. a {32}-hex
// API-key body) whose contiguous hex run is EXTENDED by adjacent hex
// digits to digest length (>=40) is a slice of a SHA-1 (40) / SHA-256
// (64) / git-commit hash, not a standalone key. `is_within_hex_context`
// only fires when hex surrounds the match on BOTH sides; a detector
// that matches the leading 32 hex of a 64-hex digest has hex only
// AFTER, so it slipped through (etherscan/iterable firing on a sha256
// substring). Real 32-hex keys (Twilio auth token, Datadog, Algolia,
// Azure subscription) are delimiter-bounded (before==after==0) and are
// never suppressed here, so recall is preserved.
if is_hex_digest_fragment(data, match_start, match_end, credential) {
return;
}
// Probabilistic gate: fast rejection of obvious non-secrets (UUIDs, low-diversity
// strings) BEFORE the expensive false-positive context check and ML scoring.
// Only applied to generic detectors. Specific detectors with known prefixes
// already have high confidence from the prefix match.
if detector.id.starts_with("generic-")
&& crate::confidence::known_prefix_confidence_floor(credential).is_none()
&& !crate::probabilistic_gate::ProbabilisticGate::looks_promising(credential)
{
return;
}
// Drop the match when either the line-based or the offset-based
// false-positive context check fires. Both receive the chunk path.
if context::is_false_positive_context(
code_lines,
line.saturating_sub(PREVIOUS_LINE_DISTANCE),
chunk.metadata.path.as_deref(),
) || context::is_false_positive_match_context(
data,
match_start,
chunk.metadata.path.as_deref(),
) {
return;
}
// Context classification for the same line index used above; it feeds
// both the named-detector suppression below and the confidence score.
let inferred_context = context::infer_context_with_documentation(
code_lines,
line.saturating_sub(PREVIOUS_LINE_DISTANCE),
chunk.metadata.path.as_deref(),
documentation_lines,
);
// `weak_anchor` is reused three times below: the suppression call, the
// entropy/camel-shape gate, and the `is_named_detector` flag.
let weak_anchor = crate::pipeline::detector_weak_anchor(detector);
if crate::pipeline::should_suppress_named_detector_finding_weak(
credential,
chunk.metadata.path.as_deref(),
inferred_context,
Some(chunk.metadata.source_type.as_str()),
detector.id.as_ref(),
weak_anchor,
) {
return;
}
// `match_companions` returns `None` when a `required = true`
// companion isn't found within the search radius. That is a
// hard skip signal, not "no companions found." The previous
// `.unwrap_or_default()` swallowed it and let the match fire
// anyway, silently nullifying the `required` field on every
// detector that uses it (notably `twilio-auth-token`).
// When the scanner has no companions configured at all, the lookup is
// skipped and an empty map is used.
let companions = if self.companions.is_empty() {
HashMap::new()
} else {
match self.match_companions(entry, preprocessed, line) {
Some(c) => c,
None => return,
}
};
let entropy = match_entropy(credential.as_bytes());
// `generic-private-key` is excluded from the generic entropy/shape gate.
let is_generic =
detector.id.starts_with("generic-") && detector.id != "generic-private-key";
let is_weakly_anchored = weak_anchor;
if is_generic || is_weakly_anchored {
// Per-detector entropy floor. Structured tokens (UUIDs, short API keys)
// have lower entropy than random strings. A blanket 3.5 floor misses them.
// Weakly anchored detectors look up the floor under the
// `generic-api-key` id instead of their own id.
let floor_id = if is_weakly_anchored {
"generic-api-key"
} else {
detector.id.as_str()
};
let entropy_floor =
generic_entropy_floor(self.config.entropy_threshold, floor_id, credential.len());
if entropy < entropy_floor {
return;
}
// camelCase-without-digits is the false-positive shape (Java/Go
// identifiers like `getUserName`); real tokens almost always carry
// a digit. The cheap digit scan (ASCII bytes, no UTF-8 decode via
// `chars()`) runs first so any credential containing a digit skips
// the O(n) camel-transition window walk entirely. Only no-digit
// credentials pay for the count, and `take(2)` stops it as soon as
// the >=2 threshold is reached. Behavior is identical to the prior
// `transitions >= 2 && !has_digit` gate.
if !credential.bytes().any(|b| b.is_ascii_digit()) {
let camel_transitions = credential
.as_bytes()
.windows(2)
.filter(|w| w[0].is_ascii_lowercase() && w[1].is_ascii_uppercase())
.take(2)
.count();
if camel_transitions >= 2 {
return;
}
}
}
// Checksum validation: tokens with embedded checksums (GitHub, npm, Slack,
// Stripe, GitLab, PyPI) can be verified without network requests.
// A valid checksum floors the confidence at `CHECKSUM_VALID_FLOOR` in the
// `Final` arm below; an invalid checksum drops the match right here.
let checksum_result = crate::checksum::validate_checksum(credential);
if checksum_result == crate::checksum::ChecksumResult::Invalid {
// Checksum failed: NOT a real token. Skip expensive ML scoring.
return;
}
// A named, service-anchored detector (anything that is not a
// generic-* / entropy-* / private-key fallback) carries positive
// evidence in its own regex: its match IS the credential. The
// probabilistic "looks_promising" gate in `calculate_final_score`
// is built to reject low-diversity / UUID / structured strings for
// the GENERIC entropy path - applied to a named detector it slams
// legitimate UUID/hex API keys (Heroku, Braze, Codecov, Consul,
// Linode, Databricks, +100 others) to 0.1, below the 0.3 report
// floor, silently deleting real secrets. Mirror the same anchor=
// positive-evidence rule the shape-gate bypass already uses so the
// gate stays load-bearing for generic-* but never buries a named hit.
let is_named_detector =
crate::confidence::is_service_anchored_detector(&detector.id) && !weak_anchor;
// `match_confidence` returning `None` drops the match; otherwise the
// result selects immediate reporting or deferred ML scoring.
let Some(score_result) = self.match_confidence(
entry,
chunk,
credential,
data,
line,
entropy,
!companions.is_empty(),
inferred_context,
keyword_nearby,
sensitive_file,
is_named_detector,
scan_state,
) else {
return;
};
match score_result {
// Confidence is already final: build the raw match and record it now.
super::MlScoreResult::Final(mut confidence) => {
// Boost confidence for checksum-validated tokens (single
// source of truth for the floor; see `checksum::CHECKSUM_VALID_FLOOR`).
if checksum_result == crate::checksum::ChecksumResult::Valid {
confidence = confidence.max(crate::checksum::CHECKSUM_VALID_FLOOR);
}
// The local start offset and line are shifted by the caller-supplied
// `base_offset` / `base_line` before being stored.
let raw_match = build_raw_match(
detector,
self.interned_detector_metadata(entry.detector_index),
chunk,
credential,
companions,
match_start + base_offset,
line + base_line,
entropy,
confidence,
scan_state,
entry.client_safe,
);
scan_state.push_match(raw_match, self.config.max_matches_per_chunk);
crate::telemetry::record_match_found();
}
// ML scoring is deferred: the raw match is built with the heuristic
// confidence and queued in `scan_state.ml_pending`. The checksum floor
// applied in the `Final` arm is not applied in this arm.
#[cfg(feature = "ml")]
super::MlScoreResult::Pending {
heuristic_conf,
code_context,
credential: pending_credential,
ml_context,
} => {
let raw_match = build_raw_match(
detector,
self.interned_detector_metadata(entry.detector_index),
chunk,
credential,
companions,
match_start + base_offset,
line + base_line,
entropy,
heuristic_conf,
scan_state,
entry.client_safe,
);
scan_state.ml_pending.push(crate::types::MlPendingMatch {
raw_match,
heuristic_conf,
code_context,
credential: pending_credential.into_owned(),
ml_context: ml_context.into_owned(),
// Detector/generic matches: the firing regex is positive
// evidence, so the heuristic stays a confidence FLOOR (the
// model can only raise). Not model-authoritative.
model_authoritative: false,
});
crate::telemetry::record_match_found();
}
// Without the `ml` feature this arm covers the placeholder variant so
// the match stays exhaustive.
#[cfg(not(feature = "ml"))]
super::MlScoreResult::_Lifetime(_) => {
unreachable!("_Lifetime is a never-constructed placeholder variant")
}
}
}
}
/// Reports whether a hex `credential` is merely a piece of a longer hash.
///
/// The credential is expected to occupy `data[start..end]`. The function
/// measures the unbroken run of ASCII hex digits that touches the match on
/// either side and answers `true` only when
///
/// * the credential itself is at least 16 bytes of pure ASCII hex,
/// * at least one neighbouring byte (directly before `start` or directly at
///   `end`) is also a hex digit, and
/// * the combined run (digits before + credential + digits after) reaches
///   digest length, i.e. 40 or more characters (SHA-1 / git commit SHA /
///   SHA-256).
///
/// A standalone fixed-length hex key is bounded by non-hex bytes (quote, `=`,
/// whitespace, end of input) on both sides, so it is never reported as a
/// fragment regardless of its own length. Offsets that do not describe a
/// valid range inside `data` (`start > end` or `end` past the end of `data`)
/// yield `false` rather than panicking.
pub(super) fn is_hex_digest_fragment(
    data: &str,
    start: usize,
    end: usize,
    credential: &str,
) -> bool {
    // Shortest credential this guard considers at all.
    const MIN_TOKEN_LEN: usize = 16;
    // Shortest contiguous hex run treated as a digest (SHA-1 length).
    const DIGEST_LEN: usize = 40;

    // Number of leading items of `bytes` that are ASCII hex digits.
    fn hex_run_len<'a>(bytes: impl Iterator<Item = &'a u8>) -> usize {
        bytes.take_while(|byte| byte.is_ascii_hexdigit()).count()
    }

    let token = credential.as_bytes();
    let is_hex_candidate =
        token.len() >= MIN_TOKEN_LEN && token.iter().all(u8::is_ascii_hexdigit);
    if !is_hex_candidate || start > end {
        return false;
    }

    // Checked slicing: an `end` beyond the input makes the suffix lookup fail.
    let haystack = data.as_bytes();
    let (Some(head), Some(tail)) = (haystack.get(..start), haystack.get(end..)) else {
        return false;
    };

    // Walk outwards from the match: backwards through the prefix, forwards
    // through the suffix.
    let leading = hex_run_len(head.iter().rev());
    let trailing = hex_run_len(tail.iter());

    let is_extended = leading > 0 || trailing > 0;
    is_extended && leading + token.len() + trailing >= DIGEST_LEN
}