1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
//! Helper functions for fallback_entropy.rs to satisfy line caps.
#[cfg(feature = "entropy")]
/// Heuristic: does `value` have the shape of a short kebab-case identifier?
///
/// Returns `true` only when all of the following hold:
/// * the value is at most 24 bytes long,
/// * it contains at least one `-`,
/// * at least half of its bytes are ASCII lowercase letters,
/// * it contains none of the base64 punctuation bytes `+`, `/`, `=`.
pub(crate) fn entropy_path_looks_like_kebab_identifier(value: &str) -> bool {
    const MAX_IDENTIFIER_LEN: usize = 24;

    let raw = value.as_bytes();

    // Cheap structural rejections first: too long, or no dash at all.
    if raw.len() > MAX_IDENTIFIER_LEN || !raw.contains(&b'-') {
        return false;
    }

    // Lowercase letters must make up at least half of the value.
    let lowercase = raw.iter().filter(|b| b.is_ascii_lowercase()).count();
    if lowercase * 2 < raw.len() {
        return false;
    }

    // Any base64 punctuation disqualifies the value.
    raw.iter().all(|&b| !matches!(b, b'+' | b'/' | b'='))
}
#[cfg(feature = "entropy")]
/// Reports whether `path` looks like a CI/CD pipeline definition.
///
/// Recognised locations (both `/` and `\` separators): `.github/workflows/`,
/// `.github/actions/`, `.circleci/`, and files whose name begins with
/// `.gitlab-ci.yml`, `azure-pipelines`, `bitbucket-pipelines`, `.travis.yml`
/// or `Jenkinsfile`. Any path ending in `.gitlab-ci.yml` or `.travis.yml` is
/// accepted as well.
///
/// A marker matches either after a path separator anywhere in `path` or at
/// the very start of `path`, so root-relative inputs such as
/// `.github/workflows/ci.yml` or a bare `Jenkinsfile` are recognised too.
///
/// Returns `false` when `path` is `None`.
pub(crate) fn entropy_path_is_ci_workflow_file(path: Option<&str>) -> bool {
    // Markers that must begin on a path-component boundary. Each entry is
    // spelled with its leading separator; dropping that first byte yields the
    // form that matches at the start of a root-relative path.
    const BOUNDARY_MARKERS: &[&str] = &[
        "/.github/workflows/",
        "\\.github\\workflows\\",
        "/.github/actions/",
        "\\.github\\actions\\",
        "/.gitlab-ci.yml",
        "\\.gitlab-ci.yml",
        "/.circleci/",
        "\\.circleci\\",
        "/azure-pipelines",
        "\\azure-pipelines",
        "/bitbucket-pipelines",
        "\\bitbucket-pipelines",
        "/.travis.yml",
        "\\.travis.yml",
        "/Jenkinsfile",
        "\\Jenkinsfile",
    ];
    // Suffixes accepted regardless of what precedes them in the path.
    const TRAILING_NAMES: &[&str] = &[".gitlab-ci.yml", ".travis.yml"];

    let Some(p) = path else {
        return false;
    };

    BOUNDARY_MARKERS
        .iter()
        // `marker[1..]` is safe: every marker starts with a one-byte separator.
        .any(|&marker| p.contains(marker) || p.starts_with(&marker[1..]))
        || TRAILING_NAMES.iter().any(|&suffix| p.ends_with(suffix))
}
#[cfg(feature = "entropy")]
/// Reports whether `path` looks like a localisation / translation resource.
///
/// A path qualifies when any of these hold:
/// * it has a directory component named `locale`, `locales`, `i18n`, `l10n`,
///   `translations`, `lang` or `langs` (with `/` or `\` separators). The
///   component may sit anywhere in the path, including at its very start,
///   so root-relative inputs such as `locales/en.json` are recognised;
/// * it ends with `.po` or `.pot`;
/// * its final component starts with `locale_`, `messages_` or `strings_`
///   and ends with `.ini`, `.properties`, `.xml`, `.json`, `.yaml` or `.yml`.
///
/// Returns `false` when `path` is `None`.
pub(crate) fn entropy_path_is_i18n_file(path: Option<&str>) -> bool {
    // Directory markers, each spelled with its leading separator; dropping
    // that first byte yields the form matched at the start of the path.
    const DIRECTORY_MARKERS: &[&str] = &[
        "/locale/",
        "\\locale\\",
        "/locales/",
        "\\locales\\",
        "/i18n/",
        "\\i18n\\",
        "/l10n/",
        "\\l10n\\",
        "/translations/",
        "\\translations\\",
        "/lang/",
        "\\lang\\",
        "/langs/",
        "\\langs\\",
    ];
    const CATALOG_SUFFIXES: &[&str] = &[".po", ".pot"];
    const BUNDLE_PREFIXES: &[&str] = &["locale_", "messages_", "strings_"];
    const BUNDLE_SUFFIXES: &[&str] = &[".ini", ".properties", ".xml", ".json", ".yaml", ".yml"];

    let Some(p) = path else {
        return false;
    };

    let in_i18n_directory = DIRECTORY_MARKERS
        .iter()
        // `marker[1..]` is safe: every marker starts with a one-byte separator.
        .any(|&marker| p.contains(marker) || p.starts_with(&marker[1..]));
    if in_i18n_directory || CATALOG_SUFFIXES.iter().any(|&suffix| p.ends_with(suffix)) {
        return true;
    }

    // Fall back to the file name alone; `rsplit` always yields at least one
    // item, so the `unwrap_or` only guards the type.
    let name = p.rsplit(['/', '\\']).next().unwrap_or(p);
    BUNDLE_PREFIXES.iter().any(|&prefix| name.starts_with(prefix))
        && BUNDLE_SUFFIXES.iter().any(|&suffix| name.ends_with(suffix))
}
#[cfg(feature = "entropy")]
/// Reports whether `value` ends with one of a fixed set of file extensions
/// (config, key/certificate store, lock and log suffixes).
///
/// The comparison is delegated to
/// `crate::ascii_ci::ends_with_ignore_ascii_case`, operating on the raw bytes
/// of `value`.
pub(crate) fn entropy_path_looks_like_filename(value: &str) -> bool {
    const KNOWN_EXTENSIONS: &[&[u8]] = &[
        b".jks",
        b".yml",
        b".yaml",
        b".toml",
        b".json",
        b".properties",
        b".pem",
        b".key",
        b".crt",
        b".cer",
        b".pfx",
        b".p12",
        b".keystore",
        b".truststore",
        b".conf",
        b".ini",
        b".env",
        b".lock",
        b".log",
    ];

    let haystack = value.as_bytes();
    for extension in KNOWN_EXTENSIONS {
        if crate::ascii_ci::ends_with_ignore_ascii_case(haystack, extension) {
            return true;
        }
    }
    false
}
#[cfg(feature = "entropy")]
/// Heuristic: does `value` have the shape of a base64 dump of random bytes?
///
/// Returns `true` only when all of the following hold:
/// * the length is between 50 and 300 bytes inclusive,
/// * the value ends with `=` or its length is a multiple of four,
/// * every character is in the standard base64 alphabet
///   (`A-Z`, `a-z`, `0-9`, `+`, `/`, `=`),
/// * unpadded values contain both `+` and `/`; padded values contain at
///   least one of them.
pub(crate) fn entropy_path_looks_like_random_base64_blob(value: &str) -> bool {
    // The floor is 50 (previously 40) so that base64-shaped credentials of
    // 40-49 characters still reach the entropy fallback. Real-world recall
    // fixtures live in that band (Stripe-style restricted-secret-key bodies,
    // GitHub legacy 40-char auth secrets), while protobuf-of-random-bytes
    // decoys skew larger (median 64 chars per negatives.py: 30-80 random
    // bytes), so the 40-49 band is overwhelmingly real credentials.
    const MIN_BLOB_LEN: usize = 50;
    const MAX_BLOB_LEN: usize = 300;

    let len = value.len();
    if len < MIN_BLOB_LEN || len > MAX_BLOB_LEN {
        return false;
    }

    // A trailing `=` covers both the single- and double-padded forms.
    let padded = value.ends_with('=');
    if !(padded || len % 4 == 0) {
        return false;
    }

    let mut saw_plus = false;
    let mut saw_slash = false;
    for byte in value.bytes() {
        match byte {
            b'+' => saw_plus = true,
            b'/' => saw_slash = true,
            b'=' => {}
            other if other.is_ascii_alphanumeric() => {}
            // Anything outside the base64 alphabet (including every byte of
            // a non-ASCII character) rules the value out.
            _ => return false,
        }
    }

    // Tightened punctuation requirement. Base64 of uniformly random bytes
    // produces both `+` and `/`, whereas restricted-secret-key style
    // positives often contain only one of them, so unpadded values need
    // both. Padded values still trip with just one. This is a per-byte
    // distribution signal, not a structural one.
    if padded {
        saw_plus || saw_slash
    } else {
        saw_plus && saw_slash
    }
}
/// The four synthetic entropy-fallback metadata triples, index-parallel with
/// [`classify_entropy_detector_index`]: the value that function returns is
/// the position of the matching row in this array.
///
/// Each row reads as `(detector id, display name, service)`.
///
/// Single source of truth: the scanner pre-interns this exact table into
/// `entropy_metadata_by_index` at construction so the emit path clones an
/// `Arc<str>` by index instead of re-interning these constants per finding
/// (PERF-locality_intern-1).
#[cfg(feature = "entropy")]
pub(crate) const ENTROPY_DETECTOR_METADATA: [(&str, &str, &str); 4] = [
// Index 0: keyword is exactly "none (high-entropy)".
("entropy-generic", "Generic High-Entropy Secret", "generic"),
// Index 1: keyword contains "password" or "pwd".
("entropy-password", "Password (Entropy Detected)", "generic"),
// Index 2: keyword contains "token".
("entropy-token", "API Token (Entropy Detected)", "generic"),
// Index 3: every other keyword.
("entropy-api-key", "API Key (Entropy Detected)", "generic"),
];
/// Classify an entropy candidate's keyword into the index of its metadata
/// triple in [`ENTROPY_DETECTOR_METADATA`]. The branch order matches the
/// historical keyword→detector mapping, so the resolved detector
/// id/name/service are unchanged; the scanner clones the pre-interned triple
/// at this index at the emit site (PERF-locality_intern-1).
#[cfg(feature = "entropy")]
#[inline]
pub(crate) fn classify_entropy_detector_index(keyword: &str) -> usize {
    // Arms are tried top to bottom, so a keyword containing both "password"
    // and "token" resolves to the password row. Matching is case-sensitive.
    match keyword {
        // Exact marker used for the keyword-free path.
        "none (high-entropy)" => 0,
        k if k.contains("password") || k.contains("pwd") => 1,
        k if k.contains("token") => 2,
        // Everything else falls through to the last row.
        _ => 3,
    }
}
/// True when the entropy candidate's keyword indicates a strong credential
/// anchor was directly responsible for the candidate's extraction. The
/// caller uses this to admit the candidate past the file-extension gate
/// in `scan_entropy_fallback`: if the line carries `api_key=`, `token=`,
/// `password=`, etc., the file extension (source code vs. config) is no
/// longer the deciding signal - the keyword anchor IS positive evidence
/// the value is a credential.
///
/// `keyword == "none (high-entropy)"` is the no-keyword path (very-high
/// entropy threshold was used); it is NOT a credential anchor.
#[cfg(feature = "entropy")]
pub(crate) fn keyword_is_credential_anchor(keyword: &str) -> bool {
    // Marker for the keyword-free path; never an anchor.
    const NO_KEYWORD_MARKER: &str = "none (high-entropy)";
    // Lowercase fragments; a keyword qualifies when it contains any of them.
    // Matching is by substring, so e.g. "auth" also accepts "author".
    const ANCHOR_FRAGMENTS: &[&str] = &[
        "secret",
        "password",
        "passwd",
        "pwd",
        "token",
        "apikey",
        "api_key",
        "api-key",
        "auth",
        "credential",
        "private_key",
        "private-key",
        "privatekey",
        "client_secret",
        "client-secret",
        "access_key",
        "access-key",
        "accesskey",
    ];

    if keyword == NO_KEYWORD_MARKER {
        return false;
    }

    // Fold ASCII case once so `API_KEY`, `apiKey` and `api_key` all match.
    let folded = keyword.to_ascii_lowercase();
    for fragment in ANCHOR_FRAGMENTS.iter() {
        if folded.contains(*fragment) {
            return true;
        }
    }
    false
}
/// Unit tests for the entropy helper predicates in this file.
#[cfg(all(test, feature = "entropy"))]
mod helper_tests {
    use super::*;

    #[test]
    fn credential_anchor_recognizes_common_keywords() {
        // Positive recall for the common credential keyword shapes the
        // entropy scanner emits as `keyword`.
        assert!(keyword_is_credential_anchor("api_key"));
        assert!(keyword_is_credential_anchor("API_KEY"));
        assert!(keyword_is_credential_anchor("apiKey"));
        assert!(keyword_is_credential_anchor("apikey"));
        assert!(keyword_is_credential_anchor("token"));
        assert!(keyword_is_credential_anchor("password"));
        assert!(keyword_is_credential_anchor("client_secret"));
        assert!(keyword_is_credential_anchor("PRIVATE_KEY"));
        assert!(keyword_is_credential_anchor("auth_token"));
    }

    #[test]
    fn credential_anchor_rejects_no_keyword_marker() {
        // Negative twin: the entropy scanner uses this exact string for
        // the keyword-free very-high-entropy path; it must NOT count as
        // a credential anchor (otherwise it would defeat the gate).
        assert!(!keyword_is_credential_anchor("none (high-entropy)"));
    }

    #[test]
    fn credential_anchor_rejects_unrelated_keyword() {
        // Negative twin: an unrelated word must not be admitted.
        assert!(!keyword_is_credential_anchor("description"));
        assert!(!keyword_is_credential_anchor("environment"));
    }

    #[test]
    fn random_b64_blob_admits_pure_alnum_when_mult4_diverse() {
        // Positive: a 54-char, `==`-padded value over the base64 alphabet
        // containing both `+` and `/`. Its length is not a multiple of four,
        // so it is admitted through the padding branch. Kept as a sanity
        // test of current behavior.
        let v = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJ0123456789++//ZZ==";
        assert_eq!(v.len(), 54);
        assert!(entropy_path_looks_like_random_base64_blob(v));
    }

    #[test]
    fn random_b64_blob_rejects_pure_lowercase_low_diversity() {
        // Negative twin: 50 chars of just `a` repeated. 50 is inside the
        // length band but is neither padded nor a multiple of four, so the
        // alignment check rejects it.
        let v: String = "a".repeat(50);
        assert!(!entropy_path_looks_like_random_base64_blob(&v));
        // Aligned variant (52 chars): passes the length and alignment
        // checks and is rejected by the punctuation gate instead, because
        // it contains neither `+` nor `/`.
        let aligned: String = "a".repeat(52);
        assert!(!entropy_path_looks_like_random_base64_blob(&aligned));
    }

    #[test]
    fn random_b64_blob_admits_real_protobuf_dump() {
        // Positive: a real 64-char protobuf-of-random-bytes payload
        // with BOTH + and / - this is the dominant shape the gate
        // exists to suppress; must still fire after the tightened
        // punctuation check (both + and /).
        let v = "Vwqk+gg+vh6Pm9mhPgQU/wJPTbFY6cwjNNFQQVY+8jtl/RGABCDEFGHIJKLMNOPQ";
        assert_eq!(v.len(), 64);
        assert!(entropy_path_looks_like_random_base64_blob(v));
    }

    #[test]
    fn random_b64_blob_releases_short_credential_band() {
        // Positive recall: a 44-char credential body in the 40-49 char
        // band (where Stripe restricted-key bodies and similar named
        // secrets land) is no longer over-suppressed by this gate.
        // The previous 40-char floor swept these into the "looks like
        // a protobuf dump" bucket; new 50-char floor releases them.
        let v = "Hk9PqRsTuVwXyZAbCdEfGhIjKlMnOpQr0123456789ab";
        assert_eq!(v.len(), 44);
        assert!(!entropy_path_looks_like_random_base64_blob(v));
    }

    #[test]
    fn random_b64_blob_rejects_plus_only_no_slash() {
        // Negative twin to the tightened punctuation gate: a 60-char
        // value with `+` only (no `/`, no padding) was previously
        // suppressed under the OR-of-punct rule. With the tightened
        // AND-of-both rule, this releases to the downstream emit path.
        // Real protobuf encodings of random bytes virtually always
        // produce both `+` and `/` because the byte distribution is
        // uniform; a `+`-only base64 is much more likely a credential.
        let v = "AAAABBBBCCCCDDDDEEEEFFFFGGGGHHHH+IIIJJJJKKKKLLLLMMMMNNNNOPQR";
        assert_eq!(v.len(), 60);
        assert!(!entropy_path_looks_like_random_base64_blob(v));
    }
}