1use crate::common::{compile_regex, confidence, context_boost, is_boundary};
2use cloakrs_core::{Confidence, EntityType, Locale, PiiEntity, Recognizer, Span};
3use once_cell::sync::Lazy;
4use regex::Regex;
5
6static AWS_ACCESS_KEY_REGEX: Lazy<Regex> = Lazy::new(|| compile_regex(r"\bAKIA[A-Z0-9]{16}\b"));
7static JWT_REGEX: Lazy<Regex> =
8 Lazy::new(|| compile_regex(r"\b[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\b"));
9static GENERIC_API_KEY_REGEX: Lazy<Regex> = Lazy::new(|| {
10 compile_regex(
11 r#"(?i)\b(?:api[_-]?key|access[_-]?token|token|secret|authorization)\b\s*(?::|=|=>)\s*(?:bearer\s+)?["']?([A-Za-z0-9][A-Za-z0-9_\-+/=]{19,})["']?"#,
12 )
13});
14
/// Context words passed to `context_boost` by every recognizer in this file when it
/// computes the confidence of a finding.
// NOTE(review): how `context_boost` searches for these words (window size, case
// handling) is defined in `crate::common` and is not visible here — confirm there.
const SECRET_CONTEXT_WORDS: &[&str] = &[
    "api_key",
    "api key",
    "access_token",
    "token",
    "secret",
    "authorization",
    "bearer",
    "credential",
];
25
26#[derive(Debug, Clone, Copy, Default)]
38pub struct AwsAccessKeyRecognizer;
39
40impl Recognizer for AwsAccessKeyRecognizer {
41 fn id(&self) -> &str {
42 "aws_access_key_v1"
43 }
44
45 fn entity_type(&self) -> EntityType {
46 EntityType::AwsAccessKey
47 }
48
49 fn supported_locales(&self) -> &[Locale] {
50 &[]
51 }
52
53 fn scan(&self, text: &str) -> Vec<PiiEntity> {
54 AWS_ACCESS_KEY_REGEX
55 .find_iter(text)
56 .filter(|matched| self.is_valid_match(text, matched.start(), matched.end()))
57 .map(|matched| PiiEntity {
58 entity_type: self.entity_type(),
59 span: Span::new(matched.start(), matched.end()),
60 text: matched.as_str().to_string(),
61 confidence: self.compute_confidence(text, matched.start()),
62 recognizer_id: self.id().to_string(),
63 })
64 .collect()
65 }
66
67 fn validate(&self, candidate: &str) -> bool {
68 candidate.len() == 20
69 && candidate.starts_with("AKIA")
70 && candidate
71 .chars()
72 .all(|c| c.is_ascii_uppercase() || c.is_ascii_digit())
73 }
74}
75
76impl AwsAccessKeyRecognizer {
77 fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
78 self.validate(&text[start..end]) && is_boundary(text, start, end)
79 }
80
81 fn compute_confidence(&self, text: &str, start: usize) -> Confidence {
82 confidence(0.99 + context_boost(text, start, SECRET_CONTEXT_WORDS))
83 }
84}
85
86#[derive(Debug, Clone, Copy, Default)]
99pub struct JwtRecognizer;
100
101impl Recognizer for JwtRecognizer {
102 fn id(&self) -> &str {
103 "jwt_regex_v1"
104 }
105
106 fn entity_type(&self) -> EntityType {
107 EntityType::Jwt
108 }
109
110 fn supported_locales(&self) -> &[Locale] {
111 &[]
112 }
113
114 fn scan(&self, text: &str) -> Vec<PiiEntity> {
115 JWT_REGEX
116 .find_iter(text)
117 .filter(|matched| self.is_valid_match(text, matched.start(), matched.end()))
118 .map(|matched| PiiEntity {
119 entity_type: self.entity_type(),
120 span: Span::new(matched.start(), matched.end()),
121 text: matched.as_str().to_string(),
122 confidence: self.compute_confidence(text, matched.start()),
123 recognizer_id: self.id().to_string(),
124 })
125 .collect()
126 }
127
128 fn validate(&self, candidate: &str) -> bool {
129 let mut parts = candidate.split('.');
130 let Some(header) = parts.next() else {
131 return false;
132 };
133 let Some(payload) = parts.next() else {
134 return false;
135 };
136 let Some(signature) = parts.next() else {
137 return false;
138 };
139 parts.next().is_none()
140 && header.starts_with("ey")
141 && payload.starts_with("ey")
142 && validate_jwt_segment(header, 8)
143 && validate_jwt_segment(payload, 8)
144 && validate_jwt_segment(signature, 8)
145 }
146}
147
148impl JwtRecognizer {
149 fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
150 self.validate(&text[start..end]) && is_secret_boundary(text, start, end)
151 }
152
153 fn compute_confidence(&self, text: &str, start: usize) -> Confidence {
154 confidence(0.92 + context_boost(text, start, SECRET_CONTEXT_WORDS))
155 }
156}
157
158#[derive(Debug, Clone, Copy, Default)]
170pub struct ApiKeyRecognizer;
171
172impl Recognizer for ApiKeyRecognizer {
173 fn id(&self) -> &str {
174 "api_key_context_v1"
175 }
176
177 fn entity_type(&self) -> EntityType {
178 EntityType::ApiKey
179 }
180
181 fn supported_locales(&self) -> &[Locale] {
182 &[]
183 }
184
185 fn scan(&self, text: &str) -> Vec<PiiEntity> {
186 GENERIC_API_KEY_REGEX
187 .captures_iter(text)
188 .filter_map(|captures| captures.get(1))
189 .filter(|matched| self.is_valid_match(text, matched.start(), matched.end()))
190 .map(|matched| PiiEntity {
191 entity_type: self.entity_type(),
192 span: Span::new(matched.start(), matched.end()),
193 text: matched.as_str().to_string(),
194 confidence: self.compute_confidence(text, matched.start(), matched.as_str()),
195 recognizer_id: self.id().to_string(),
196 })
197 .collect()
198 }
199
200 fn validate(&self, candidate: &str) -> bool {
201 validate_generic_secret(candidate)
202 }
203}
204
205impl ApiKeyRecognizer {
206 fn is_valid_match(&self, text: &str, start: usize, end: usize) -> bool {
207 self.validate(&text[start..end]) && is_secret_boundary(text, start, end)
208 }
209
210 fn compute_confidence(&self, text: &str, start: usize, candidate: &str) -> Confidence {
211 let base = if looks_structured_secret(candidate) {
212 0.85
213 } else {
214 0.75
215 };
216 confidence(base + context_boost(text, start, SECRET_CONTEXT_WORDS))
217 }
218}
219
/// Returns `true` when `segment` is at least `min_len` bytes long and every character
/// is an ASCII letter, an ASCII digit, `_` or `-`.
fn validate_jwt_segment(segment: &str, min_len: usize) -> bool {
    if segment.len() < min_len {
        return false;
    }
    // Any byte of a multi-byte character is >= 0x80 and therefore fails the ASCII
    // checks, so scanning bytes gives the same answer as scanning chars.
    segment
        .bytes()
        .all(|byte| byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'-')
}
226
/// Decides whether `candidate` is plausible as a generic secret value.
///
/// Surrounding quotes, commas and semicolons are trimmed first. The remainder must
/// then be at least twenty bytes long, consist only of ASCII letters, digits and
/// `_ - + / =`, and contain at least two distinct characters (so padding such as
/// `aaaaaaaaaaaaaaaaaaaa` is rejected).
fn validate_generic_secret(candidate: &str) -> bool {
    let trimmed = candidate.trim_matches(|c: char| matches!(c, '"' | '\'' | ',' | ';'));
    if trimmed.len() < 20 {
        return false;
    }
    // Take the first *character* instead of reinterpreting the first UTF-8 byte as a
    // code point; this stays correct for any input and has no indexing panic path.
    let Some(first) = trimmed.chars().next() else {
        return false;
    };
    let allowed =
        |c: char| c.is_ascii_alphanumeric() || matches!(c, '_' | '-' | '+' | '/' | '=');
    trimmed.chars().all(allowed) && trimmed.chars().any(|c| c != first)
}
235
/// Returns `true` when `candidate` contains at least one ASCII letter, at least one
/// ASCII digit and at least one of the symbols `_ - + / =`.
fn looks_structured_secret(candidate: &str) -> bool {
    let (mut saw_letter, mut saw_digit, mut saw_symbol) = (false, false, false);
    for c in candidate.chars() {
        match c {
            'a'..='z' | 'A'..='Z' => saw_letter = true,
            '0'..='9' => saw_digit = true,
            '_' | '-' | '+' | '/' | '=' => saw_symbol = true,
            _ => {}
        }
    }
    saw_letter && saw_digit && saw_symbol
}
244
245fn is_secret_boundary(text: &str, start: usize, end: usize) -> bool {
246 let before = text[..start].chars().next_back();
247 let after = text[end..].chars().next();
248 !before.is_some_and(is_secret_prefix_char) && !after.is_some_and(is_secret_suffix_char)
249}
250
/// Returns `true` for characters that, when found directly before a candidate, mean
/// the candidate is part of a longer token: ASCII letters and digits plus
/// `_ - + / .`.
fn is_secret_prefix_char(c: char) -> bool {
    match c {
        '_' | '-' | '+' | '/' | '.' => true,
        other => other.is_ascii_alphanumeric(),
    }
}
254
/// Returns `true` for characters that, when found directly after a candidate, mean the
/// candidate is part of a longer token: ASCII letters and digits plus
/// `_ - + / = .`.
fn is_secret_suffix_char(c: char) -> bool {
    match c {
        '_' | '-' | '+' | '/' | '=' | '.' => true,
        other => other.is_ascii_alphanumeric(),
    }
}
258
#[cfg(test)]
mod tests {
    use super::*;
    use crate::default_registry;

    /// JWT-shaped sample shared by several tests below.
    const SAMPLE_JWT: &str = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.abc123456789_xyz";

    /// Runs `recognizer` over `input` and keeps only the matched substrings.
    fn matched_texts(recognizer: &impl Recognizer, input: &str) -> Vec<String> {
        let mut texts = Vec::new();
        for finding in recognizer.scan(input) {
            texts.push(finding.text);
        }
        texts
    }

    fn aws_texts(input: &str) -> Vec<String> {
        matched_texts(&AwsAccessKeyRecognizer, input)
    }

    fn jwt_texts(input: &str) -> Vec<String> {
        matched_texts(&JwtRecognizer, input)
    }

    fn api_key_texts(input: &str) -> Vec<String> {
        matched_texts(&ApiKeyRecognizer, input)
    }

    // --- AWS access keys ---

    #[test]
    fn test_aws_access_key_valid_detected() {
        let found = aws_texts("aws AKIAIOSFODNN7EXAMPLE");
        assert_eq!(found, ["AKIAIOSFODNN7EXAMPLE"]);
    }

    #[test]
    fn test_aws_access_key_token_context_detected() {
        let found = aws_texts("access token AKIA1234567890ABCDEF");
        assert_eq!(found, ["AKIA1234567890ABCDEF"]);
    }

    #[test]
    fn test_aws_access_key_lowercase_rejected() {
        let found = aws_texts("akiaiosfodnn7example");
        assert!(found.is_empty());
    }

    #[test]
    fn test_aws_access_key_too_short_rejected() {
        let found = aws_texts("AKIAIOSFODNN7EXAMP");
        assert!(found.is_empty());
    }

    #[test]
    fn test_aws_access_key_embedded_in_word_rejected() {
        let found = aws_texts("idAKIAIOSFODNN7EXAMPLE");
        assert!(found.is_empty());
    }

    #[test]
    fn test_aws_access_key_context_boosts_confidence() {
        let recognizer = AwsAccessKeyRecognizer;
        let with_context = recognizer.scan("secret AKIAIOSFODNN7EXAMPLE");
        let without_context = recognizer.scan("value AKIAIOSFODNN7EXAMPLE");
        assert!(with_context[0].confidence >= without_context[0].confidence);
    }

    // --- JWTs ---

    #[test]
    fn test_jwt_valid_detected() {
        assert_eq!(jwt_texts(SAMPLE_JWT), [SAMPLE_JWT]);
    }

    #[test]
    fn test_jwt_with_bearer_context_detected() {
        let token = "eyJ0eXAiOiJKV1QifQ.eyJyb2xlIjoiYWRtaW4ifQ.signature_123456";
        let input = format!("Bearer {token}");
        assert_eq!(jwt_texts(&input), [token]);
    }

    #[test]
    fn test_jwt_short_version_like_value_rejected() {
        let found = jwt_texts("version 1.2.3");
        assert!(found.is_empty());
    }

    #[test]
    fn test_jwt_two_segments_rejected() {
        let found = jwt_texts("eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM");
        assert!(found.is_empty());
    }

    #[test]
    fn test_jwt_embedded_in_larger_secret_rejected() {
        let token = "xeyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.abc123456789_xyz";
        let found = jwt_texts(token);
        assert!(found.is_empty());
    }

    #[test]
    fn test_jwt_context_boosts_confidence() {
        let recognizer = JwtRecognizer;
        let labelled = format!("authorization: bearer {SAMPLE_JWT}");
        let with_context = recognizer.scan(&labelled);
        let without_context = recognizer.scan(SAMPLE_JWT);
        assert!(with_context[0].confidence > without_context[0].confidence);
    }

    // --- Generic API keys ---

    #[test]
    fn test_api_key_after_api_key_label_detected() {
        let found = api_key_texts("api_key = sk_live_0123456789abcdef");
        assert_eq!(found, ["sk_live_0123456789abcdef"]);
    }

    #[test]
    fn test_api_key_after_token_label_detected() {
        let found = api_key_texts("token: abcdef1234567890ABCDEF12");
        assert_eq!(found, ["abcdef1234567890ABCDEF12"]);
    }

    #[test]
    fn test_api_key_after_authorization_bearer_detected() {
        let found = api_key_texts("Authorization: Bearer abcdef1234567890ABCDEF12");
        assert_eq!(found, ["abcdef1234567890ABCDEF12"]);
    }

    #[test]
    fn test_api_key_after_secret_label_detected() {
        let found = api_key_texts("secret=ZXhhbXBsZS1zZWNyZXQtdmFsdWU=");
        assert_eq!(found, ["ZXhhbXBsZS1zZWNyZXQtdmFsdWU="]);
    }

    #[test]
    fn test_api_key_without_context_rejected() {
        let found = api_key_texts("value abcdef1234567890ABCDEF12");
        assert!(found.is_empty());
    }

    #[test]
    fn test_api_key_short_value_rejected() {
        let found = api_key_texts("api_key=abc123");
        assert!(found.is_empty());
    }

    #[test]
    fn test_api_key_repeated_value_rejected() {
        let found = api_key_texts("api_key=aaaaaaaaaaaaaaaaaaaa");
        assert!(found.is_empty());
    }

    #[test]
    fn test_api_key_context_boosts_confidence() {
        // Compares a value mixing letters, digits and symbols against a plain
        // alphanumeric one.
        let recognizer = ApiKeyRecognizer;
        let structured = recognizer.scan("api_key=sk_live_0123456789abcdef");
        let plain = recognizer.scan("token=abcdef1234567890ABCDEF12");
        assert!(structured[0].confidence > plain[0].confidence);
    }

    // --- Registry integration ---

    #[test]
    fn test_secret_default_registry_detects_all_secret_types() {
        let findings = default_registry().scan_all(concat!(
            "aws AKIAIOSFODNN7EXAMPLE\n",
            "jwt eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiIxMjM0NTY3ODkwIn0.abc123456789_xyz\n",
            "api_key=sk_live_0123456789abcdef\n",
        ));
        let reported =
            |wanted: EntityType| findings.iter().any(|finding| finding.entity_type == wanted);

        assert!(reported(EntityType::AwsAccessKey));
        assert!(reported(EntityType::Jwt));
        assert!(reported(EntityType::ApiKey));
    }
}