1#![deny(unsafe_code)]
12#![warn(missing_docs)]
13#![warn(rust_2018_idioms)]
14
15use rayon::prelude::*;
16use regex::Regex;
17use serde::{Deserialize, Serialize};
18use thiserror::Error;
19
/// Convenience alias for results whose error type is [`ScannerError`].
pub type Result<T> = std::result::Result<T, ScannerError>;
22
/// Errors reported while constructing a [`Scanner`].
#[derive(Error, Debug)]
pub enum ScannerError {
    /// A detection pattern failed to compile; wraps the underlying `regex::Error`.
    #[error("regex error: {0}")]
    Regex(#[from] regex::Error),
    /// The supplied [`ScannerConfig`] was rejected; the payload is a
    /// human-readable description of the problem.
    #[error("invalid config: {0}")]
    InvalidConfig(String),
}
33
/// Settings controlling the high-entropy detection pass of a [`Scanner`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ScannerConfig {
    /// Minimum Shannon entropy (bits per byte) a candidate must reach to be
    /// reported as `HIGH_ENTROPY`. [`Scanner::with_config`] rejects values
    /// below `0` or above `8`.
    pub min_entropy: f32,
    /// Minimum length of a run of `[A-Za-z0-9+/=_-]` characters for it to be
    /// considered by the high-entropy pass.
    pub min_entropy_length: usize,
    /// Whether [`Scanner::scan`] runs the high-entropy pass at all.
    pub include_high_entropy: bool,
}
45
46impl Default for ScannerConfig {
47 fn default() -> Self {
48 Self {
49 min_entropy: 4.5,
50 min_entropy_length: 32,
51 include_high_entropy: true,
52 }
53 }
54}
55
/// A single potential secret located by [`Scanner::scan`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Finding {
    /// Name of the rule that matched (for example `AWS_ACCESS_KEY`), or
    /// `HIGH_ENTROPY` for hits from the entropy pass.
    pub kind: String,
    /// 1-based line number on which the match starts.
    pub line: usize,
    /// 1-based column of the match start, counted in bytes from the start of
    /// its line.
    pub column: usize,
    /// Byte offset of the match start within the scanned source.
    pub start: usize,
    /// Byte offset of the match end within the scanned source, as reported by
    /// the regex match.
    pub end: usize,
    /// The matched text.
    pub matched: String,
    /// Shannon entropy of `matched`, in bits per byte.
    pub entropy: f32,
}
74
75pub struct Scanner {
77 cfg: ScannerConfig,
78 rules: Vec<(&'static str, Regex)>,
79 high_entropy_re: Regex,
80}
81
/// Built-in detection rules as `(kind, regex pattern)` pairs.
///
/// `Scanner::with_config` compiles every pattern, and `Scanner::scan` reports
/// the whole regex match of each rule under its `kind`.
const RULES: &[(&str, &str)] = &[
    // `AKIA` followed by 16 uppercase letters or digits.
    ("AWS_ACCESS_KEY", r"\bAKIA[0-9A-Z]{16}\b"),
    // `gh` + one of `p`, `o`, `u`, `r`, `s` + `_` + at least 36 alphanumerics.
    ("GITHUB_TOKEN", r"\bgh[pours]_[A-Za-z0-9]{36,}\b"),
    // `xox` + one of `b`, `a`, `p`, `r`, `s` + `-` + at least 10 alphanumerics or dashes.
    ("SLACK_TOKEN", r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b"),
    // `sk_`/`pk_`/`rk_`, then `live_` or `test_`, then at least 20 alphanumerics.
    (
        "STRIPE_KEY",
        r"\b(?:sk|pk|rk)_(?:live|test)_[A-Za-z0-9]{20,}\b",
    ),
    // Three dot-separated segments; the first two must start with `eyJ`.
    (
        "JWT",
        r"\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b",
    ),
    // Literal PEM header lines; only the header itself is matched.
    ("RSA_PRIVATE_KEY", r"-----BEGIN RSA PRIVATE KEY-----"),
    ("SSH_PRIVATE_KEY", r"-----BEGIN OPENSSH PRIVATE KEY-----"),
    // Case-insensitive `api_key` / `api-key` / `apikey` assigned with `=` or
    // `:` to a quoted value of at least 16 characters. The capture group holds
    // the value, but the scanner reports the whole assignment as the match.
    (
        "GENERIC_API_KEY",
        r#"(?i)\bapi[_-]?key\s*[=:]\s*['"]([A-Za-z0-9_\-=]{16,})['"]"#,
    ),
    // `sk-`, an optional `proj-`, then at least 20 characters of `[A-Za-z0-9_-]`.
    // NOTE(review): this also accepts `sk-ant-...` strings, so such a key is
    // reported under both OPENAI_KEY and ANTHROPIC_KEY.
    ("OPENAI_KEY", r"\bsk-(?:proj-)?[A-Za-z0-9_-]{20,}\b"),
    // `sk-ant-`, an optional `api03-` or `sid01-`, then at least 20 characters.
    (
        "ANTHROPIC_KEY",
        r"\bsk-ant-(?:api03-|sid01-)?[A-Za-z0-9_-]{20,}\b",
    ),
    // `SK` followed by exactly 32 hexadecimal digits.
    // NOTE(review): the kind says "auth token" while the pattern requires an
    // `SK` prefix; confirm which credential type is intended.
    ("TWILIO_AUTH_TOKEN", r"\bSK[a-fA-F0-9]{32}\b"),
    // `SG.` + 22 characters + `.` + 43 characters of `[A-Za-z0-9_-]`.
    (
        "SENDGRID_KEY",
        r"\bSG\.[A-Za-z0-9_-]{22}\.[A-Za-z0-9_-]{43}\b",
    ),
];
123
124impl Scanner {
125 pub fn new() -> Self {
127 Self::with_config(ScannerConfig::default()).expect("default config compiles")
128 }
129
130 pub fn with_config(cfg: ScannerConfig) -> Result<Self> {
132 if cfg.min_entropy < 0.0 || cfg.min_entropy > 8.0 {
133 return Err(ScannerError::InvalidConfig(format!(
134 "min_entropy out of range [0, 8]: {}",
135 cfg.min_entropy
136 )));
137 }
138 let rules: Vec<(&'static str, Regex)> = RULES
139 .iter()
140 .map(|(k, p)| Regex::new(p).map(|r| (*k, r)))
141 .collect::<std::result::Result<_, _>>()?;
142 let pat = format!(r"[A-Za-z0-9+/=_\-]{{{},}}", cfg.min_entropy_length);
145 let high_entropy_re = Regex::new(&pat)?;
146 Ok(Self {
147 cfg,
148 rules,
149 high_entropy_re,
150 })
151 }
152
153 pub fn scan(&self, source: &str) -> Vec<Finding> {
155 let mut findings: Vec<Finding> = Vec::new();
156 let mut covered: Vec<(usize, usize)> = Vec::new();
157
158 for (kind, regex) in &self.rules {
160 for m in regex.find_iter(source) {
161 let entropy = shannon_entropy(m.as_str());
162 let (line, column) = line_col(source, m.start());
163 findings.push(Finding {
164 kind: (*kind).to_string(),
165 line,
166 column,
167 start: m.start(),
168 end: m.end(),
169 matched: m.as_str().to_string(),
170 entropy,
171 });
172 covered.push((m.start(), m.end()));
173 }
174 }
175
176 if self.cfg.include_high_entropy {
178 for m in self.high_entropy_re.find_iter(source) {
179 if overlaps(&covered, m.start(), m.end()) {
180 continue;
181 }
182 let entropy = shannon_entropy(m.as_str());
183 if entropy < self.cfg.min_entropy {
184 continue;
185 }
186 let (line, column) = line_col(source, m.start());
187 findings.push(Finding {
188 kind: "HIGH_ENTROPY".to_string(),
189 line,
190 column,
191 start: m.start(),
192 end: m.end(),
193 matched: m.as_str().to_string(),
194 entropy,
195 });
196 }
197 }
198
199 findings.sort_by_key(|f| f.start);
200 findings
201 }
202
203 pub fn scan_many(&self, sources: &[&str], parallel: bool) -> Vec<Vec<Finding>> {
205 if parallel {
206 sources.par_iter().map(|s| self.scan(s)).collect()
207 } else {
208 sources.iter().map(|s| self.scan(s)).collect()
209 }
210 }
211}
212
213impl Default for Scanner {
214 fn default() -> Self {
215 Self::new()
216 }
217}
218
/// Computes the Shannon entropy of `s` in bits per byte.
///
/// The distribution is taken over the UTF-8 bytes of `s`, not its characters.
/// An empty string yields `0.0`.
fn shannon_entropy(s: &str) -> f32 {
    let bytes = s.as_bytes();
    if bytes.is_empty() {
        return 0.0;
    }

    // Histogram of byte values.
    let mut histogram = [0u32; 256];
    for &byte in bytes {
        histogram[usize::from(byte)] += 1;
    }

    let total = histogram.iter().sum::<u32>() as f32;

    // Accumulate -sum(p * log2(p)) over the byte values that occur, in
    // ascending byte order.
    histogram
        .iter()
        .filter(|&&count| count != 0)
        .map(|&count| count as f32 / total)
        .fold(0.0_f32, |acc, p| acc - p * p.log2())
}
240
/// Converts a byte offset into a 1-based `(line, column)` pair.
///
/// Lines are separated by `\n`. The column is the byte distance from the
/// start of the line plus one.
fn line_col(source: &str, byte_offset: usize) -> (usize, usize) {
    let bytes = source.as_bytes();
    // Only the text before the offset decides the position.
    let prefix = &bytes[..byte_offset.min(bytes.len())];

    let line = 1 + prefix.iter().filter(|&&b| b == b'\n').count();
    // The current line starts just past the last newline, or at 0 if none.
    let line_start = prefix
        .iter()
        .rposition(|&b| b == b'\n')
        .map_or(0, |idx| idx + 1);

    (line, byte_offset - line_start + 1)
}
252
/// Reports whether the span `start..end` intersects any `(start, end)` pair
/// in `ranges`. Spans that merely touch at an endpoint do not intersect.
fn overlaps(ranges: &[(usize, usize)], start: usize, end: usize) -> bool {
    for &(lo, hi) in ranges {
        // Two spans intersect when each begins before the other ends.
        if lo < end && start < hi {
            return true;
        }
    }
    false
}
256
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `src` with a default-configured scanner.
    fn scan_default(src: &str) -> Vec<Finding> {
        Scanner::new().scan(src)
    }

    /// Returns true when any finding carries the given kind.
    fn has_kind(findings: &[Finding], kind: &str) -> bool {
        findings.iter().any(|f| f.kind == kind)
    }

    #[test]
    fn aws_key_detected() {
        let r = scan_default("aws = AKIAIOSFODNN7EXAMPLE\n");
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].kind, "AWS_ACCESS_KEY");
        assert_eq!(r[0].line, 1);
    }

    #[test]
    fn github_token_detected() {
        let token = "ghp_abcdefghijklmnopqrstuvwxyz0123456789";
        let r = scan_default(&format!("token = {token}\n"));
        assert!(has_kind(&r, "GITHUB_TOKEN"));
    }

    #[test]
    fn slack_token_detected() {
        let r = scan_default("slack = xoxb-1234567890-abcdef\n");
        assert!(has_kind(&r, "SLACK_TOKEN"));
    }

    #[test]
    fn stripe_key_detected() {
        let r = scan_default("STRIPE = sk_live_abcdefghij1234567890\n");
        assert!(has_kind(&r, "STRIPE_KEY"));
    }

    #[test]
    fn jwt_detected() {
        let jwt = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1MSJ9.signature_part_long_enough";
        let r = scan_default(&format!("auth = '{jwt}'"));
        assert!(has_kind(&r, "JWT"));
    }

    #[test]
    fn openai_key_classic_detected() {
        let key = "sk-abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKL";
        let r = scan_default(&format!("OPENAI = {key}\n"));
        assert!(has_kind(&r, "OPENAI_KEY"));
    }

    #[test]
    fn openai_key_project_scoped_detected() {
        let key = "sk-proj-abcdefghij_KLMNOPQRSTU-vwxyz0123456789";
        let r = scan_default(&format!("OPENAI = {key}\n"));
        assert!(has_kind(&r, "OPENAI_KEY"));
    }

    #[test]
    fn anthropic_key_detected() {
        let key = "sk-ant-api03-abcdefghijklmnopqrstuvwxyz0123456789ABCD";
        let r = scan_default(&format!("ANTHROPIC = {key}\n"));
        assert!(has_kind(&r, "ANTHROPIC_KEY"));
    }

    #[test]
    fn twilio_auth_token_detected() {
        let key = "SK0123456789abcdef0123456789abcdef";
        let r = scan_default(&format!("TWILIO = {key}\n"));
        assert!(has_kind(&r, "TWILIO_AUTH_TOKEN"));
    }

    #[test]
    fn sendgrid_key_detected() {
        let body22 = "abcdefghijklmnopqrstuv";
        let sig43 = "abcdefghijklmnopqrstuvwxyz0123456789ABCDEFG";
        let key = format!("SG.{body22}.{sig43}");
        let r = scan_default(&format!("SG = {key}\n"));
        assert!(has_kind(&r, "SENDGRID_KEY"));
    }

    #[test]
    fn rsa_marker_detected() {
        let r = scan_default("-----BEGIN RSA PRIVATE KEY-----\nMII...\n");
        assert!(has_kind(&r, "RSA_PRIVATE_KEY"));
    }

    #[test]
    fn ssh_marker_detected() {
        let r = scan_default("-----BEGIN OPENSSH PRIVATE KEY-----\nb3Bl...\n");
        assert!(has_kind(&r, "SSH_PRIVATE_KEY"));
    }

    #[test]
    fn generic_api_key_assignment_detected() {
        let r = scan_default(r#"api_key = "abcdefghijklmnopqrst""#);
        assert!(has_kind(&r, "GENERIC_API_KEY"));
    }

    #[test]
    fn high_entropy_detected() {
        let blob = "K3s9Q2pXq9ZTm4Lp2Vw7Yc1RnFb5Xh6N";
        let r = scan_default(&format!("token = '{blob}'"));
        assert!(has_kind(&r, "HIGH_ENTROPY"));
    }

    #[test]
    fn low_entropy_skipped() {
        let blob = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        let r = scan_default(&format!("v = '{blob}'"));
        assert!(!has_kind(&r, "HIGH_ENTROPY"));
    }

    #[test]
    fn no_finding_on_clean_source() {
        let r = scan_default("def add(a: int, b: int) -> int:\n return a + b\n");
        assert!(r.is_empty());
    }

    #[test]
    fn line_column_correct_on_multiline() {
        let src = "line1\nline2 AKIAIOSFODNN7EXAMPLE\nline3\n";
        let r = scan_default(src);
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].line, 2);
        assert_eq!(r[0].column, 7);
    }

    #[test]
    fn findings_sorted_by_position() {
        let src = "ghp_abcdefghijklmnopqrstuvwxyz0123456789 then AKIAIOSFODNN7EXAMPLE";
        let r = scan_default(src);
        assert!(r.len() >= 2);
        for w in r.windows(2) {
            assert!(w[0].start <= w[1].start);
        }
    }

    #[test]
    fn high_entropy_does_not_double_up_on_known_pattern() {
        let token = "ghp_abcdefghijklmnopqrstuvwxyz0123456789";
        let src = format!("t = '{token}'");
        let token_start = src.find(token).expect("token is embedded in src");
        let token_end = token_start + token.len();

        let r = scan_default(&src);
        assert!(has_kind(&r, "GITHUB_TOKEN"));

        for f in r.iter().filter(|f| f.kind == "HIGH_ENTROPY") {
            // Any intersection with the token span counts as doubling up,
            // not only a finding nested entirely inside it.
            assert!(
                f.end <= token_start || f.start >= token_end,
                "HIGH_ENTROPY overlaps GITHUB_TOKEN at {}..{}",
                f.start,
                f.end
            );
        }
    }

    #[test]
    fn invalid_entropy_threshold_rejected() {
        let cfg = ScannerConfig {
            min_entropy: 100.0,
            ..Default::default()
        };
        assert!(Scanner::with_config(cfg).is_err());
    }

    #[test]
    fn high_entropy_can_be_disabled() {
        let cfg = ScannerConfig {
            include_high_entropy: false,
            ..Default::default()
        };
        let s = Scanner::with_config(cfg).unwrap();
        let blob = "K3s9Q2pXq9ZTm4Lp2Vw7Yc1RnFb5Xh6N";
        let r = s.scan(&format!("token = '{blob}'"));
        assert!(!has_kind(&r, "HIGH_ENTROPY"));
    }

    #[test]
    fn scan_many_serial_and_parallel_match() {
        let s = Scanner::new();
        let sources: Vec<&str> = vec!["aws = AKIAIOSFODNN7EXAMPLE", "no secret here"];
        let a = s.scan_many(&sources, false);
        let b = s.scan_many(&sources, true);
        assert_eq!(a, b);
        assert_eq!(a[0].len(), 1);
        assert_eq!(a[1].len(), 0);
    }

    #[test]
    fn shannon_entropy_zero_for_constant_string() {
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_max_for_equal_distribution() {
        // Four symbols, each with probability 1/4, give exactly 2 bits.
        let e = shannon_entropy("abcdabcd");
        assert!((e - 2.0).abs() < 1e-4);
    }
}