Skip to main content

secretsniff_core/
lib.rs

1//! Pure-Rust core for `secretsniff`. Source-code secret scanner.
2//!
3//! Two layers:
4//!
5//! - **Regex detectors** for known formats (AWS keys, GitHub PATs, etc.).
6//!   These are fast, low-false-positive when patterns match exactly.
7//! - **High-entropy filter** that flags any base64/hex-ish substring of
8//!   length ≥ `min_entropy_length` whose Shannon entropy meets a
9//!   threshold. Catches one-off secrets that don't fit a known format.
10
11#![deny(unsafe_code)]
12#![warn(missing_docs)]
13#![warn(rust_2018_idioms)]
14
15use rayon::prelude::*;
16use regex::Regex;
17use serde::{Deserialize, Serialize};
18use thiserror::Error;
19
/// Crate-wide result alias: a [`std::result::Result`] whose error type is
/// fixed to [`ScannerError`].
pub type Result<T> = std::result::Result<T, ScannerError>;
22
/// All errors surfaced by `secretsniff-core`.
#[derive(Error, Debug)]
pub enum ScannerError {
    /// A regex failed to compile. The `#[from]` attribute lets `?` convert a
    /// [`regex::Error`] into this variant automatically.
    #[error("regex error: {0}")]
    Regex(#[from] regex::Error),
    /// Caller supplied an invalid configuration. The payload is a
    /// human-readable description of the rejected setting.
    #[error("invalid config: {0}")]
    InvalidConfig(String),
}
33
/// Scanner configuration.
///
/// Only affects the high-entropy fallback rule; the built-in format
/// detectors are always active. See [`Default`] for the stock values.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ScannerConfig {
    /// Shannon-entropy threshold (bits per char) for the high-entropy
    /// fallback rule. 4.5 is a reasonable starting point.
    /// `Scanner::with_config` rejects values outside `[0, 8]`.
    pub min_entropy: f32,
    /// Minimum length (in characters) for the high-entropy rule. This value
    /// is interpolated into the candidate regex as its lower repetition bound.
    pub min_entropy_length: usize,
    /// If false, skip the high-entropy rule entirely.
    pub include_high_entropy: bool,
}
45
46impl Default for ScannerConfig {
47    fn default() -> Self {
48        Self {
49            min_entropy: 4.5,
50            min_entropy_length: 32,
51            include_high_entropy: true,
52        }
53    }
54}
55
/// One finding.
///
/// All offsets are byte offsets into the scanned source string, so
/// `&source[start..end]` equals `matched`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Finding {
    /// Detector name (e.g. `AWS_ACCESS_KEY`), or `HIGH_ENTROPY` for the
    /// entropy fallback rule.
    pub kind: String,
    /// 1-indexed line number.
    pub line: usize,
    /// 1-indexed byte offset within the line.
    pub column: usize,
    /// Byte offset of the match start in the source.
    pub start: usize,
    /// Byte offset (exclusive) of the match end.
    pub end: usize,
    /// The matched substring (the whole regex match).
    pub matched: String,
    /// Shannon entropy in bits/char of the matched string. Computed over the
    /// string's bytes, which coincides with chars for ASCII matches.
    pub entropy: f32,
}
74
/// Compiled scanner.
///
/// Holds every regex pre-compiled so that repeated scans pay no
/// compilation cost.
pub struct Scanner {
    /// Configuration the scanner was built with.
    cfg: ScannerConfig,
    /// Built-in detectors as `(kind, compiled pattern)` pairs, in `RULES` order.
    rules: Vec<(&'static str, Regex)>,
    /// Candidate matcher for the high-entropy fallback rule.
    high_entropy_re: Regex,
}
81
// Built-in detectors as `(kind, regex pattern)` pairs. `kind` becomes
// `Finding::kind`; the patterns are compiled once in `Scanner::with_config`.
const RULES: &[(&str, &str)] = &[
    // AWS access key id: `AKIA` followed by 16 upper-case alphanumerics.
    ("AWS_ACCESS_KEY", r"\bAKIA[0-9A-Z]{16}\b"),
    // GitHub token formats: ghp_ (PAT), gho_ (OAuth), ghu_ (user-to-server),
    // ghr_ (refresh), ghs_ (server-to-server).
    ("GITHUB_TOKEN", r"\bgh[pours]_[A-Za-z0-9]{36,}\b"),
    // Slack tokens: `xox` + one of b/a/p/r/s, a dash, then 10+ characters.
    ("SLACK_TOKEN", r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b"),
    // Stripe keys: secret/publishable/restricted, live or test mode.
    (
        "STRIPE_KEY",
        r"\b(?:sk|pk|rk)_(?:live|test)_[A-Za-z0-9]{20,}\b",
    ),
    // JWT (three url-safe-base64 segments). The first two must start with
    // `eyJ`, i.e. the base64 encoding of `{"`.
    (
        "JWT",
        r"\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b",
    ),
    // PEM-style markers. Only the header line is matched, not the key body.
    ("RSA_PRIVATE_KEY", r"-----BEGIN RSA PRIVATE KEY-----"),
    ("SSH_PRIVATE_KEY", r"-----BEGIN OPENSSH PRIVATE KEY-----"),
    // Generic api_key = "..." assignments. Captures the value via a group.
    // Case-insensitive; the opening and closing quote are not required to
    // be the same character.
    (
        "GENERIC_API_KEY",
        r#"(?i)\bapi[_-]?key\s*[=:]\s*['"]([A-Za-z0-9_\-=]{16,})['"]"#,
    ),
    // OpenAI API keys: classic (`sk-` + 48 chars) and project-scoped
    // (`sk-proj-...`). Both surface high-entropy bodies.
    // NOTE(review): because `-` is in the body character class, this pattern
    // also matches `sk-ant-...` keys, so such a key is reported under both
    // OPENAI_KEY and ANTHROPIC_KEY.
    ("OPENAI_KEY", r"\bsk-(?:proj-)?[A-Za-z0-9_-]{20,}\b"),
    // Anthropic API keys.
    (
        "ANTHROPIC_KEY",
        r"\bsk-ant-(?:api03-|sid01-)?[A-Za-z0-9_-]{20,}\b",
    ),
    // Twilio: `SK` followed by exactly 32 hex characters.
    // NOTE(review): this looks like the shape of a Twilio API key SID rather
    // than an auth token, despite the rule name; confirm against Twilio docs.
    ("TWILIO_AUTH_TOKEN", r"\bSK[a-fA-F0-9]{32}\b"),
    // SendGrid API key (always 69 chars, prefixed `SG.`).
    (
        "SENDGRID_KEY",
        r"\bSG\.[A-Za-z0-9_-]{22}\.[A-Za-z0-9_-]{43}\b",
    ),
];
123
124impl Scanner {
125    /// Build a scanner with the default config (all rules enabled).
126    pub fn new() -> Self {
127        Self::with_config(ScannerConfig::default()).expect("default config compiles")
128    }
129
130    /// Build a scanner with a custom config.
131    pub fn with_config(cfg: ScannerConfig) -> Result<Self> {
132        if cfg.min_entropy < 0.0 || cfg.min_entropy > 8.0 {
133            return Err(ScannerError::InvalidConfig(format!(
134                "min_entropy out of range [0, 8]: {}",
135                cfg.min_entropy
136            )));
137        }
138        let rules: Vec<(&'static str, Regex)> = RULES
139            .iter()
140            .map(|(k, p)| Regex::new(p).map(|r| (*k, r)))
141            .collect::<std::result::Result<_, _>>()?;
142        // High-entropy candidate match: substrings of length >= min that look
143        // like base64/hex/url-safe-base64.
144        let pat = format!(r"[A-Za-z0-9+/=_\-]{{{},}}", cfg.min_entropy_length);
145        let high_entropy_re = Regex::new(&pat)?;
146        Ok(Self {
147            cfg,
148            rules,
149            high_entropy_re,
150        })
151    }
152
153    /// Scan `source`, returning findings in source order.
154    pub fn scan(&self, source: &str) -> Vec<Finding> {
155        let mut findings: Vec<Finding> = Vec::new();
156        let mut covered: Vec<(usize, usize)> = Vec::new();
157
158        // Built-in rules.
159        for (kind, regex) in &self.rules {
160            for m in regex.find_iter(source) {
161                let entropy = shannon_entropy(m.as_str());
162                let (line, column) = line_col(source, m.start());
163                findings.push(Finding {
164                    kind: (*kind).to_string(),
165                    line,
166                    column,
167                    start: m.start(),
168                    end: m.end(),
169                    matched: m.as_str().to_string(),
170                    entropy,
171                });
172                covered.push((m.start(), m.end()));
173            }
174        }
175
176        // High-entropy fallback. Skip ranges already covered.
177        if self.cfg.include_high_entropy {
178            for m in self.high_entropy_re.find_iter(source) {
179                if overlaps(&covered, m.start(), m.end()) {
180                    continue;
181                }
182                let entropy = shannon_entropy(m.as_str());
183                if entropy < self.cfg.min_entropy {
184                    continue;
185                }
186                let (line, column) = line_col(source, m.start());
187                findings.push(Finding {
188                    kind: "HIGH_ENTROPY".to_string(),
189                    line,
190                    column,
191                    start: m.start(),
192                    end: m.end(),
193                    matched: m.as_str().to_string(),
194                    entropy,
195                });
196            }
197        }
198
199        findings.sort_by_key(|f| f.start);
200        findings
201    }
202
203    /// Bulk scan. With `parallel = true`, distributes across rayon's pool.
204    pub fn scan_many(&self, sources: &[&str], parallel: bool) -> Vec<Vec<Finding>> {
205        if parallel {
206            sources.par_iter().map(|s| self.scan(s)).collect()
207        } else {
208            sources.iter().map(|s| self.scan(s)).collect()
209        }
210    }
211}
212
213impl Default for Scanner {
214    fn default() -> Self {
215        Self::new()
216    }
217}
218
/// Shannon entropy of `s`, in bits per byte.
///
/// Builds a histogram of byte values and sums `-p * log2(p)` over the byte
/// values that occur. Returns `0.0` for the empty string.
fn shannon_entropy(s: &str) -> f32 {
    let bytes = s.as_bytes();
    if bytes.is_empty() {
        return 0.0;
    }

    // Occurrence count for each of the 256 possible byte values.
    let mut histogram = [0u32; 256];
    bytes
        .iter()
        .for_each(|&byte| histogram[usize::from(byte)] += 1);
    let total = histogram.iter().sum::<u32>() as f32;

    // Accumulate in ascending byte-value order.
    histogram
        .iter()
        .filter(|&&count| count != 0)
        .fold(0.0_f32, |acc, &count| {
            let p = count as f32 / total;
            acc - p * p.log2()
        })
}
240
/// Translate a byte offset into a 1-indexed `(line, column)` pair.
///
/// Only `\n` counts as a line break. The column is the 1-indexed byte
/// distance from the start of the line containing `byte_offset`.
fn line_col(source: &str, byte_offset: usize) -> (usize, usize) {
    let bytes = source.as_bytes();
    // Everything strictly before the offset; an offset past the end simply
    // considers the whole input.
    let prefix = &bytes[..byte_offset.min(bytes.len())];

    let newlines_before = prefix.iter().filter(|&&b| b == b'\n').count();
    // The current line starts just after the last newline in the prefix, or
    // at 0 when there is none.
    let line_start = prefix
        .iter()
        .rposition(|&b| b == b'\n')
        .map_or(0, |idx| idx + 1);

    (newlines_before + 1, byte_offset - line_start + 1)
}
252
/// Report whether the half-open range `start..end` intersects any of the
/// half-open `(start, end)` pairs in `ranges`.
fn overlaps(ranges: &[(usize, usize)], start: usize, end: usize) -> bool {
    for &(lo, hi) in ranges {
        // Two half-open ranges are disjoint exactly when one ends at or
        // before the point where the other begins.
        let disjoint = start >= hi || end <= lo;
        if !disjoint {
            return true;
        }
    }
    false
}
256
#[cfg(test)]
mod tests {
    //! Unit tests: one positive case per built-in detector, the high-entropy
    //! fallback, position reporting, config validation, and the entropy helper.

    use super::*;

    #[test]
    fn aws_key_detected() {
        let s = Scanner::new();
        let r = s.scan("aws = AKIAIOSFODNN7EXAMPLE\n");
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].kind, "AWS_ACCESS_KEY");
        assert_eq!(r[0].line, 1);
    }

    #[test]
    fn github_token_detected() {
        let s = Scanner::new();
        let token = "ghp_abcdefghijklmnopqrstuvwxyz0123456789";
        let r = s.scan(&format!("token = {token}\n"));
        assert!(r.iter().any(|f| f.kind == "GITHUB_TOKEN"));
    }

    #[test]
    fn slack_token_detected() {
        let s = Scanner::new();
        let r = s.scan("slack = xoxb-1234567890-abcdef\n");
        assert!(r.iter().any(|f| f.kind == "SLACK_TOKEN"));
    }

    #[test]
    fn stripe_key_detected() {
        let s = Scanner::new();
        let r = s.scan("STRIPE = sk_live_abcdefghij1234567890\n");
        assert!(r.iter().any(|f| f.kind == "STRIPE_KEY"));
    }

    #[test]
    fn jwt_detected() {
        let s = Scanner::new();
        let jwt = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1MSJ9.signature_part_long_enough";
        let r = s.scan(&format!("auth = '{jwt}'"));
        assert!(r.iter().any(|f| f.kind == "JWT"));
    }

    #[test]
    fn openai_key_classic_detected() {
        let s = Scanner::new();
        let key = "sk-abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKL";
        let r = s.scan(&format!("OPENAI = {key}\n"));
        assert!(r.iter().any(|f| f.kind == "OPENAI_KEY"));
    }

    #[test]
    fn openai_key_project_scoped_detected() {
        let s = Scanner::new();
        let key = "sk-proj-abcdefghij_KLMNOPQRSTU-vwxyz0123456789";
        let r = s.scan(&format!("OPENAI = {key}\n"));
        assert!(r.iter().any(|f| f.kind == "OPENAI_KEY"));
    }

    #[test]
    fn anthropic_key_detected() {
        let s = Scanner::new();
        let key = "sk-ant-api03-abcdefghijklmnopqrstuvwxyz0123456789ABCD";
        let r = s.scan(&format!("ANTHROPIC = {key}\n"));
        assert!(r.iter().any(|f| f.kind == "ANTHROPIC_KEY"));
    }

    #[test]
    fn twilio_auth_token_detected() {
        let s = Scanner::new();
        // The TWILIO_AUTH_TOKEN rule matches `SK` followed by 32 hex chars.
        let key = "SK0123456789abcdef0123456789abcdef";
        let r = s.scan(&format!("TWILIO = {key}\n"));
        assert!(r.iter().any(|f| f.kind == "TWILIO_AUTH_TOKEN"));
    }

    #[test]
    fn sendgrid_key_detected() {
        let s = Scanner::new();
        // SendGrid keys are exactly SG.<22 chars>.<43 chars>.
        let body22 = "abcdefghijklmnopqrstuv";
        let sig43 = "abcdefghijklmnopqrstuvwxyz0123456789ABCDEFG";
        let key = format!("SG.{body22}.{sig43}");
        let r = s.scan(&format!("SG = {key}\n"));
        assert!(r.iter().any(|f| f.kind == "SENDGRID_KEY"));
    }

    #[test]
    fn rsa_marker_detected() {
        let s = Scanner::new();
        let r = s.scan("-----BEGIN RSA PRIVATE KEY-----\nMII...\n");
        assert!(r.iter().any(|f| f.kind == "RSA_PRIVATE_KEY"));
    }

    #[test]
    fn ssh_marker_detected() {
        let s = Scanner::new();
        let r = s.scan("-----BEGIN OPENSSH PRIVATE KEY-----\nb3Bl...\n");
        assert!(r.iter().any(|f| f.kind == "SSH_PRIVATE_KEY"));
    }

    #[test]
    fn generic_api_key_assignment_detected() {
        let s = Scanner::new();
        let r = s.scan(r#"api_key = "abcdefghijklmnopqrst""#);
        assert!(r.iter().any(|f| f.kind == "GENERIC_API_KEY"));
    }

    #[test]
    fn high_entropy_detected() {
        let s = Scanner::new();
        // 32 chars of varied base64-ish content; entropy is high.
        let blob = "K3s9Q2pXq9ZTm4Lp2Vw7Yc1RnFb5Xh6N";
        let r = s.scan(&format!("token = '{blob}'"));
        assert!(r.iter().any(|f| f.kind == "HIGH_ENTROPY"));
    }

    #[test]
    fn low_entropy_skipped() {
        let s = Scanner::new();
        // 32 chars of a low-entropy string ('aaaa...').
        let blob = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
        let r = s.scan(&format!("v = '{blob}'"));
        assert!(!r.iter().any(|f| f.kind == "HIGH_ENTROPY"));
    }

    #[test]
    fn no_finding_on_clean_source() {
        let s = Scanner::new();
        let r = s.scan("def add(a: int, b: int) -> int:\n    return a + b\n");
        assert!(r.is_empty());
    }

    #[test]
    fn line_column_correct_on_multiline() {
        let s = Scanner::new();
        let src = "line1\nline2 AKIAIOSFODNN7EXAMPLE\nline3\n";
        let r = s.scan(src);
        assert_eq!(r.len(), 1);
        assert_eq!(r[0].line, 2);
        // "line2 " is 6 bytes; AKIA starts at column 7.
        assert_eq!(r[0].column, 7);
    }

    #[test]
    fn findings_sorted_by_position() {
        let s = Scanner::new();
        let src = "ghp_abcdefghijklmnopqrstuvwxyz0123456789 then AKIAIOSFODNN7EXAMPLE";
        let r = s.scan(src);
        assert!(r.len() >= 2);
        for w in r.windows(2) {
            assert!(w[0].start <= w[1].start);
        }
    }

    #[test]
    fn high_entropy_does_not_double_up_on_known_pattern() {
        let s = Scanner::new();
        // ghp_ token would also score as high-entropy. We should only get
        // GITHUB_TOKEN, not also HIGH_ENTROPY for the same span.
        let token = "ghp_abcdefghijklmnopqrstuvwxyz0123456789";
        let src = format!("t = '{token}'");
        let r = s.scan(&src);
        let kinds: Vec<&str> = r.iter().map(|f| f.kind.as_str()).collect();
        assert!(kinds.contains(&"GITHUB_TOKEN"));
        // HIGH_ENTROPY may still appear if it matches a different range, but
        // it must be fully disjoint from the GITHUB_TOKEN span. Partial
        // overlaps count as failures too, not only full containment.
        let token_start = src_pos(&src, token);
        let token_end = token_start + token.len();
        for f in r.iter().filter(|f| f.kind == "HIGH_ENTROPY") {
            assert!(
                f.end <= token_start || f.start >= token_end,
                "HIGH_ENTROPY overlaps GITHUB_TOKEN at {}..{}",
                f.start,
                f.end
            );
        }
    }

    /// Byte offset of the first occurrence of `needle` in `haystack`.
    /// Panics if `needle` is absent.
    fn src_pos(haystack: &str, needle: &str) -> usize {
        haystack.find(needle).unwrap()
    }

    #[test]
    fn invalid_entropy_threshold_rejected() {
        let cfg = ScannerConfig {
            min_entropy: 100.0,
            ..Default::default()
        };
        assert!(Scanner::with_config(cfg).is_err());
    }

    #[test]
    fn high_entropy_can_be_disabled() {
        let cfg = ScannerConfig {
            include_high_entropy: false,
            ..Default::default()
        };
        let s = Scanner::with_config(cfg).unwrap();
        let blob = "K3s9Q2pXq9ZTm4Lp2Vw7Yc1RnFb5Xh6N";
        let r = s.scan(&format!("token = '{blob}'"));
        assert!(!r.iter().any(|f| f.kind == "HIGH_ENTROPY"));
    }

    #[test]
    fn scan_many_serial_and_parallel_match() {
        let s = Scanner::new();
        let sources: Vec<&str> = vec!["aws = AKIAIOSFODNN7EXAMPLE", "no secret here"];
        let a = s.scan_many(&sources, false);
        let b = s.scan_many(&sources, true);
        assert_eq!(a, b);
        assert_eq!(a[0].len(), 1);
        assert_eq!(a[1].len(), 0);
    }

    #[test]
    fn shannon_entropy_zero_for_constant_string() {
        assert_eq!(shannon_entropy("aaaa"), 0.0);
    }

    #[test]
    fn shannon_entropy_max_for_equal_distribution() {
        // 4 distinct chars equally distributed -> 2 bits/char.
        let e = shannon_entropy("abcdabcd");
        assert!((e - 2.0).abs() < 1e-4);
    }
}
484}