#![deny(unsafe_code)]
#![warn(missing_docs)]
#![warn(rust_2018_idioms)]
use rayon::prelude::*;
use regex::Regex;
use serde::{Deserialize, Serialize};
use thiserror::Error;
/// Result alias used by this module; the error type is always [`ScannerError`].
pub type Result<T> = std::result::Result<T, ScannerError>;
/// Errors returned by [`Scanner::with_config`].
#[derive(Error, Debug)]
pub enum ScannerError {
/// A detection pattern failed to compile; wraps the underlying `regex` error.
#[error("regex error: {0}")]
Regex(#[from] regex::Error),
/// The supplied [`ScannerConfig`] was rejected; the string explains why.
#[error("invalid config: {0}")]
InvalidConfig(String),
}
/// Settings that control the generic high-entropy detection pass of a [`Scanner`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ScannerConfig {
/// Minimum Shannon entropy, in bits per byte, a candidate must reach to be
/// reported as `HIGH_ENTROPY`. [`Scanner::with_config`] rejects values
/// below 0 or above 8.
pub min_entropy: f32,
/// Minimum number of consecutive characters from the candidate alphabet
/// (`A-Z a-z 0-9 + / = _ -`) that forms a high-entropy candidate.
pub min_entropy_length: usize,
/// Whether the high-entropy pass runs at all; the named rules always run.
pub include_high_entropy: bool,
}
impl Default for ScannerConfig {
fn default() -> Self {
Self {
min_entropy: 4.5,
min_entropy_length: 32,
include_high_entropy: true,
}
}
}
/// One suspected secret located by [`Scanner::scan`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Finding {
/// Name of the rule that matched (for example `AWS_ACCESS_KEY`), or
/// `HIGH_ENTROPY` for a hit from the generic entropy pass.
pub kind: String,
/// 1-based line number of the first byte of the match.
pub line: usize,
/// 1-based column of the first byte of the match, counted in bytes from
/// the start of its line.
pub column: usize,
/// Byte offset in the scanned source where the match begins.
pub start: usize,
/// Byte offset in the scanned source just past the end of the match.
pub end: usize,
/// The exact text that matched.
pub matched: String,
/// Shannon entropy of `matched`, in bits per byte.
pub entropy: f32,
}
/// Secret scanner holding its configuration and pre-compiled patterns.
///
/// Build one with [`Scanner::new`] or [`Scanner::with_config`], then call
/// [`Scanner::scan`] or [`Scanner::scan_many`].
pub struct Scanner {
// Configuration the scanner was built with; read on every scan.
cfg: ScannerConfig,
// Compiled form of `RULES`, in the same order: (rule name, pattern).
rules: Vec<(&'static str, Regex)>,
// Matches runs of candidate characters at least `cfg.min_entropy_length` long.
high_entropy_re: Regex,
}
// Built-in detection rules as (rule name, regex source) pairs. The rule name
// becomes `Finding::kind`; rules are compiled and applied in this order.
const RULES: &[(&str, &str)] = &[
// `AKIA` followed by exactly 16 upper-case letters or digits.
("AWS_ACCESS_KEY", r"\bAKIA[0-9A-Z]{16}\b"),
// `ghp_`, `gho_`, `ghu_`, `ghr_` or `ghs_` followed by 36+ alphanumerics.
("GITHUB_TOKEN", r"\bgh[pours]_[A-Za-z0-9]{36,}\b"),
// `xoxb-`, `xoxa-`, `xoxp-`, `xoxr-` or `xoxs-` followed by 10+ alphanumerics or dashes.
("SLACK_TOKEN", r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b"),
// `sk_`, `pk_` or `rk_`, then `live_` or `test_`, then 20+ alphanumerics.
(
"STRIPE_KEY",
r"\b(?:sk|pk|rk)_(?:live|test)_[A-Za-z0-9]{20,}\b",
),
// Three dot-separated base64url segments; the first two start with `eyJ`.
(
"JWT",
r"\beyJ[A-Za-z0-9_-]+\.eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b",
),
// PEM header lines only; the key body itself is not matched.
("RSA_PRIVATE_KEY", r"-----BEGIN RSA PRIVATE KEY-----"),
("SSH_PRIVATE_KEY", r"-----BEGIN OPENSSH PRIVATE KEY-----"),
// Case-insensitive `api_key` / `api-key` / `apikey` assigned (`=` or `:`)
// a quoted value of 16+ characters.
(
"GENERIC_API_KEY",
r#"(?i)\bapi[_-]?key\s*[=:]\s*['"]([A-Za-z0-9_\-=]{16,})['"]"#,
),
];
impl Scanner {
pub fn new() -> Self {
Self::with_config(ScannerConfig::default()).expect("default config compiles")
}
pub fn with_config(cfg: ScannerConfig) -> Result<Self> {
if cfg.min_entropy < 0.0 || cfg.min_entropy > 8.0 {
return Err(ScannerError::InvalidConfig(format!(
"min_entropy out of range [0, 8]: {}",
cfg.min_entropy
)));
}
let rules: Vec<(&'static str, Regex)> = RULES
.iter()
.map(|(k, p)| Regex::new(p).map(|r| (*k, r)))
.collect::<std::result::Result<_, _>>()?;
let pat = format!(r"[A-Za-z0-9+/=_\-]{{{},}}", cfg.min_entropy_length);
let high_entropy_re = Regex::new(&pat)?;
Ok(Self {
cfg,
rules,
high_entropy_re,
})
}
pub fn scan(&self, source: &str) -> Vec<Finding> {
let mut findings: Vec<Finding> = Vec::new();
let mut covered: Vec<(usize, usize)> = Vec::new();
for (kind, regex) in &self.rules {
for m in regex.find_iter(source) {
let entropy = shannon_entropy(m.as_str());
let (line, column) = line_col(source, m.start());
findings.push(Finding {
kind: (*kind).to_string(),
line,
column,
start: m.start(),
end: m.end(),
matched: m.as_str().to_string(),
entropy,
});
covered.push((m.start(), m.end()));
}
}
if self.cfg.include_high_entropy {
for m in self.high_entropy_re.find_iter(source) {
if overlaps(&covered, m.start(), m.end()) {
continue;
}
let entropy = shannon_entropy(m.as_str());
if entropy < self.cfg.min_entropy {
continue;
}
let (line, column) = line_col(source, m.start());
findings.push(Finding {
kind: "HIGH_ENTROPY".to_string(),
line,
column,
start: m.start(),
end: m.end(),
matched: m.as_str().to_string(),
entropy,
});
}
}
findings.sort_by_key(|f| f.start);
findings
}
pub fn scan_many(&self, sources: &[&str], parallel: bool) -> Vec<Vec<Finding>> {
if parallel {
sources.par_iter().map(|s| self.scan(s)).collect()
} else {
sources.iter().map(|s| self.scan(s)).collect()
}
}
}
impl Default for Scanner {
fn default() -> Self {
Self::new()
}
}
/// Computes the Shannon entropy of `s` in bits per byte.
///
/// The distribution is taken over the UTF-8 bytes of `s` (not its `char`s),
/// so the result lies in `[0, 8]`. An empty string yields `0.0`.
fn shannon_entropy(s: &str) -> f32 {
    let bytes = s.as_bytes();
    if bytes.is_empty() {
        return 0.0;
    }

    // `usize` counters cannot overflow: no slice holds more than `usize::MAX`
    // bytes. The previous `u32` counters wrapped on inputs of 4 GiB or more.
    let mut counts = [0usize; 256];
    for &b in bytes {
        counts[usize::from(b)] += 1;
    }

    let total = bytes.len() as f32;
    // Accumulate in byte-value order so results stay bit-for-bit stable.
    counts
        .iter()
        .filter(|&&c| c != 0)
        .fold(0.0_f32, |acc, &c| {
            let p = c as f32 / total;
            acc - p * p.log2()
        })
}
/// Converts a byte offset within `source` into a 1-based `(line, column)` pair.
///
/// Lines are separated by `\n`. The column is measured in bytes from the
/// start of the line, so it equals the character column only for ASCII text.
fn line_col(source: &str, byte_offset: usize) -> (usize, usize) {
    let all = source.as_bytes();
    // Only bytes strictly before the offset influence the position.
    let before = &all[..byte_offset.min(all.len())];

    let line = 1 + before.iter().filter(|&&b| b == b'\n').count();
    // Offset of the first byte after the most recent newline (0 if none).
    let line_start = before
        .iter()
        .rposition(|&b| b == b'\n')
        .map_or(0, |idx| idx + 1);

    (line, byte_offset - line_start + 1)
}
/// Reports whether the half-open span `start..end` intersects any of the
/// half-open `(start, end)` spans in `ranges`.
fn overlaps(ranges: &[(usize, usize)], start: usize, end: usize) -> bool {
    for &(lo, hi) in ranges {
        // Two spans are disjoint when one ends at or before the other begins.
        let disjoint = start >= hi || end <= lo;
        if !disjoint {
            return true;
        }
    }
    false
}
// Unit tests: one per built-in rule, then the high-entropy pass, position
// reporting, configuration handling, batch scanning and the entropy helper.
#[cfg(test)]
mod tests {
use super::*;
// An AWS access key id on the first line yields exactly one finding.
#[test]
fn aws_key_detected() {
let s = Scanner::new();
let r = s.scan("aws = AKIAIOSFODNN7EXAMPLE\n");
assert_eq!(r.len(), 1);
assert_eq!(r[0].kind, "AWS_ACCESS_KEY");
assert_eq!(r[0].line, 1);
}
// A `ghp_` token with 36 trailing alphanumerics is reported.
#[test]
fn github_token_detected() {
let s = Scanner::new();
let token = "ghp_abcdefghijklmnopqrstuvwxyz0123456789";
let r = s.scan(&format!("token = {token}\n"));
assert!(r.iter().any(|f| f.kind == "GITHUB_TOKEN"));
}
// An `xoxb-` token is reported.
#[test]
fn slack_token_detected() {
let s = Scanner::new();
let r = s.scan("slack = xoxb-1234567890-abcdef\n");
assert!(r.iter().any(|f| f.kind == "SLACK_TOKEN"));
}
// An `sk_live_` key with 20 trailing alphanumerics is reported.
#[test]
fn stripe_key_detected() {
let s = Scanner::new();
let r = s.scan("STRIPE = sk_live_abcdefghij1234567890\n");
assert!(r.iter().any(|f| f.kind == "STRIPE_KEY"));
}
// A three-segment token whose first two segments start with `eyJ` is reported.
#[test]
fn jwt_detected() {
let s = Scanner::new();
let jwt = "eyJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1MSJ9.signature_part_long_enough";
let r = s.scan(&format!("auth = '{jwt}'"));
assert!(r.iter().any(|f| f.kind == "JWT"));
}
// The RSA PEM header line alone is enough to trigger a finding.
#[test]
fn rsa_marker_detected() {
let s = Scanner::new();
let r = s.scan("-----BEGIN RSA PRIVATE KEY-----\nMII...\n");
assert!(r.iter().any(|f| f.kind == "RSA_PRIVATE_KEY"));
}
// The OpenSSH PEM header line alone is enough to trigger a finding.
#[test]
fn ssh_marker_detected() {
let s = Scanner::new();
let r = s.scan("-----BEGIN OPENSSH PRIVATE KEY-----\nb3Bl...\n");
assert!(r.iter().any(|f| f.kind == "SSH_PRIVATE_KEY"));
}
// A quoted value of 16+ characters assigned to `api_key` is reported.
#[test]
fn generic_api_key_assignment_detected() {
let s = Scanner::new();
let r = s.scan(r#"api_key = "abcdefghijklmnopqrst""#);
assert!(r.iter().any(|f| f.kind == "GENERIC_API_KEY"));
}
// A varied 32-character blob that matches no named rule is reported by the
// entropy pass under the default configuration.
#[test]
fn high_entropy_detected() {
let s = Scanner::new();
let blob = "K3s9Q2pXq9ZTm4Lp2Vw7Yc1RnFb5Xh6N";
let r = s.scan(&format!("token = '{blob}'"));
assert!(r.iter().any(|f| f.kind == "HIGH_ENTROPY"));
}
// A 32-character run of one repeated letter has zero entropy and is skipped.
#[test]
fn low_entropy_skipped() {
let s = Scanner::new();
let blob = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
let r = s.scan(&format!("v = '{blob}'"));
assert!(!r.iter().any(|f| f.kind == "HIGH_ENTROPY"));
}
// Ordinary source text without secrets produces no findings.
#[test]
fn no_finding_on_clean_source() {
let s = Scanner::new();
let r = s.scan("def add(a: int, b: int) -> int:\n return a + b\n");
assert!(r.is_empty());
}
// The key follows "line2 " on the second line, so it sits at line 2,
// column 7 (both 1-based).
#[test]
fn line_column_correct_on_multiline() {
let s = Scanner::new();
let src = "line1\nline2 AKIAIOSFODNN7EXAMPLE\nline3\n";
let r = s.scan(src);
assert_eq!(r.len(), 1);
assert_eq!(r[0].line, 2);
assert_eq!(r[0].column, 7);
}
// Findings from different rules come back ordered by start offset.
#[test]
fn findings_sorted_by_position() {
let s = Scanner::new();
let src = "ghp_abcdefghijklmnopqrstuvwxyz0123456789 then AKIAIOSFODNN7EXAMPLE";
let r = s.scan(src);
assert!(r.len() >= 2);
for w in r.windows(2) {
assert!(w[0].start <= w[1].start);
}
}
// Text already reported by a named rule must not also appear as a
// HIGH_ENTROPY finding. The assertion checks that no HIGH_ENTROPY finding
// lies entirely inside the token's byte range.
#[test]
fn high_entropy_does_not_double_up_on_known_pattern() {
let s = Scanner::new();
let token = "ghp_abcdefghijklmnopqrstuvwxyz0123456789";
let r = s.scan(&format!("t = '{token}'"));
let kinds: Vec<&str> = r.iter().map(|f| f.kind.as_str()).collect();
assert!(kinds.contains(&"GITHUB_TOKEN"));
for f in &r {
if f.kind == "HIGH_ENTROPY" {
let token_start = src_pos(&format!("t = '{token}'"), token);
let token_end = token_start + token.len();
assert!(
!(f.start >= token_start && f.end <= token_end),
"HIGH_ENTROPY overlaps GITHUB_TOKEN at {}..{}",
f.start,
f.end
);
}
}
}
// Test helper: byte offset of the first occurrence of `needle` in
// `haystack`; panics when `needle` is absent.
fn src_pos(haystack: &str, needle: &str) -> usize {
haystack.find(needle).unwrap()
}
// An entropy threshold above 8 is rejected at construction time.
#[test]
fn invalid_entropy_threshold_rejected() {
let cfg = ScannerConfig {
min_entropy: 100.0,
..Default::default()
};
assert!(Scanner::with_config(cfg).is_err());
}
// With `include_high_entropy` off, the same blob as in
// `high_entropy_detected` is not reported.
#[test]
fn high_entropy_can_be_disabled() {
let cfg = ScannerConfig {
include_high_entropy: false,
..Default::default()
};
let s = Scanner::with_config(cfg).unwrap();
let blob = "K3s9Q2pXq9ZTm4Lp2Vw7Yc1RnFb5Xh6N";
let r = s.scan(&format!("token = '{blob}'"));
assert!(!r.iter().any(|f| f.kind == "HIGH_ENTROPY"));
}
// Serial and parallel batch scans return identical results in input order.
#[test]
fn scan_many_serial_and_parallel_match() {
let s = Scanner::new();
let sources: Vec<&str> = vec!["aws = AKIAIOSFODNN7EXAMPLE", "no secret here"];
let a = s.scan_many(&sources, false);
let b = s.scan_many(&sources, true);
assert_eq!(a, b);
assert_eq!(a[0].len(), 1);
assert_eq!(a[1].len(), 0);
}
// A string made of a single repeated byte carries no information.
#[test]
fn shannon_entropy_zero_for_constant_string() {
assert_eq!(shannon_entropy("aaaa"), 0.0);
}
// Four equally frequent symbols give log2(4) = 2 bits per byte.
#[test]
fn shannon_entropy_max_for_equal_distribution() {
let e = shannon_entropy("abcdabcd");
assert!((e - 2.0).abs() < 1e-4);
}
}