mod signatures;
use aho_corasick::AhoCorasick;
use rayon::prelude::*;
use regex::bytes::{Regex, RegexBuilder};
use std::fs;
use std::io::{self, BufRead, Write};
use std::path::Path;
use thiserror::Error;
pub use signatures::{
default_signatures, trufflehog_detector_signatures, trufflehog_generated_detector_count,
trufflehog_generated_signature_count, trufflehog_source_commit,
};
/// Errors produced while loading signature definitions or building a [`Scrubber`].
#[derive(Debug, Error)]
pub enum ScrubError {
/// A signature pattern could not be compiled as a regex.
///
/// `name` is the signature's name and `reason` is the regex compiler's
/// error rendered as text.
#[error("invalid signature regex '{name}': {reason}")]
InvalidRegex { name: String, reason: String },
/// An I/O error occurred while reading a signature file; `#[from]` lets
/// `?` convert a `std::io::Error` into this variant.
#[error("failed to read signature file: {0}")]
SignatureIo(#[from] std::io::Error),
}
/// A named detection pattern from which a [`Scrubber`] is built.
#[derive(Debug, Clone)]
pub struct SignatureSpec {
/// Label identifying the signature; it is reported in
/// [`ScrubError::InvalidRegex`] when the pattern fails to compile.
pub name: String,
/// Pattern text. It is matched as an exact byte string when it contains no
/// regex metacharacters and is compiled as a regex otherwise.
pub pattern: String,
}
/// Overwrites every byte range that matches a configured signature with a mask byte.
#[derive(Debug)]
pub struct Scrubber {
/// Multi-pattern matcher for the plain-literal signatures; `None` when no
/// signature is a plain literal.
literal_matcher: Option<AhoCorasick>,
/// Compiled regexes for the signatures that contain regex metacharacters.
regexes: Vec<Regex>,
/// Byte written over every matched position.
mask_byte: u8,
}
impl Scrubber {
pub fn new() -> Result<Self, ScrubError> {
Self::build(default_signatures(), b'*')
}
pub fn with_signatures(specs: Vec<SignatureSpec>, mask_byte: u8) -> Result<Self, ScrubError> {
Self::build(specs, mask_byte)
}
fn build(specs: Vec<SignatureSpec>, mask_byte: u8) -> Result<Self, ScrubError> {
let mut literal_patterns: Vec<Vec<u8>> = Vec::new();
let mut regexes = Vec::new();
for spec in specs {
if is_plain_literal(&spec.pattern) {
literal_patterns.push(spec.pattern.into_bytes());
} else {
match RegexBuilder::new(&spec.pattern).unicode(false).build() {
Ok(regex) => regexes.push(regex),
Err(_) if spec.name.starts_with("trufflehog_") => {
continue;
}
Err(e) => {
return Err(ScrubError::InvalidRegex {
name: spec.name.clone(),
reason: e.to_string(),
});
}
}
}
}
let literal_matcher = if literal_patterns.is_empty() {
None
} else {
Some(AhoCorasick::new(literal_patterns).expect("valid automaton"))
};
Ok(Self {
literal_matcher,
regexes,
mask_byte,
})
}
pub fn scrub_in_place(&self, input: &mut [u8]) {
let haystack: &[u8] = input;
let mut ranges: Vec<(usize, usize)> = Vec::with_capacity(64);
if let Some(matcher) = &self.literal_matcher {
for m in matcher.find_iter(haystack) {
ranges.push((m.start(), m.end()));
}
}
if should_parallelize_regex_scan(haystack.len(), self.regexes.len()) {
let mut regex_ranges: Vec<(usize, usize)> = self
.regexes
.par_iter()
.flat_map_iter(|re| re.find_iter(haystack).map(|m| (m.start(), m.end())))
.collect();
ranges.append(&mut regex_ranges);
} else {
for re in &self.regexes {
for m in re.find_iter(haystack) {
ranges.push((m.start(), m.end()));
}
}
}
if ranges.is_empty() {
return;
}
ranges.sort_unstable();
let mut current = ranges[0];
for &(start, end) in &ranges[1..] {
if start <= current.1 {
if end > current.1 {
current.1 = end;
}
} else {
input[current.0..current.1].fill(self.mask_byte);
current = (start, end);
}
}
input[current.0..current.1].fill(self.mask_byte);
}
pub fn scrubbed(&self, input: &[u8]) -> Vec<u8> {
let mut out = input.to_vec();
self.scrub_in_place(&mut out);
out
}
pub fn scrub_lines<R: BufRead, W: Write>(
&self,
mut reader: R,
mut writer: W,
) -> io::Result<()> {
let mut line = Vec::with_capacity(8 * 1024);
loop {
line.clear();
let read = reader.read_until(b'\n', &mut line)?;
if read == 0 {
return Ok(());
}
self.scrub_in_place(&mut line);
writer.write_all(&line)?;
}
}
}
/// Decides whether the regex scan should be spread across threads.
///
/// Returns `true` only when the haystack is at least 1 MiB long and there are
/// at least four regexes to run.
fn should_parallelize_regex_scan(haystack_len: usize, regex_count: usize) -> bool {
    const MIN_HAYSTACK_BYTES: usize = 1024 * 1024;
    const MIN_REGEX_COUNT: usize = 4;

    if haystack_len < MIN_HAYSTACK_BYTES {
        return false;
    }
    regex_count >= MIN_REGEX_COUNT
}
/// Reports whether `pattern` contains none of the regex metacharacters
/// `. * + ? ( ) [ ] { } \ ^ $ |`, i.e. whether it can be matched as an exact
/// byte string.
///
/// The empty string counts as a plain literal.
fn is_plain_literal(pattern: &str) -> bool {
    // Every metacharacter is ASCII, so a byte-wise check is sufficient.
    const REGEX_META: &[u8] = b".*+?()[]{}\\^$|";
    pattern.bytes().all(|byte| !REGEX_META.contains(&byte))
}
/// Reads signature definitions from the text file at `path`.
///
/// Each line is trimmed; blank lines and lines starting with `#` are ignored.
/// A line containing `=` is split at the first `=` into a name and a pattern
/// (both trimmed). Any other line is taken as the pattern itself and named
/// `custom_line_N`, where `N` is its 1-based line number in the file.
///
/// # Errors
///
/// Returns [`ScrubError::SignatureIo`] when the file cannot be read.
pub fn parse_scrub_file(path: &Path) -> Result<Vec<SignatureSpec>, ScrubError> {
    let text = fs::read_to_string(path)?;
    let specs: Vec<SignatureSpec> = text
        .lines()
        .enumerate()
        .filter_map(|(index, raw)| {
            let entry = raw.trim();
            if entry.is_empty() || entry.starts_with('#') {
                return None;
            }
            let spec = match entry.split_once('=') {
                Some((name, pattern)) => SignatureSpec {
                    name: name.trim().to_string(),
                    pattern: pattern.trim().to_string(),
                },
                None => SignatureSpec {
                    name: format!("custom_line_{}", index + 1),
                    pattern: entry.to_string(),
                },
            };
            Some(spec)
        })
        .collect();
    Ok(specs)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The default signature set masks both the `ghp_…` and `AKIA…` fixtures.
    #[test]
    fn redacts_known_patterns() {
        let mut buffer =
            b"token=ghp_123456789012345678901234567890123456 aws=AKIA1234567890ABCDEF".to_vec();
        Scrubber::new().unwrap().scrub_in_place(&mut buffer);
        let text = String::from_utf8(buffer).unwrap();
        assert!(!text.contains("ghp_"));
        assert!(!text.contains("AKIA"));
    }

    /// A caller-supplied literal is masked byte-for-byte with the chosen mask.
    #[test]
    fn custom_scrub_file_works() {
        let spec = SignatureSpec {
            name: "foo".into(),
            pattern: "foo-secret".into(),
        };
        let scrubber = Scrubber::with_signatures(vec![spec], b'#').unwrap();
        let masked = scrubber.scrubbed(b"hello foo-secret world");
        assert_eq!(String::from_utf8(masked).unwrap(), "hello ########## world");
    }

    /// The default set also builds through the public custom-signature constructor.
    #[test]
    fn default_signatures_build_via_with_signatures() {
        Scrubber::with_signatures(default_signatures(), b'*').unwrap();
    }

    /// A malformed pattern whose name is not `trufflehog_*` is reported, not skipped.
    #[test]
    fn invalid_custom_regex_still_errors() {
        let spec = SignatureSpec {
            name: "broken".into(),
            pattern: "{".into(),
        };
        let failure = Scrubber::with_signatures(vec![spec], b'*').unwrap_err();
        assert!(matches!(failure, ScrubError::InvalidRegex { .. }));
    }

    /// Line-oriented scrubbing keeps line structure and masks in place.
    #[test]
    fn scrub_lines_redacts_line_delimited_input() {
        let scrubber = Scrubber::new().unwrap();
        let input = b"safe\nprefix ghp_123456789012345678901234567890123456 suffix\n";
        let mut sink = Vec::new();
        scrubber
            .scrub_lines(std::io::Cursor::new(input), &mut sink)
            .unwrap();
        assert_eq!(
            sink,
            b"safe\nprefix **************************************** suffix\n"
        );
    }
}