use crate::detectors::{ContextAnalyzer, PatternDetector};
use crate::models::{Finding, Location};
use anyhow::Result;
use ignore::WalkBuilder;
use rayon::prelude::*;
use std::fs;
use std::path::{Path, PathBuf};
/// Scans a directory tree on disk for secrets.
///
/// Build one with [`FilesystemScanner::new`] and adjust it with the `with_*`
/// builder methods before calling `scan`.
#[derive(Debug, Clone)]
pub struct FilesystemScanner {
    /// Directory (or file) at which the walk starts.
    root_path: PathBuf,
    /// Entropy threshold forwarded to the pattern detector for every line.
    entropy_threshold: f64,
    /// Files whose size in bytes exceeds this value are not scanned.
    max_file_size: u64,
    /// Whether gitignore rules (local, global and exclude) filter the walk.
    respect_gitignore: bool,
}
impl FilesystemScanner {
    /// Creates a scanner rooted at `root_path` with the default settings:
    /// entropy threshold `3.5`, a 1 MiB file-size cap, and gitignore rules
    /// respected.
    pub fn new(root_path: PathBuf) -> Self {
        Self {
            root_path,
            entropy_threshold: 3.5,
            max_file_size: 1024 * 1024,
            respect_gitignore: true,
        }
    }

    /// Replaces the entropy threshold passed to the pattern detector.
    #[must_use]
    pub fn with_entropy_threshold(mut self, threshold: f64) -> Self {
        self.entropy_threshold = threshold;
        self
    }

    /// Replaces the maximum size (in bytes) of files that will be scanned.
    #[must_use]
    pub fn with_max_file_size(mut self, size: u64) -> Self {
        self.max_file_size = size;
        self
    }

    /// Chooses whether gitignore rules are honoured while walking the tree.
    #[must_use]
    pub fn with_gitignore(mut self, respect: bool) -> Self {
        self.respect_gitignore = respect;
        self
    }

    /// Scans every eligible file under the root and returns all findings.
    ///
    /// Files are processed in parallel. A file whose scan returns an error
    /// contributes no findings instead of failing the whole scan.
    ///
    /// # Errors
    ///
    /// Returns an error if collecting the list of files fails.
    pub fn scan(&self) -> Result<Vec<Finding>> {
        let files = self.collect_files()?;
        let findings: Vec<Finding> = files
            .par_iter()
            .flat_map(|file_path| self.scan_file(file_path).unwrap_or_default())
            .collect();
        Ok(findings)
    }

    /// Walks the tree and returns the files that should be scanned.
    ///
    /// A path is kept when it is a regular file, is not larger than
    /// `max_file_size`, does not look binary, and is not flagged as vendor
    /// code by `ContextAnalyzer::analyze_file`.
    ///
    /// # Errors
    ///
    /// Returns the first error yielded by the directory walker; the walk is
    /// aborted at that point.
    fn collect_files(&self) -> Result<Vec<PathBuf>> {
        let walker = WalkBuilder::new(&self.root_path)
            .git_ignore(self.respect_gitignore)
            .git_global(self.respect_gitignore)
            .git_exclude(self.respect_gitignore)
            // Disable the hidden-file filter: dotfiles are walked as well.
            .hidden(false)
            .build();

        let mut files = Vec::new();
        for entry in walker {
            let entry = entry?;
            let path = entry.path();
            if !path.is_file() {
                continue;
            }
            // If the metadata cannot be read the size check is skipped and
            // the file stays a candidate.
            if let Ok(metadata) = fs::metadata(path) {
                if metadata.len() > self.max_file_size {
                    continue;
                }
            }
            if self.is_likely_binary(path) {
                continue;
            }
            if ContextAnalyzer::analyze_file(path).is_vendor {
                continue;
            }
            files.push(path.to_path_buf());
        }
        Ok(files)
    }

    /// Scans one file line by line and returns a finding per detected secret.
    ///
    /// Lockfiles, minified content, and files that cannot be read as UTF-8
    /// text yield an empty list. Lines longer than 5000 bytes and lines the
    /// context analyzer classifies as placeholders are skipped. Matches on
    /// comment lines have their confidence multiplied by `0.75`.
    fn scan_file(&self, file_path: &Path) -> Result<Vec<Finding>> {
        // The lockfile test only needs the name, so run it before reading.
        if self.is_lockfile(file_path) {
            return Ok(Vec::new());
        }
        let content = match fs::read_to_string(file_path) {
            Ok(content) => content,
            // Best effort: unreadable or non-UTF-8 files are skipped.
            Err(_) => return Ok(Vec::new()),
        };
        if self.is_minified(&content) {
            return Ok(Vec::new());
        }

        let lines: Vec<&str> = content.lines().collect();
        let detector = PatternDetector::new();
        let file_context = ContextAnalyzer::analyze_file(file_path);
        let mut findings = Vec::new();

        for (line_index, line) in lines.iter().enumerate() {
            // Cheap length guard first, then the placeholder heuristic.
            if line.len() > 5000 || ContextAnalyzer::is_placeholder(line) {
                continue;
            }
            let is_comment = ContextAnalyzer::is_comment(line);
            let pattern_matches = detector.scan_line_with_positions(line, self.entropy_threshold);
            for pattern_match in pattern_matches {
                let mut secret = pattern_match.secret;
                if is_comment {
                    secret.confidence *= 0.75;
                }

                // Neighbouring lines, absent at the start/end of the file.
                let line_before = line_index
                    .checked_sub(1)
                    .and_then(|previous| lines.get(previous))
                    .map(|text| text.to_string());
                let line_after = lines.get(line_index + 1).map(|text| text.to_string());

                secret.severity = ContextAnalyzer::adjust_severity(
                    secret.severity,
                    &Context {
                        line_before: line_before.clone(),
                        line_content: line.to_string(),
                        line_after: line_after.clone(),
                        is_test_file: file_context.is_test_file,
                        is_config_file: file_context.is_config_file,
                        is_documentation: file_context.is_documentation,
                        file_extension: file_context.file_extension.clone(),
                    },
                );

                let location = Location {
                    file_path: file_path.to_path_buf(),
                    // Reported line numbers are 1-based.
                    line_number: line_index + 1,
                    column_start: pattern_match.column_start,
                    column_end: pattern_match.column_end,
                    commit_hash: None,
                    commit_author: None,
                    commit_date: None,
                };
                let context = ContextAnalyzer::build_context(
                    line.to_string(),
                    line_before,
                    line_after,
                    &file_context,
                );
                findings.push(Finding::new(secret, location, context));
            }
        }
        Ok(findings)
    }

    /// Returns `true` when the file name of `path` is a known dependency
    /// lockfile. The comparison is exact and case-sensitive; a name that is
    /// missing or not valid UTF-8 is never a lockfile.
    fn is_lockfile(&self, path: &Path) -> bool {
        const LOCKFILES: &[&str] = &[
            "package-lock.json",
            "yarn.lock",
            "pnpm-lock.yaml",
            "Cargo.lock",
            "Gemfile.lock",
            "poetry.lock",
            "Pipfile.lock",
            "composer.lock",
            "go.sum",
            "flake.lock",
            "pubspec.lock",
            "packages.lock.json",
            "bun.lockb",
        ];
        path.file_name()
            .and_then(|name| name.to_str())
            .map_or(false, |name| LOCKFILES.contains(&name))
    }

    /// Heuristically decides whether `content` is minified.
    ///
    /// Content counts as minified when its first line is longer than 10 000
    /// bytes, or when it is larger than 50 000 bytes yet has between 1 and 9
    /// lines.
    fn is_minified(&self, content: &str) -> bool {
        if let Some(first_line) = content.lines().next() {
            if first_line.len() > 10_000 {
                return true;
            }
        }
        if content.len() <= 50_000 {
            return false;
        }
        // Only "fewer than 10" matters, so stop counting after ten lines.
        let line_count = content.lines().take(10).count();
        line_count > 0 && line_count < 10
    }

    /// Heuristically decides whether `path` refers to a file that should be
    /// skipped as binary.
    ///
    /// The extension is compared (lower-cased) against a fixed list first;
    /// otherwise up to 512 bytes are read from the start of the file and a
    /// NUL byte marks it as binary. A file that cannot be opened or read is
    /// reported as not binary.
    fn is_likely_binary(&self, path: &Path) -> bool {
        use std::io::Read;

        const BINARY_EXTENSIONS: &[&str] = &[
            "exe", "dll", "so", "dylib", "bin", "com", "msi",
            "o", "a", "lib", "obj", "class", "pyc", "pyo", "elc", "beam",
            "zip", "tar", "gz", "bz2", "xz", "7z", "rar", "zst", "lz4", "lzma",
            "jpg", "jpeg", "png", "gif", "bmp", "ico", "svg", "webp", "tiff", "avif",
            "mp3", "mp4", "avi", "mov", "mkv", "flac", "wav", "ogg", "webm",
            "woff", "woff2", "ttf", "eot", "otf",
            "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
            "dat", "db", "sqlite", "sqlite3", "mdb", "parquet", "arrow",
            "iso", "img", "dmg", "vmdk",
            "map",
        ];

        let extension = path.extension().and_then(|ext| ext.to_str());
        if let Some(ext) = extension {
            if BINARY_EXTENSIONS.contains(&ext.to_lowercase().as_str()) {
                return true;
            }
        }

        // A single small read needs no buffering layer on top of the file.
        if let Ok(mut file) = fs::File::open(path) {
            let mut buffer = [0u8; 512];
            if let Ok(read_len) = file.read(&mut buffer) {
                if buffer[..read_len].contains(&0) {
                    return true;
                }
            }
        }
        false
    }

    /// Reports how many files would be scanned and their combined size in
    /// bytes. Files whose metadata cannot be read count towards
    /// `total_files` but add nothing to `total_size`.
    ///
    /// # Errors
    ///
    /// Returns an error if collecting the list of files fails.
    pub fn get_stats(&self) -> Result<ScanStats> {
        let files = self.collect_files()?;
        let total_size: u64 = files
            .iter()
            .filter_map(|file| fs::metadata(file).ok())
            .map(|metadata| metadata.len())
            .sum();
        Ok(ScanStats {
            total_files: files.len(),
            total_size,
        })
    }
}
use crate::models::Context;
/// Summary of the files a scan would cover.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ScanStats {
    /// Number of files selected for scanning.
    pub total_files: usize,
    /// Combined size of those files, in bytes.
    pub total_size: u64,
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// A freshly built scanner carries the default threshold and honours
    /// gitignore rules.
    #[test]
    fn test_filesystem_scanner_creation() {
        let FilesystemScanner {
            entropy_threshold,
            respect_gitignore,
            ..
        } = FilesystemScanner::new(PathBuf::from("."));
        assert_eq!(entropy_threshold, 3.5);
        assert!(respect_gitignore);
    }

    /// Scanning a directory containing an AWS-style access key yields at
    /// least one finding.
    #[test]
    fn test_scan_file_with_secret() -> Result<()> {
        let workspace = TempDir::new()?;
        let secret_file = workspace.path().join("test.txt");
        fs::write(&secret_file, "AWS_ACCESS_KEY=AKIAZ52HGXYRN4WBTEST")?;

        let findings = FilesystemScanner::new(workspace.path().to_path_buf()).scan()?;
        assert!(!findings.is_empty());
        Ok(())
    }

    /// Known binary extensions are rejected by name; a source-file
    /// extension is not.
    #[test]
    fn test_skip_binary_files() {
        let scanner = FilesystemScanner::new(PathBuf::from("."));
        assert!(scanner.is_likely_binary(Path::new("test.exe")));
        assert!(scanner.is_likely_binary(Path::new("image.png")));
        assert!(!scanner.is_likely_binary(Path::new("code.rs")));
    }
}