use crate::detection::{
Finding, FoundSecret,
patterns::PatternRegistry,
entropy::EntropyAnalyzer,
rules::RuleEngine
};
use crate::error::{CargoCryptError, CryptoResult};
use ignore::WalkBuilder;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Instant;
/// Options controlling which files the scanner visits and how the work is scheduled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScanConfig {
/// Files whose on-disk length (in bytes) exceeds this value are skipped.
pub max_file_size: u64,
/// Forwarded to the directory walker's `follow_links` option.
pub follow_links: bool,
/// When `false`, the directory walker is told to leave out hidden entries.
pub scan_hidden: bool,
/// Lower-case extensions to scan. An empty list disables this filter; files
/// that have no extension are never rejected by it.
pub include_extensions: Vec<String>,
/// Lower-case extensions that are always skipped.
pub exclude_extensions: Vec<String>,
/// Path fragments (compared case-insensitively) that cause a file to be skipped.
pub exclude_paths: Vec<String>,
/// When `true`, `scan_directory` processes files with rayon parallel iterators.
pub parallel: bool,
/// Size of a dedicated rayon thread pool, used only when `parallel` is `true`.
/// `None` runs on rayon's default pool.
pub num_threads: Option<usize>,
/// Forwarded to the directory walker's `ignore` and `git_ignore` options.
pub respect_gitignore: bool,
/// Optional maximum directory depth handed to the directory walker.
pub max_depth: Option<usize>,
}
impl Default for ScanConfig {
    /// Conservative defaults: 10 MiB size cap, no symlink following, hidden
    /// files left out, common binary/media/archive/office extensions and
    /// build or VCS directories excluded, parallel scanning enabled.
    fn default() -> Self {
        // Binary, media, archive and office formats that are not worth scanning as text.
        const SKIPPED_EXTENSIONS: &[&str] = &[
            "exe", "dll", "so", "dylib", "bin", "obj", "lib", "a",
            "png", "jpg", "jpeg", "gif", "bmp", "svg", "ico",
            "mp4", "avi", "mov", "wmv",
            "zip", "tar", "gz", "rar", "7z", "bz2",
            "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx",
        ];
        // Dependency, build-output and version-control directories.
        const SKIPPED_PATHS: &[&str] = &[
            "node_modules", "target", ".git", ".svn", ".hg",
            "build", "dist", "vendor", ".cargo",
        ];

        let to_owned_list =
            |items: &[&str]| -> Vec<String> { items.iter().map(|item| item.to_string()).collect() };

        Self {
            max_file_size: 10 * 1024 * 1024,
            follow_links: false,
            scan_hidden: false,
            include_extensions: Vec::new(),
            exclude_extensions: to_owned_list(SKIPPED_EXTENSIONS),
            exclude_paths: to_owned_list(SKIPPED_PATHS),
            parallel: true,
            num_threads: None,
            respect_gitignore: true,
            max_depth: None,
        }
    }
}
impl ScanConfig {
    /// Defaults restricted to common source, configuration and shell-script extensions.
    pub fn for_source_code() -> Self {
        const SOURCE_EXTENSIONS: &[&str] = &[
            "rs", "py", "js", "ts", "go", "java", "c", "cpp",
            "h", "hpp", "cs", "php", "rb", "swift", "kt", "scala",
            "json", "yaml", "yml", "toml", "ini", "cfg", "conf", "config",
            "env", "sh", "bash", "zsh", "ps1", "bat", "cmd",
        ];
        Self {
            include_extensions: SOURCE_EXTENSIONS.iter().map(|ext| ext.to_string()).collect(),
            ..Self::default()
        }
    }

    /// Defaults restricted to configuration-file extensions, with hidden files included.
    pub fn for_config_files() -> Self {
        const CONFIG_EXTENSIONS: &[&str] = &[
            "env", "json", "yaml", "yml", "toml", "ini", "cfg", "conf", "config", "properties",
        ];
        Self {
            include_extensions: CONFIG_EXTENSIONS.iter().map(|ext| ext.to_string()).collect(),
            scan_hidden: true,
            ..Self::default()
        }
    }

    /// Removes both extension filters so that no file is rejected by extension.
    pub fn scan_all_files(self) -> Self {
        Self {
            include_extensions: Vec::new(),
            exclude_extensions: Vec::new(),
            ..self
        }
    }

    /// Sets the maximum file size, in bytes, that will be scanned.
    pub fn with_max_file_size(self, size: u64) -> Self {
        Self { max_file_size: size, ..self }
    }

    /// Turns parallel directory scanning on or off.
    pub fn with_parallel(self, parallel: bool) -> Self {
        Self { parallel, ..self }
    }

    /// Requests a dedicated thread pool of `threads` workers for parallel scans.
    pub fn with_threads(self, threads: usize) -> Self {
        Self { num_threads: Some(threads), ..self }
    }
}
/// Outcome of scanning a single file.
#[derive(Debug, Clone)]
pub struct ScanResult {
/// Path of the file this result describes.
pub file_path: PathBuf,
/// Findings reported for the file; empty when the file was skipped.
pub findings: Vec<Finding>,
/// Wall-clock scan duration in milliseconds (left at 0 for skipped files).
pub scan_time_ms: u64,
/// Byte length of the content that was read (left at 0 for skipped files).
pub file_size: u64,
/// `true` when the file was not scanned.
pub skipped: bool,
/// Human-readable explanation of why the file was skipped, if it was.
pub skip_reason: Option<String>,
}
impl ScanResult {
    /// Creates an empty, non-skipped result for `file_path`.
    pub fn new(file_path: PathBuf) -> Self {
        Self {
            file_path,
            findings: Vec::new(),
            scan_time_ms: 0,
            file_size: 0,
            skipped: false,
            skip_reason: None,
        }
    }

    /// Marks the result as skipped and records why.
    pub fn skipped_with_reason(self, reason: String) -> Self {
        Self {
            skipped: true,
            skip_reason: Some(reason),
            ..self
        }
    }

    /// Replaces the findings list.
    pub fn with_findings(self, findings: Vec<Finding>) -> Self {
        Self { findings, ..self }
    }

    /// Records the scan duration in milliseconds.
    pub fn with_scan_time(self, time_ms: u64) -> Self {
        Self { scan_time_ms: time_ms, ..self }
    }

    /// Records the size of the scanned content in bytes.
    pub fn with_file_size(self, size: u64) -> Self {
        Self { file_size: size, ..self }
    }
}
/// Scans files and in-memory content for secrets by combining pattern
/// matching, rule evaluation, entropy analysis and keyword-context heuristics.
pub struct FileScanner {
/// Source of pattern matches (`find_all_matches`).
pattern_registry: Arc<PatternRegistry>,
/// Entropy analysis used both for detection and for confidence scoring.
entropy_analyzer: Arc<EntropyAnalyzer>,
/// Rule engine whose matches are reported under each rule's id.
rule_engine: Arc<RuleEngine>,
/// File-selection and scheduling options.
config: ScanConfig,
}
impl FileScanner {
pub fn new(config: ScanConfig) -> CryptoResult<Self> {
let pattern_registry = Arc::new(PatternRegistry::new()
.map_err(|e| CargoCryptError::detection_error(&format!("Failed to create pattern registry: {}", e)))?);
let entropy_analyzer = Arc::new(EntropyAnalyzer::new());
let rule_engine = Arc::new(RuleEngine::new());
Ok(Self {
pattern_registry,
entropy_analyzer,
rule_engine,
config,
})
}
/// Creates a scanner from caller-supplied components, wrapping each in an `Arc`.
pub fn with_components(
    pattern_registry: PatternRegistry,
    entropy_analyzer: EntropyAnalyzer,
    rule_engine: RuleEngine,
    config: ScanConfig,
) -> Self {
    let pattern_registry = Arc::new(pattern_registry);
    let entropy_analyzer = Arc::new(entropy_analyzer);
    let rule_engine = Arc::new(rule_engine);

    Self {
        pattern_registry,
        entropy_analyzer,
        rule_engine,
        config,
    }
}
/// Scans a single file and returns its findings.
///
/// Files rejected by the configured filters, and files that cannot be read as
/// UTF-8 text, yield a skipped result rather than an error.
///
/// # Errors
/// Propagates failures from the skip check (e.g. metadata lookup) and from
/// content scanning.
pub fn scan_file<P: AsRef<Path>>(&self, path: P) -> CryptoResult<ScanResult> {
    let target = path.as_ref();
    let started = Instant::now();
    let skipped = |reason: String| ScanResult::new(target.to_path_buf()).skipped_with_reason(reason);

    if let Some(reason) = self.should_skip_file(target)? {
        return Ok(skipped(reason));
    }

    let text = match fs::read_to_string(target) {
        Ok(text) => text,
        Err(err) => return Ok(skipped(format!("Failed to read file: {}", err))),
    };

    let byte_len = text.len() as u64;
    let findings = self.scan_content(&text, target)?;
    let elapsed_ms = started.elapsed().as_millis() as u64;

    let result = ScanResult::new(target.to_path_buf())
        .with_findings(findings)
        .with_scan_time(elapsed_ms)
        .with_file_size(byte_len);
    Ok(result)
}
pub fn scan_directory<P: AsRef<Path>>(&self, path: P) -> CryptoResult<Vec<ScanResult>> {
let start_time = Instant::now();
let mut builder = WalkBuilder::new(path.as_ref());
builder
.follow_links(self.config.follow_links)
.hidden(!self.config.scan_hidden)
.ignore(self.config.respect_gitignore)
.git_ignore(self.config.respect_gitignore);
if let Some(max_depth) = self.config.max_depth {
builder.max_depth(Some(max_depth));
}
let files: Vec<PathBuf> = builder
.build()
.filter_map(|entry| {
let entry = entry.ok()?;
let path = entry.path();
if path.is_file() {
Some(path.to_path_buf())
} else {
None
}
})
.collect();
let results = if self.config.parallel {
if let Some(num_threads) = self.config.num_threads {
rayon::ThreadPoolBuilder::new()
.num_threads(num_threads)
.build()
.map_err(|e| CargoCryptError::detection_error(&format!("Failed to create thread pool: {}", e)))?
.install(|| {
files.par_iter()
.map(|file| self.scan_file(file))
.collect::<Result<Vec<_>, _>>()
})?
} else {
files.par_iter()
.map(|file| self.scan_file(file))
.collect::<Result<Vec<_>, _>>()?
}
} else {
files.iter()
.map(|file| self.scan_file(file))
.collect::<Result<Vec<_>, _>>()?
};
tracing::info!(
"Scanned {} files in {:.2}s",
results.len(),
start_time.elapsed().as_secs_f64()
);
Ok(results)
}
/// Runs every detector over `content` and returns the findings, highest
/// confidence first.
///
/// Detection stages, in order:
/// 1. pattern registry matches (detector `pattern_matcher`),
/// 2. rule engine matches (detector = the rule's id),
/// 3. high-entropy tokens and base64-like runs (detector `entropy_analyzer`),
/// 4. keyword-context matches such as `password = ...` (detector
///    `contextual_analyzer`).
///
/// Stages 1-3 pass their base confidence through
/// `calculate_composite_confidence`; stage 4 uses its own confidence value
/// unchanged. Findings that share the same file and `(start, end)` span are
/// collapsed to the first one produced, so earlier stages win.
///
/// `file_path` is only used for labelling findings and for path-based
/// confidence adjustments; the file is not read here.
///
/// # Errors
/// Propagates any error returned by the rule engine.
pub fn scan_content(&self, content: &str, file_path: &Path) -> CryptoResult<Vec<Finding>> {
    let mut findings = Vec::new();
    // Exact (start, end) byte spans already reported by the pattern and rule stages.
    let mut found_positions = std::collections::HashSet::new();

    // Stage 1: registered patterns.
    let pattern_matches = self.pattern_registry.find_all_matches(content);
    for pattern_match in pattern_matches {
        let line_info = self.get_line_info(content, pattern_match.start);
        let context_lines = self.get_context_lines(content, line_info.line_number, 2);
        let secret = FoundSecret::new(
            pattern_match.matched_text.clone(),
            pattern_match.secret_type.to_string(),
            pattern_match.start,
            pattern_match.end,
            line_info.line_number,
            line_info.column_number,
        );
        let base_confidence = pattern_match.base_confidence;
        let context_text = context_lines.join(" ");
        let entropy_result = self.entropy_analyzer.analyze(&pattern_match.matched_text);
        let adjusted_confidence = self.calculate_composite_confidence(
            base_confidence,
            &pattern_match.matched_text,
            &context_text,
            &entropy_result,
            file_path,
        );
        let finding = Finding::new(
            file_path.to_path_buf(),
            secret,
            adjusted_confidence,
            "pattern_matcher".to_string(),
        )
        .with_context_lines(context_lines)
        .with_entropy_score(entropy_result.shannon_entropy);
        findings.push(finding);
        found_positions.insert((pattern_match.start, pattern_match.end));
    }

    // Stage 2: rule engine; spans already claimed by a pattern are not repeated.
    let rule_matches = self.rule_engine.execute_rules(content, Some(&file_path.to_string_lossy()))?;
    for rule_match in rule_matches {
        if found_positions.contains(&(rule_match.start, rule_match.end)) {
            continue;
        }
        let line_info = self.get_line_info(content, rule_match.start);
        let context_lines = self.get_context_lines(content, line_info.line_number, 2);
        let secret = FoundSecret::new(
            rule_match.matched_text.clone(),
            "custom_rule".to_string(),
            rule_match.start,
            rule_match.end,
            line_info.line_number,
            line_info.column_number,
        );
        let entropy_result = self.entropy_analyzer.analyze(&rule_match.matched_text);
        let context_text = context_lines.join(" ");
        let adjusted_confidence = self.calculate_composite_confidence(
            rule_match.confidence,
            &rule_match.matched_text,
            &context_text,
            &entropy_result,
            file_path,
        );
        let finding = Finding::new(
            file_path.to_path_buf(),
            secret,
            adjusted_confidence,
            rule_match.rule_id,
        )
        .with_context_lines(context_lines)
        .with_entropy_score(entropy_result.shannon_entropy);
        findings.push(finding);
        found_positions.insert((rule_match.start, rule_match.end));
    }

    // Stage 3: entropy-based candidates that do not collide with stages 1-2.
    let entropy_findings = self.detect_high_entropy_secrets(content, &found_positions);
    for (substring, start, entropy_result) in entropy_findings {
        let line_info = self.get_line_info(content, start);
        let context_lines = self.get_context_lines(content, line_info.line_number, 2);
        let context_text = context_lines.join(" ");
        let secret = FoundSecret::new(
            substring.clone(),
            self.classify_entropy_secret(&substring, &entropy_result),
            start,
            start + substring.len(),
            line_info.line_number,
            line_info.column_number,
        );
        let adjusted_confidence = self.calculate_composite_confidence(
            entropy_result.confidence,
            &substring,
            &context_text,
            &entropy_result,
            file_path,
        );
        let finding = Finding::new(
            file_path.to_path_buf(),
            secret,
            adjusted_confidence,
            "entropy_analyzer".to_string(),
        )
        .with_context_lines(context_lines)
        .with_entropy_score(entropy_result.shannon_entropy);
        findings.push(finding);
    }

    // Stage 4: values that follow secret-like keywords.
    let contextual_findings = self.detect_contextual_secrets(content, &found_positions);
    for (text, start, end, confidence) in contextual_findings {
        let line_info = self.get_line_info(content, start);
        let context_lines = self.get_context_lines(content, line_info.line_number, 2);
        let secret = FoundSecret::new(
            text,
            "contextual_pattern".to_string(),
            start,
            end,
            line_info.line_number,
            line_info.column_number,
        );
        let finding = Finding::new(
            file_path.to_path_buf(),
            secret,
            confidence,
            "contextual_analyzer".to_string(),
        )
        .with_context_lines(context_lines);
        findings.push(finding);
    }

    // Deduplicate before sorting so that the earliest stage keeps a contested span.
    self.deduplicate_findings(&mut findings);
    // Sort descending by confidence. Incomparable values (NaN) are treated as
    // equal rather than panicking in the middle of a scan.
    findings.sort_by(|a, b| {
        b.confidence
            .partial_cmp(&a.confidence)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    Ok(findings)
}
/// Decides whether `path` should be skipped, returning the reason if so.
///
/// Checks, in order: file size against `max_file_size`, that the path is a
/// regular file, the include/exclude extension lists (compared in lower
/// case; files without an extension pass both), and finally `exclude_paths`.
///
/// An `exclude_paths` entry without a path separator must equal a whole path
/// component (case-insensitively), so `build` excludes `build/out.txt` but
/// not `src/builder.rs`. An entry that contains `/` or `\` is matched as a
/// case-insensitive substring of the full path.
///
/// # Errors
/// Returns an error when the file's metadata cannot be read.
fn should_skip_file(&self, path: &Path) -> CryptoResult<Option<String>> {
    let metadata = fs::metadata(path)?;
    if metadata.len() > self.config.max_file_size {
        return Ok(Some(format!("File too large: {} bytes", metadata.len())));
    }
    if !metadata.is_file() {
        return Ok(Some("Not a regular file".to_string()));
    }

    if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
        let extension = extension.to_lowercase();
        if !self.config.include_extensions.is_empty()
            && !self.config.include_extensions.contains(&extension)
        {
            return Ok(Some(format!("Extension not in include list: {}", extension)));
        }
        if self.config.exclude_extensions.contains(&extension) {
            return Ok(Some(format!("Extension in exclude list: {}", extension)));
        }
    }

    let path_lower = path.to_string_lossy().to_lowercase();
    for exclude_path in &self.config.exclude_paths {
        let needle = exclude_path.to_lowercase();
        let excluded = if needle.contains('/') || needle.contains('\\') {
            // Multi-segment entries cannot equal a single component; keep substring matching.
            path_lower.contains(&needle)
        } else {
            path.components()
                .any(|component| component.as_os_str().to_string_lossy().to_lowercase() == needle)
        };
        if excluded {
            return Ok(Some(format!("Path contains excluded component: {}", exclude_path)));
        }
    }
    Ok(None)
}
/// Converts a byte offset into a 1-based line number and a 1-based column
/// counted in characters.
///
/// Only characters that start before `position` are counted, so an offset at
/// or past the end of `content` describes the position just after the last
/// character.
fn get_line_info(&self, content: &str, position: usize) -> LineInfo {
    let (line_number, column_number) = content
        .char_indices()
        .take_while(|&(offset, _)| offset < position)
        .fold((1, 1), |(line, column), (_, ch)| {
            if ch == '\n' {
                (line + 1, 1)
            } else {
                (line, column + 1)
            }
        });

    LineInfo {
        line_number,
        column_number,
    }
}
/// Returns the lines surrounding the 1-based `line_number`: up to `context`
/// lines before it, the line itself, and up to `context` lines after it.
///
/// The window is clipped to the content, and an empty vector is returned when
/// `line_number` lies entirely beyond the last line (no panic).
fn get_context_lines(&self, content: &str, line_number: usize, context: usize) -> Vec<String> {
    // Zero-based index of the first line in the window.
    let start = line_number.saturating_sub(context.saturating_add(1));
    // Exclusive zero-based end of the window, before clipping to the content.
    let end = line_number.saturating_add(context);

    // `skip`/`take` clip to the available lines without building a Vec of every line.
    content
        .lines()
        .skip(start)
        .take(end.saturating_sub(start))
        .map(String::from)
        .collect()
}
/// Adjusts `base_confidence` using words found near the match.
///
/// Each test/placeholder word found in the context or in the match itself
/// lowers the score by 0.2; a context that starts with a comment marker
/// lowers it by 0.1; each configuration/secret-related word in the context
/// raises it by 0.1. The result is limited to `0.0..=1.0`.
fn adjust_confidence_with_context(
    &self,
    base_confidence: f64,
    matched_text: &str,
    context: &str,
) -> f64 {
    const TEST_MARKERS: &[&str] = &[
        "test", "example", "sample", "placeholder", "dummy", "fake",
        "mock", "demo", "todo", "fixme", "changeme",
    ];
    const CONFIG_MARKERS: &[&str] = &[
        "config", "settings", "env", "environment", "production", "prod",
        "staging", "live", "secret", "key", "token", "password",
    ];
    const COMMENT_PREFIXES: &[&str] = &["//", "#", "/*"];

    let surrounding = context.to_lowercase();
    let candidate = matched_text.to_lowercase();
    let mut score = base_confidence;

    // Apply the penalties one at a time so the floating-point result is unchanged.
    let test_hits = TEST_MARKERS
        .iter()
        .filter(|marker| surrounding.contains(**marker) || candidate.contains(**marker))
        .count();
    for _ in 0..test_hits {
        score -= 0.2;
    }

    let leading = context.trim_start();
    if COMMENT_PREFIXES.iter().any(|prefix| leading.starts_with(*prefix)) {
        score -= 0.1;
    }

    let config_hits = CONFIG_MARKERS
        .iter()
        .filter(|marker| surrounding.contains(**marker))
        .count();
    for _ in 0..config_hits {
        score += 0.1;
    }

    score.max(0.0).min(1.0)
}
/// Combines the context-adjusted confidence with multiplicative factors for
/// entropy, length, character variety, file extension and known
/// false-positive shapes, then limits the result to `0.0..=1.0`.
fn calculate_composite_confidence(
    &self,
    base_confidence: f64,
    matched_text: &str,
    context: &str,
    entropy_result: &crate::detection::entropy::EntropyResult,
    file_path: &Path,
) -> f64 {
    let mut score = self.adjust_confidence_with_context(base_confidence, matched_text, context);

    // Entropy: penalise low-entropy text, reward very high entropy.
    let entropy = entropy_result.shannon_entropy;
    if entropy > 0.0 && entropy < 2.5 {
        score *= 0.7;
    } else if entropy > 4.5 {
        score *= 1.1;
    }

    // Length in bytes: very short and very long matches are less convincing.
    match matched_text.len() {
        0..=7 => score *= 0.6,
        41..=99 => score *= 1.05,
        len if len > 200 => score *= 0.8,
        _ => {}
    }

    // Character variety.
    match self.count_character_types(matched_text) {
        1 => score *= 0.7,
        kinds if kinds >= 3 => score *= 1.1,
        _ => {}
    }

    // File type. The "test" path penalty is only reached for extensions that
    // are not listed in the arms above it.
    if let Some(ext) = file_path.extension().and_then(|e| e.to_str()) {
        let ext = ext.to_lowercase();
        let factor = match ext.as_str() {
            "env" | "config" | "conf" | "ini" | "toml" | "yaml" | "yml" => 1.2,
            "rs" | "py" | "js" | "go" | "java" | "php" | "rb" => 1.0,
            "md" | "txt" | "rst" | "doc" => 0.7,
            _ if file_path.to_string_lossy().contains("test") => 0.5,
            _ => 1.0,
        };
        score *= factor;
    }

    if self.is_likely_false_positive(matched_text) {
        score *= 0.3;
    }

    score.max(0.0).min(1.0)
}
/// Finds high-entropy strings that earlier detectors did not report.
///
/// Two sources are examined: tokens produced by `tokenize_content` (kept when
/// the entropy analyzer marks them as likely secrets and they pass
/// `validate_entropy_candidate`), and base64-looking runs from
/// `find_base64_candidates` (kept when their entropy confidence exceeds 0.6).
///
/// Tokens that overlap any span in `found_positions` are ignored, including
/// tokens that completely contain such a span.
///
/// Returns `(text, start_byte_offset, entropy_result)` tuples.
fn detect_high_entropy_secrets(
    &self,
    content: &str,
    found_positions: &std::collections::HashSet<(usize, usize)>,
) -> Vec<(String, usize, crate::detection::entropy::EntropyResult)> {
    // Half-open interval overlap: [start, end) intersects [s, e).
    let overlaps_known = |start: usize, end: usize| {
        found_positions.iter().any(|&(s, e)| start < e && end > s)
    };

    let mut results = Vec::new();

    for (token, start_pos) in self.tokenize_content(content) {
        let end_pos = start_pos + token.len();
        if overlaps_known(start_pos, end_pos) {
            continue;
        }
        if token.len() < 8 {
            continue;
        }
        let entropy_result = self.entropy_analyzer.analyze(&token);
        if entropy_result.is_likely_secret
            && self.validate_entropy_candidate(&token, &entropy_result)
        {
            results.push((token, start_pos, entropy_result));
        }
    }

    for (candidate, start_pos) in self.find_base64_candidates(content, found_positions) {
        let entropy_result = self.entropy_analyzer.analyze(&candidate);
        if entropy_result.confidence > 0.6 {
            results.push((candidate, start_pos, entropy_result));
        }
    }

    results
}
fn detect_contextual_secrets(
&self,
content: &str,
found_positions: &std::collections::HashSet<(usize, usize)>,
) -> Vec<(String, usize, usize, f64)> {
let mut results = Vec::new();
let secret_keywords = [
("password", r#"[:\s=]+["']?([^"'\s]{8,})["']?"#),
("api_key", r#"[:\s=]+["']?([A-Za-z0-9_\-]{20,})["']?"#),
("secret", r#"[:\s=]+["']?([A-Za-z0-9_\-]{12,})["']?"#),
("token", r#"[:\s=]+["']?([A-Za-z0-9_\-]{20,})["']?"#),
("auth", r#"[:\s=]+["']?([A-Za-z0-9_\-]{16,})["']?"#),
("credential", r#"[:\s=]+["']?([^"'\s]{10,})["']?"#),
("private_key", r#"[:\s=]+["']?([A-Za-z0-9+/=]{40,})["']?"#),
];
for (keyword, pattern) in &secret_keywords {
let regex_pattern = format!(r"(?i){}{}", keyword, pattern);
if let Ok(regex) = regex::Regex::new(®ex_pattern) {
for cap in regex.captures_iter(content) {
if let Some(secret_match) = cap.get(1) {
let start = secret_match.start();
let end = secret_match.end();
if found_positions.contains(&(start, end)) {
continue;
}
let matched_text = secret_match.as_str();
if self.validate_contextual_candidate(matched_text, keyword) {
let confidence = self.calculate_contextual_confidence(matched_text, keyword);
results.push((matched_text.to_string(), start, end, confidence));
}
}
}
}
}
results
}
/// Assigns a coarse type label to a high-entropy string based on well-known
/// prefixes, hex length, character mix and the measured charset size.
fn classify_entropy_secret(
    &self,
    text: &str,
    entropy_result: &crate::detection::entropy::EntropyResult,
) -> String {
    let is_hex = || text.chars().all(|c| c.is_ascii_hexdigit());
    let mixes_all_classes = || {
        text.chars().any(|c| c.is_ascii_uppercase())
            && text.chars().any(|c| c.is_ascii_lowercase())
            && text.chars().any(|c| c.is_ascii_digit())
            && text.chars().any(|c| !c.is_ascii_alphanumeric())
    };

    let label = if text.starts_with("AKIA") {
        "aws_access_key"
    } else if text.starts_with("sk_") || text.starts_with("pk_") {
        "api_key"
    } else if text.len() == 40 && is_hex() {
        "sha1_hash_or_token"
    } else if text.len() == 64 && is_hex() {
        "sha256_hash_or_token"
    } else if mixes_all_classes() {
        "high_entropy_password"
    } else if entropy_result.charset_size > 50 {
        "high_entropy_token"
    } else {
        "high_entropy_string"
    };

    label.to_string()
}
/// Drops findings that repeat an already-seen (file, start, end) span,
/// keeping the first occurrence and preserving the original order.
fn deduplicate_findings(&self, findings: &mut Vec<Finding>) {
    let mut seen_spans = std::collections::HashSet::new();
    let unique: Vec<Finding> = findings
        .drain(..)
        .filter(|finding| {
            seen_spans.insert((
                finding.file_path.clone(),
                finding.secret.start_position,
                finding.secret.end_position,
            ))
        })
        .collect();
    *findings = unique;
}
/// Counts how many of four character classes occur in `text`: ASCII
/// lowercase, ASCII uppercase, ASCII digits, and everything else.
fn count_character_types(&self, text: &str) -> usize {
    let (mut lower, mut upper, mut digit, mut other) = (false, false, false, false);

    for ch in text.chars() {
        if ch.is_ascii_lowercase() {
            lower = true;
        } else if ch.is_ascii_uppercase() {
            upper = true;
        } else if ch.is_ascii_digit() {
            digit = true;
        } else {
            // Anything that is not ASCII alphanumeric counts as "special".
            other = true;
        }
    }

    usize::from(lower) + usize::from(upper) + usize::from(digit) + usize::from(other)
}
/// Returns `true` for strings that look like placeholders rather than real
/// secrets: those containing a well-known weak sequence, those of at least
/// 8 bytes made of one repeated character, and sequential runs.
fn is_likely_false_positive(&self, text: &str) -> bool {
    const WEAK_SEQUENCES: &[&str] = &[
        "aaaaaaa", "bbbbbbb", "1234567", "abcdefg",
        "qwertyu", "password", "12345678", "87654321",
        "00000000", "11111111", "ffffffff", "deadbeef",
        "cafebabe", "test1234", "admin123", "user1234",
    ];

    let lowered = text.to_lowercase();
    let has_weak_sequence = WEAK_SEQUENCES.iter().any(|weak| lowered.contains(*weak));
    if has_weak_sequence {
        return true;
    }

    let mut rest = text.chars();
    let single_repeated_char = match rest.next() {
        Some(first) => text.len() >= 8 && rest.all(|c| c == first),
        None => false,
    };

    single_repeated_char || self.is_sequential_pattern(text)
}
/// Splits `content` into maximal runs of alphanumeric characters and
/// `-`, `_`, `+`, `/`, `=`, returning each run of at least 8 bytes together
/// with its starting byte offset.
fn tokenize_content(&self, content: &str) -> Vec<(String, usize)> {
    let is_token_char = |ch: char| ch.is_alphanumeric() || "-_+/=".contains(ch);

    let mut tokens = Vec::new();
    // Byte offset where the token currently being read began, if any.
    let mut open_at: Option<usize> = None;

    for (offset, ch) in content.char_indices() {
        match (is_token_char(ch), open_at) {
            (true, None) => open_at = Some(offset),
            (false, Some(begin)) => {
                if offset - begin >= 8 {
                    tokens.push((content[begin..offset].to_string(), begin));
                }
                open_at = None;
            }
            _ => {}
        }
    }

    // A token may run to the very end of the content.
    if let Some(begin) = open_at {
        if content.len() - begin >= 8 {
            tokens.push((content[begin..].to_string(), begin));
        }
    }

    tokens
}
/// Finds base64-looking runs (20 or more base64 alphabet characters followed
/// by up to two `=`) that do not overlap any span in `found_positions`.
///
/// A run is kept when its length is a multiple of 4, or when its padding is
/// consistent with the remainder checks below. Returns `(text,
/// start_byte_offset)` pairs.
fn find_base64_candidates(
    &self,
    content: &str,
    found_positions: &std::collections::HashSet<(usize, usize)>,
) -> Vec<(String, usize)> {
    let mut candidates = Vec::new();
    let base64_regex = regex::Regex::new(r"[A-Za-z0-9+/]{20,}={0,2}")
        .expect("hard-coded base64 pattern is a valid regex");

    for m in base64_regex.find_iter(content) {
        let start = m.start();
        let end = m.end();
        // Half-open interval overlap; also catches runs that fully contain a known span.
        if found_positions.iter().any(|&(s, e)| start < e && end > s) {
            continue;
        }

        let candidate = m.as_str();
        let remainder = candidate.len() % 4;
        let plausible_padding = remainder == 0
            || (remainder == 2 && candidate.ends_with("=="))
            || (remainder == 3 && candidate.ends_with("="));
        if plausible_padding {
            candidates.push((candidate.to_string(), start));
        }
    }
    candidates
}
/// Applies sanity filters to an entropy-flagged token.
///
/// Rejects tokens with Shannon entropy below 3.5, a charset smaller than 10,
/// or a known false-positive shape. Tokens with entropy above 5.5 must also
/// consist solely of printable ASCII or ASCII whitespace.
fn validate_entropy_candidate(
    &self,
    text: &str,
    entropy_result: &crate::detection::entropy::EntropyResult,
) -> bool {
    let entropy = entropy_result.shannon_entropy;

    if entropy < 3.5 || entropy_result.charset_size < 10 || self.is_likely_false_positive(text) {
        return false;
    }

    // Extremely high entropy is only trusted for plain ASCII text.
    !(entropy > 5.5)
        || text
            .chars()
            .all(|c| c.is_ascii_graphic() || c.is_ascii_whitespace())
}
/// Checks whether a value captured after `keyword` is worth reporting.
///
/// Values containing placeholder words are rejected; the remaining checks
/// depend on the keyword (character variety for passwords, minimum length
/// for API keys, tokens and secrets).
fn validate_contextual_candidate(&self, text: &str, keyword: &str) -> bool {
    const PLACEHOLDER_WORDS: &[&str] =
        &["your", "my", "insert", "replace", "change", "enter", "here"];

    let lowered = text.to_lowercase();
    if PLACEHOLDER_WORDS.iter().any(|word| lowered.contains(*word)) {
        return false;
    }

    match keyword {
        "password" => self.count_character_types(text) >= 2,
        "api_key" | "token" => text.len() >= 20,
        "secret" => text.len() >= 12,
        _ => true,
    }
}
/// Scores a keyword-context match.
///
/// Starts from 0.6 and adds bonuses for the keyword kind, the number of
/// character classes present, a length above 30 bytes and a Shannon entropy
/// above 4.0. The result is capped at 0.95.
fn calculate_contextual_confidence(&self, text: &str, keyword: &str) -> f64 {
    let keyword_bonus = match keyword {
        "password" | "private_key" => 0.15,
        "api_key" | "token" | "secret" => 0.1,
        _ => 0.0,
    };
    let variety_bonus = (self.count_character_types(text) as f64) * 0.05;
    let length_bonus = if text.len() > 30 { 0.1 } else { 0.0 };

    // Accumulate in a fixed order so the floating-point result is unchanged.
    let mut score = 0.6;
    score += keyword_bonus;
    score += variety_bonus;
    score += length_bonus;

    if self.entropy_analyzer.analyze(text).shannon_entropy > 4.0 {
        score += 0.1;
    }

    score.min(0.95)
}
/// Returns `true` when `text` has at least four characters and every
/// character's code point is exactly one more (ascending run) or exactly one
/// less (descending run) than the one before it, e.g. `"abcd"` or `"4321"`.
///
/// The length requirement counts characters rather than bytes, so a couple
/// of multi-byte characters are never reported as a sequence.
fn is_sequential_pattern(&self, text: &str) -> bool {
    let codes: Vec<u32> = text.chars().map(u32::from).collect();
    if codes.len() < 4 {
        return false;
    }

    // Both comparisons add rather than subtract, so a NUL character (code
    // point 0) cannot underflow; `char` values are far below `u32::MAX`.
    let ascending = codes.windows(2).all(|pair| pair[1] == pair[0] + 1);
    let descending = codes.windows(2).all(|pair| pair[0] == pair[1] + 1);

    ascending || descending
}
}
/// Position of a byte offset within some content, as computed by `get_line_info`.
#[derive(Debug, Clone)]
struct LineInfo {
/// 1-based line number.
line_number: usize,
/// 1-based column, counted in characters from the start of the line.
column_number: usize,
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;
    use std::fs;

    /// Default and source-code presets expose the expected flags and extensions.
    #[test]
    fn test_scan_config_creation() {
        let config = ScanConfig::default();
        assert!(config.parallel);
        assert!(config.respect_gitignore);
        assert!(!config.scan_hidden);

        let source_config = ScanConfig::for_source_code();
        assert!(source_config.include_extensions.contains(&"rs".to_string()));
        assert!(source_config.include_extensions.contains(&"py".to_string()));
    }

    /// Excluded extensions, oversized files and missing files are handled by the skip check.
    #[test]
    fn test_should_skip_file() {
        let dir = TempDir::new().unwrap();

        // Excluded extension: the extension check runs before any path check.
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        let image = dir.path().join("image.png");
        fs::write(&image, b"not really a png").unwrap();
        let reason = scanner.should_skip_file(&image).unwrap();
        assert!(reason.unwrap().contains("exclude list"));

        // File larger than the configured limit.
        let tiny_limit = FileScanner::new(ScanConfig::default().with_max_file_size(4)).unwrap();
        let big = dir.path().join("big.txt");
        fs::write(&big, b"0123456789abcdef").unwrap();
        let reason = tiny_limit.should_skip_file(&big).unwrap();
        assert!(reason.unwrap().contains("File too large"));

        // Metadata lookup failure is reported as an error, not as a skip.
        let missing = dir.path().join("does_not_exist.txt");
        assert!(scanner.should_skip_file(&missing).is_err());
    }

    /// A well-known AWS example key is reported with a confidence in (0, 1].
    #[test]
    fn test_scan_content() {
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        let content = "AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE\nSECRET=my_secret_value_123";
        let path = Path::new("test.env");
        let findings = scanner.scan_content(content, path).unwrap();
        assert!(!findings.is_empty());
        assert!(findings.iter().any(|f| f.secret.value.contains("AKIA")));
        for finding in &findings {
            assert!(finding.confidence > 0.0);
            assert!(finding.confidence <= 1.0);
        }
    }

    #[test]
    fn test_line_info_calculation() {
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        let content = "line 1\nline 2\nline 3 with secret";
        let line_info = scanner.get_line_info(content, content.find("secret").unwrap());
        assert_eq!(line_info.line_number, 3);
        assert!(line_info.column_number > 10);
    }

    #[test]
    fn test_context_lines() {
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        let content = "line 1\nline 2\nline 3\nline 4\nline 5";
        let context = scanner.get_context_lines(content, 3, 1);
        assert_eq!(context.len(), 3);
        assert_eq!(context[1], "line 3");
    }

    #[test]
    fn test_confidence_adjustment() {
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        let adjusted = scanner.adjust_confidence_with_context(
            0.9,
            "test_secret",
            "this is a test example",
        );
        assert!(adjusted < 0.9);

        let adjusted = scanner.adjust_confidence_with_context(
            0.7,
            "real_secret",
            "production config secret",
        );
        assert!(adjusted > 0.7);
    }

    #[test]
    fn test_entropy_detection() {
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        let content = "password=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY";
        let path = Path::new("config.env");
        let findings = scanner.scan_content(content, path).unwrap();
        assert!(!findings.is_empty());
        let high_entropy_finding = findings.iter()
            .find(|f| f.secret.secret_type.contains("entropy") || f.secret.secret_type.contains("password"));
        assert!(high_entropy_finding.is_some());
    }

    /// Keyword-context input produces findings; any that come from the
    /// contextual analyzer carry a confidence within that analyzer's range.
    #[test]
    fn test_contextual_detection() {
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        let content = "api_key = sk_live_abcdef1234567890\ntoken: ghp_1234567890abcdef1234567890abcdef12345678";
        let path = Path::new("config.rs");
        let findings = scanner.scan_content(content, path).unwrap();
        assert!(!findings.is_empty());
        // Another detector may already have claimed the same span, so the
        // presence of a contextual finding is not guaranteed here.
        for finding in findings.iter().filter(|f| f.detector_name == "contextual_analyzer") {
            assert!(finding.confidence >= 0.6);
            assert!(finding.confidence <= 0.95);
        }
    }

    #[test]
    fn test_false_positive_detection() {
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        assert!(scanner.is_likely_false_positive("aaaaaaaaaa"));
        assert!(scanner.is_likely_false_positive("1234567890"));
        assert!(scanner.is_likely_false_positive("abcdefghij"));
        assert!(scanner.is_likely_false_positive("password123"));
        assert!(!scanner.is_likely_false_positive("AKIAIOSFODNN7EXAMPLE"));
        // The negative sample must not contain a weak run such as "1234567".
        assert!(!scanner.is_likely_false_positive("sk_live_9fK2xQ7mZp4LwT8v"));
    }

    #[test]
    fn test_character_type_counting() {
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        assert_eq!(scanner.count_character_types("abc"), 1);
        assert_eq!(scanner.count_character_types("ABC"), 1);
        assert_eq!(scanner.count_character_types("123"), 1);
        assert_eq!(scanner.count_character_types("Abc"), 2);
        assert_eq!(scanner.count_character_types("Abc123"), 3);
        assert_eq!(scanner.count_character_types("Abc123!"), 4);
    }

    #[test]
    fn test_sequential_pattern_detection() {
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        assert!(scanner.is_sequential_pattern("abcd"));
        assert!(scanner.is_sequential_pattern("1234"));
        assert!(scanner.is_sequential_pattern("dcba"));
        assert!(scanner.is_sequential_pattern("4321"));
        assert!(!scanner.is_sequential_pattern("abdc"));
        assert!(!scanner.is_sequential_pattern("1324"));
        assert!(!scanner.is_sequential_pattern("AKIAIOSFODNN7EXAMPLE"));
    }

    #[test]
    fn test_tokenization() {
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        let content = "key=AKIAIOSFODNN7EXAMPLE value=\"wJalrXUtnFEMI/K7MDENG\"";
        let tokens = scanner.tokenize_content(content);
        assert!(tokens.iter().any(|(token, _)| token.contains("AKIA")));
        assert!(tokens.iter().any(|(token, _)| token.contains("wJalrXUtnFEMI")));
        for (token, _) in &tokens {
            assert!(token.len() >= 8);
        }
    }

    #[test]
    fn test_base64_detection() {
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        let content = "secret=dGVzdF9zZWNyZXRfa2V5XzEyMzQ1Njc4OTA=";
        let found_positions = std::collections::HashSet::new();
        let candidates = scanner.find_base64_candidates(content, &found_positions);
        assert!(!candidates.is_empty());
        let base64_candidate = candidates.iter()
            .find(|(candidate, _)| candidate.contains("dGVzdF"));
        assert!(base64_candidate.is_some());
    }

    /// No two findings returned for one file share the same span.
    #[test]
    fn test_deduplication() {
        let scanner = FileScanner::new(ScanConfig::default()).unwrap();
        let content = "AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE\nAWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE";
        let path = Path::new("test.env");
        let findings = scanner.scan_content(content, path).unwrap();
        let mut positions = std::collections::HashSet::new();
        for finding in &findings {
            let key = (finding.secret.start_position, finding.secret.end_position);
            assert!(positions.insert(key), "Duplicate finding at same position");
        }
    }
}