use crate::detectors::{ContextAnalyzer, PatternDetector};
use crate::models::{Context, Finding, Location};
use crate::scan_warn;
use anyhow::Result;
use std::path::PathBuf;
pub struct S3Scanner {
bucket: String,
prefix: Option<String>,
region: Option<String>,
entropy_threshold: f64,
max_object_size: u64,
custom_patterns: Vec<crate::config::settings::CustomPattern>,
}
const BINARY_EXTENSIONS: &[&str] = &[
"zip", "tar", "gz", "bz2", "xz", "7z", "rar", "zst", "lz4", "exe", "dll", "so", "dylib",
"bin", "jpg", "jpeg", "png", "gif", "bmp", "ico", "svg", "webp", "tiff", "avif", "mp3",
"mp4", "avi", "mov", "mkv", "flac", "wav", "ogg", "webm", "woff", "woff2", "ttf", "eot",
"otf", "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "dat", "db", "sqlite", "sqlite3",
"parquet", "arrow", "iso", "img", "dmg", "vmdk", "map", "o", "a", "lib", "obj", "class",
"pyc",
];
impl S3Scanner {
pub fn new(bucket: String) -> Self {
Self {
bucket,
prefix: None,
region: None,
entropy_threshold: 3.5,
max_object_size: 5 * 1024 * 1024, custom_patterns: Vec::new(),
}
}
pub fn with_prefix(mut self, prefix: String) -> Self {
self.prefix = Some(prefix);
self
}
pub fn with_region(mut self, region: String) -> Self {
self.region = Some(region);
self
}
pub fn with_entropy_threshold(mut self, threshold: f64) -> Self {
self.entropy_threshold = threshold;
self
}
pub fn with_max_object_size(mut self, size: u64) -> Self {
self.max_object_size = size;
self
}
pub fn with_custom_patterns(
mut self,
patterns: Vec<crate::config::settings::CustomPattern>,
) -> Self {
self.custom_patterns = patterns;
self
}
pub async fn scan(&self) -> Result<Vec<Finding>> {
let aws_config = self.build_aws_config().await;
let client = aws_sdk_s3::Client::new(&aws_config);
let detector = if self.custom_patterns.is_empty() {
PatternDetector::new()
} else {
PatternDetector::with_custom_patterns(&self.custom_patterns)
};
let mut all_findings: Vec<Finding> = Vec::new();
let mut continuation_token: Option<String> = None;
loop {
let mut req = client
.list_objects_v2()
.bucket(&self.bucket)
.max_keys(1000);
if let Some(ref prefix) = self.prefix {
req = req.prefix(prefix);
}
if let Some(ref token) = continuation_token {
req = req.continuation_token(token);
}
let response = req.send().await.map_err(|e| {
anyhow::anyhow!(
"Failed to list objects in s3://{}: {}",
self.bucket,
e
)
})?;
let contents = response.contents();
for obj in contents {
let key = match obj.key() {
Some(k) => k.to_string(),
None => continue,
};
if self.is_binary_key(&key) {
continue;
}
let size = obj.size().unwrap_or(0) as u64;
if size > self.max_object_size || size == 0 {
continue;
}
if key.ends_with('/') {
continue;
}
match self
.scan_object(&client, &key, &detector)
.await
{
Ok(findings) => all_findings.extend(findings),
Err(e) => {
scan_warn!("s3", "failed to scan s3://{}/{}: {}", self.bucket, key, e);
}
}
}
if response.is_truncated() == Some(true) {
continuation_token = response.next_continuation_token().map(|s| s.to_string());
} else {
break;
}
}
Ok(all_findings)
}
async fn scan_object(
&self,
client: &aws_sdk_s3::Client,
key: &str,
detector: &PatternDetector,
) -> Result<Vec<Finding>> {
let resp = client
.get_object()
.bucket(&self.bucket)
.key(key)
.send()
.await?;
let body_bytes = resp.body.collect().await?.into_bytes();
if body_bytes.len() >= 2 && body_bytes[..body_bytes.len().min(512)].contains(&0u8) {
return Ok(Vec::new());
}
let content = match String::from_utf8(body_bytes.to_vec()) {
Ok(s) => s,
Err(_) => return Ok(Vec::new()), };
let virtual_path = PathBuf::from(format!("s3://{}/{}", self.bucket, key));
self.scan_text_content(&content, &virtual_path, detector)
}
fn scan_text_content(
&self,
content: &str,
virtual_path: &std::path::Path,
detector: &PatternDetector,
) -> Result<Vec<Finding>> {
let mut findings = Vec::new();
let file_context = ContextAnalyzer::analyze_file(virtual_path);
let lines: Vec<&str> = content.lines().collect();
for (line_num, line) in lines.iter().enumerate() {
if ContextAnalyzer::is_placeholder(line) || line.len() > 5000 {
continue;
}
let is_comment = ContextAnalyzer::is_comment(line);
let pattern_matches =
detector.scan_line_with_positions(line, self.entropy_threshold);
for pm in pattern_matches {
let mut secret = pm.secret;
if is_comment {
secret.confidence *= 0.75;
}
let line_before = if line_num > 0 {
Some(lines[line_num - 1].to_string())
} else {
None
};
let line_after = if line_num + 1 < lines.len() {
Some(lines[line_num + 1].to_string())
} else {
None
};
secret.severity = ContextAnalyzer::adjust_severity(
secret.severity,
&Context {
line_before: line_before.clone(),
line_content: line.to_string(),
line_after: line_after.clone(),
is_test_file: file_context.is_test_file,
is_config_file: file_context.is_config_file,
is_documentation: file_context.is_documentation,
file_extension: file_context.file_extension.clone(),
},
);
let location = Location {
file_path: virtual_path.to_path_buf(),
line_number: line_num + 1,
column_start: pm.column_start,
column_end: pm.column_end,
commit_hash: None,
commit_author: None,
commit_date: None,
};
let context = ContextAnalyzer::build_context(
line.to_string(),
line_before,
line_after,
&file_context,
);
findings.push(Finding::new(secret, location, context));
}
}
Ok(findings)
}
fn is_binary_key(&self, key: &str) -> bool {
if let Some(dot_pos) = key.rfind('.') {
let ext = &key[dot_pos + 1..];
BINARY_EXTENSIONS.contains(&ext.to_lowercase().as_str())
} else {
false
}
}
pub async fn count_objects(&self) -> Result<usize> {
let aws_config = self.build_aws_config().await;
let client = aws_sdk_s3::Client::new(&aws_config);
let mut count = 0usize;
let mut continuation_token: Option<String> = None;
loop {
let mut req = client
.list_objects_v2()
.bucket(&self.bucket)
.max_keys(1000);
if let Some(ref prefix) = self.prefix {
req = req.prefix(prefix);
}
if let Some(ref token) = continuation_token {
req = req.continuation_token(token);
}
let response = req.send().await?;
let contents = response.contents();
count += contents
.iter()
.filter(|o| {
let key = o.key().unwrap_or("");
let size = o.size().unwrap_or(0) as u64;
!key.ends_with('/')
&& size > 0
&& size <= self.max_object_size
&& !self.is_binary_key(key)
})
.count();
if response.is_truncated() == Some(true) {
continuation_token = response.next_continuation_token().map(|s| s.to_string());
} else {
break;
}
}
Ok(count)
}
async fn build_aws_config(&self) -> aws_config::SdkConfig {
let mut loader = aws_config::defaults(aws_config::BehaviorVersion::latest());
if let Some(ref region) = self.region {
loader = loader.region(aws_config::Region::new(region.clone()));
}
loader.load().await
}
}