#![allow(dead_code)]
use crate::config::{glob_match, ExcludeConfig};
use crate::detectors::IncrementalCache;
use crate::models::Finding;
use anyhow::{Context, Result};
use console::style;
use ignore::{WalkBuilder, WalkState};
use indicatif::{ProgressBar, ProgressStyle};
use std::path::{Path, PathBuf};
use std::sync::Arc;
/// File extensions (lowercase, without the leading dot) the analyzer can parse.
/// Any file whose extension is not in this list is skipped during discovery.
pub(crate) const SUPPORTED_EXTENSIONS: &[&str] = &[
    "py", "pyi", "ts", "tsx", "js", "jsx", "mjs", "rs", "go", "java", "c", "h", "cpp", "hpp", "cc", "cs", "kt", "kts", "rb", "php", "swift", ];
/// Outcome of the file-discovery phase of an analysis run.
pub(super) struct FileCollectionResult {
    /// Every discovered (validated, canonicalized) source file in the repo.
    pub all_files: Vec<PathBuf>,
    /// The subset of `all_files` that must actually be (re-)parsed.
    pub files_to_parse: Vec<PathBuf>,
    /// Findings recovered from the incremental cache for unchanged files.
    pub cached_findings: Vec<Finding>,
}
/// Hard cap (2 MiB) on the size of any single file accepted for analysis;
/// larger files are skipped with a warning in `validate_file`.
const MAX_ANALYSIS_FILE_BYTES: u64 = 2 * 1024 * 1024;
/// Vets a single candidate file before analysis.
///
/// Returns the canonicalized path when the file is a regular, reasonably
/// sized file that resolves to a location inside `repo_canonical`; returns
/// `None` (after logging a warning) otherwise.
fn validate_file(path: &Path, repo_canonical: &Path) -> Option<PathBuf> {
    // Stat without following symlinks so the link itself can be rejected.
    let meta = match std::fs::symlink_metadata(path) {
        Ok(m) => m,
        Err(e) => {
            tracing::warn!("Cannot stat file {}: {}", path.display(), e);
            return None;
        }
    };
    if meta.file_type().is_symlink() {
        tracing::warn!("Skipping symlink: {}", path.display());
        return None;
    }
    if meta.len() > MAX_ANALYSIS_FILE_BYTES {
        tracing::warn!(
            "Skipping oversized file: {} ({:.1}MB exceeds {}MB limit)",
            path.display(),
            meta.len() as f64 / (1024.0 * 1024.0),
            MAX_ANALYSIS_FILE_BYTES / (1024 * 1024),
        );
        return None;
    }
    let canonical = match path.canonicalize() {
        Ok(c) => c,
        Err(e) => {
            tracing::warn!("Cannot canonicalize {}: {}", path.display(), e);
            return None;
        }
    };
    // Path-traversal guard: the fully resolved path must stay in the repo.
    if !canonical.starts_with(repo_canonical) {
        tracing::warn!(
            "Skipping file outside repository boundary: {} (resolves to {})",
            path.display(),
            canonical.display(),
        );
        return None;
    }
    Some(canonical)
}
/// Walks `repo_path` (single-threaded) and returns a sorted list of
/// supported, validated source files.
///
/// Respects `.gitignore` and `.git/info/exclude`, skips hidden entries,
/// excluded glob patterns, symlinks, oversized files, and anything that
/// resolves outside the repository boundary.
pub fn collect_file_list(repo_path: &Path, exclude: &ExcludeConfig) -> Result<Vec<PathBuf>> {
    let repo_canonical = repo_path.canonicalize().with_context(|| {
        format!("Cannot canonicalize repository path: {}", repo_path.display())
    })?;
    let patterns = exclude.effective_patterns();
    // NOTE(review): unlike the parallel walkers in this module, this one sets
    // git_global(false) and does not register .repotoireignore — confirm the
    // divergence is intentional.
    let walker = WalkBuilder::new(repo_path)
        .hidden(true)
        .git_ignore(true)
        .git_global(false)
        .git_exclude(true)
        .build();
    let mut collected = Vec::new();
    for entry in walker.flatten() {
        let path = entry.path();
        if !path.is_file() {
            continue;
        }
        let supported = path
            .extension()
            .and_then(|e| e.to_str())
            .map_or(false, |ext| SUPPORTED_EXTENSIONS.contains(&ext));
        if !supported {
            continue;
        }
        if let Ok(rel) = path.strip_prefix(repo_path) {
            let rel_str = rel.to_string_lossy();
            if patterns.iter().any(|p| glob_match(p, &rel_str)) {
                continue;
            }
        }
        if let Some(valid) = validate_file(path, &repo_canonical) {
            collected.push(valid);
        }
    }
    collected.sort();
    Ok(collected)
}
/// Determines which files need parsing for this analysis run.
///
/// Three modes, in priority order:
/// 1. `--since <commit>`: parse only files changed since that commit.
/// 2. Incremental mode: parse only files the cache reports as changed.
/// 3. Full scan: parse everything.
///
/// In modes 1 and 2, findings for unchanged files are recovered from the
/// incremental cache instead of being recomputed. Progress is reported via
/// a spinner attached to `multi`.
pub(super) fn collect_files_for_analysis(
    repo_path: &Path,
    since: &Option<String>,
    is_incremental_mode: bool,
    incremental_cache: &mut IncrementalCache,
    multi: &indicatif::MultiProgress,
    spinner_style: &ProgressStyle,
    exclude: &ExcludeConfig,
) -> Result<FileCollectionResult> {
    let spinner = multi.add(ProgressBar::new_spinner());
    spinner.set_style(spinner_style.clone());

    if let Some(commit) = since.as_ref() {
        spinner.set_message(format!("Finding files changed since {}...", commit));
        spinner.enable_steady_tick(std::time::Duration::from_millis(100));
        let files_to_parse = get_changed_files_since(repo_path, commit)?;
        let all_files = collect_source_files(repo_path, exclude)?;
        spinner.finish_with_message(format!(
            "{}Found {} changed files (since {}) out of {} total",
            style("✓ ").green(),
            style(files_to_parse.len()).cyan(),
            style(commit).yellow(),
            style(all_files.len()).dim()
        ));
        let cached_findings =
            get_cached_findings_for_unchanged(&all_files, &files_to_parse, incremental_cache);
        return Ok(FileCollectionResult {
            all_files,
            files_to_parse,
            cached_findings,
        });
    }

    if is_incremental_mode {
        spinner.set_message("Discovering source files (incremental mode)...");
        spinner.enable_steady_tick(std::time::Duration::from_millis(100));
        let all_files = collect_source_files(repo_path, exclude)?;
        let files_to_parse = incremental_cache.changed_files(&all_files);
        let cache_stats = incremental_cache.stats();
        spinner.finish_with_message(format!(
            "{}Found {} changed files out of {} total ({} cached)",
            style("✓ ").green(),
            style(files_to_parse.len()).cyan(),
            style(all_files.len()).dim(),
            style(cache_stats.cached_files).dim()
        ));
        let cached_findings =
            get_cached_findings_for_unchanged(&all_files, &files_to_parse, incremental_cache);
        return Ok(FileCollectionResult {
            all_files,
            files_to_parse,
            cached_findings,
        });
    }

    // Full scan: everything gets parsed, nothing comes from the cache.
    spinner.set_message("Discovering source files...");
    spinner.enable_steady_tick(std::time::Duration::from_millis(100));
    let all_files = collect_source_files(repo_path, exclude)?;
    spinner.finish_with_message(format!(
        "{}Found {} source files",
        style("✓ ").green(),
        style(all_files.len()).cyan()
    ));
    Ok(FileCollectionResult {
        files_to_parse: all_files.clone(),
        all_files,
        cached_findings: Vec::new(),
    })
}
pub(crate) fn walk_files_to_channel(
repo_path: &Path,
exclude: &ExcludeConfig,
sender: crossbeam_channel::Sender<PathBuf>,
early_files: Option<Arc<std::sync::OnceLock<Vec<PathBuf>>>>,
) -> Result<Vec<PathBuf>> {
let repo_canonical = repo_path.canonicalize().with_context(|| {
format!(
"Cannot canonicalize repository path: {}",
repo_path.display()
)
})?;
let effective = exclude.effective_patterns();
let mut builder = WalkBuilder::new(repo_path);
builder
.hidden(true)
.git_ignore(true)
.git_global(true)
.git_exclude(true)
.require_git(false)
.add_custom_ignore_filename(".repotoireignore");
let files = std::sync::Mutex::new(Vec::new());
builder.build_parallel().run(|| {
let files = &files;
let repo_canonical = &repo_canonical;
let effective = &effective;
let repo_path_ref = repo_path;
Box::new(move |entry| {
let entry = match entry {
Ok(e) => e,
Err(_) => return WalkState::Continue,
};
let path = entry.path();
if !path.is_file() {
return WalkState::Continue;
}
let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
return WalkState::Continue;
};
if !SUPPORTED_EXTENSIONS.contains(&ext) {
return WalkState::Continue;
}
if let Ok(rel) = path.strip_prefix(repo_path_ref) {
let rel_str = rel.to_string_lossy();
if effective.iter().any(|p| glob_match(p, &rel_str)) {
return WalkState::Continue;
}
}
if let Some(validated) = validate_file(path, repo_canonical) {
if let Ok(mut f) = files.lock() {
f.push(validated);
}
}
WalkState::Continue
})
});
let mut files = files.into_inner().expect("walk mutex poisoned");
files.sort();
if let Some(early) = early_files {
let _ = early.set(files.clone());
}
for file in &files {
let _ = sender.send(file.clone());
}
drop(sender);
Ok(files)
}
/// Walks the repository in parallel and returns a sorted, validated list of
/// supported source files.
///
/// Honors `.gitignore`, the global gitignore, `.git/info/exclude`, and the
/// project-specific `.repotoireignore`; hidden entries, excluded patterns,
/// symlinks, oversized files, and out-of-boundary paths are skipped.
fn collect_source_files(repo_path: &Path, exclude: &ExcludeConfig) -> Result<Vec<PathBuf>> {
    let repo_canonical = repo_path.canonicalize().with_context(|| {
        format!("Cannot canonicalize repository path: {}", repo_path.display())
    })?;
    let patterns = exclude.effective_patterns();

    let mut builder = WalkBuilder::new(repo_path);
    builder
        .hidden(true)
        .git_ignore(true)
        .git_global(true)
        .git_exclude(true)
        .require_git(false)
        .add_custom_ignore_filename(".repotoireignore");

    let collected = std::sync::Mutex::new(Vec::new());
    builder.build_parallel().run(|| {
        // Each worker thread gets its own visitor; captures are shared refs.
        let collected = &collected;
        let repo_canonical = &repo_canonical;
        let patterns = &patterns;
        let root = repo_path;
        Box::new(move |entry| {
            let Ok(entry) = entry else {
                return WalkState::Continue;
            };
            let path = entry.path();
            let is_candidate = path.is_file()
                && path
                    .extension()
                    .and_then(|e| e.to_str())
                    .map_or(false, |ext| SUPPORTED_EXTENSIONS.contains(&ext));
            if !is_candidate {
                return WalkState::Continue;
            }
            if let Ok(rel) = path.strip_prefix(root) {
                let rel_str = rel.to_string_lossy();
                if patterns.iter().any(|p| glob_match(p, &rel_str)) {
                    return WalkState::Continue;
                }
            }
            if let Some(valid) = validate_file(path, repo_canonical) {
                if let Ok(mut guard) = collected.lock() {
                    guard.push(valid);
                }
            }
            WalkState::Continue
        })
    });

    let mut files = collected.into_inner().expect("walk mutex poisoned");
    files.sort();
    Ok(files)
}
/// Returns files changed since commit/branch/tag `since` (via `git diff`),
/// plus untracked files, restricted to supported extensions and validated.
///
/// # Errors
/// Fails if `since` looks like a flag, if the repository path cannot be
/// canonicalized, or if `git diff` itself fails. The untracked-file listing
/// is best-effort: failures there are silently ignored, as before.
fn get_changed_files_since(repo_path: &Path, since: &str) -> Result<Vec<PathBuf>> {
    use std::collections::HashSet;
    use std::process::Command;

    // Reject values git would parse as options (argument injection guard).
    if since.starts_with('-') {
        anyhow::bail!(
            "Invalid --since value '{}': must be a commit hash, branch name, or tag (cannot start with '-')",
            since
        );
    }
    let repo_canonical = repo_path.canonicalize().with_context(|| {
        format!("Cannot canonicalize repository path: {}", repo_path.display())
    })?;

    // Filter by extension up front: this avoids stat/canonicalize work (and
    // spurious warnings) on files we would drop at the end anyway.
    let has_supported_ext = |p: &Path| {
        p.extension()
            .and_then(|e| e.to_str())
            .map_or(false, |ext| SUPPORTED_EXTENSIONS.contains(&ext))
    };

    let output = Command::new("git")
        .args(["diff", "--name-only", since, "HEAD"])
        .current_dir(repo_path)
        .output()
        .with_context(|| format!("Failed to run git diff since '{}'", since))?;
    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        anyhow::bail!("git diff failed: {}", stderr.trim());
    }

    let mut files: Vec<PathBuf> = Vec::new();
    // O(1) duplicate detection instead of the previous O(n·m) Vec::contains.
    let mut seen: HashSet<PathBuf> = HashSet::new();
    let stdout = String::from_utf8_lossy(&output.stdout);
    for line in stdout.lines().filter(|l| !l.is_empty()) {
        let joined = repo_path.join(line);
        // Deleted files still show up in the diff output; skip them.
        if !joined.exists() || !has_supported_ext(&joined) {
            continue;
        }
        if let Some(validated) = validate_file(&joined, &repo_canonical) {
            if seen.insert(validated.clone()) {
                files.push(validated);
            }
        }
    }

    // Untracked (new) files are invisible to `git diff`; pick them up too.
    let untracked = Command::new("git")
        .args(["ls-files", "--others", "--exclude-standard"])
        .current_dir(repo_path)
        .output();
    let untracked_files = untracked
        .ok()
        .filter(|out| out.status.success())
        .map(|out| String::from_utf8_lossy(&out.stdout).to_string())
        .unwrap_or_default();
    for line in untracked_files.lines().filter(|l| !l.is_empty()) {
        let path = repo_path.join(line);
        if !path.exists() || !has_supported_ext(&path) {
            continue;
        }
        let Some(validated) = validate_file(&path, &repo_canonical) else {
            continue;
        };
        if seen.insert(validated.clone()) {
            files.push(validated);
        }
    }
    Ok(files)
}
/// Collects cached findings for every file in `all_files` that is NOT in
/// `changed_files`.
///
/// Membership is tested via a `HashSet`, making this O(all + changed)
/// instead of the previous O(all × changed) repeated linear scans.
fn get_cached_findings_for_unchanged(
    all_files: &[PathBuf],
    changed_files: &[PathBuf],
    incremental_cache: &IncrementalCache,
) -> Vec<Finding> {
    use std::collections::HashSet;
    let changed: HashSet<&PathBuf> = changed_files.iter().collect();
    all_files
        .iter()
        .filter(|f| !changed.contains(f))
        .flat_map(|f| incremental_cache.cached_findings(f))
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// A plain, small, in-repo file passes validation.
    #[test]
    fn test_validate_file_accepts_normal_file() {
        let tmp = TempDir::new().expect("create temp dir");
        let candidate = tmp.path().join("test.py");
        fs::write(&candidate, "print('hello')").expect("write test file");
        let root = tmp.path().canonicalize().expect("canonicalize path");
        assert!(validate_file(&candidate, &root).is_some());
    }

    /// A path that does not exist is rejected (stat fails).
    #[test]
    fn test_validate_file_rejects_nonexistent() {
        let tmp = TempDir::new().expect("create temp dir");
        let root = tmp.path().canonicalize().expect("canonicalize path");
        let missing = tmp.path().join("nope.py");
        assert!(validate_file(&missing, &root).is_none());
    }

    /// Files above MAX_ANALYSIS_FILE_BYTES are rejected.
    #[test]
    fn test_validate_file_rejects_oversized() {
        let tmp = TempDir::new().expect("create temp dir");
        let candidate = tmp.path().join("big.py");
        let payload = vec![b'x'; 2 * 1024 * 1024 + 1];
        fs::write(&candidate, &payload).expect("write oversized file");
        let root = tmp.path().canonicalize().expect("canonicalize path");
        assert!(validate_file(&candidate, &root).is_none());
    }

    /// Symlinks are rejected even when the target is itself valid.
    #[test]
    fn test_validate_file_rejects_symlink() {
        let tmp = TempDir::new().expect("create temp dir");
        let target = tmp.path().join("real.py");
        fs::write(&target, "x = 1").expect("write real file");
        let link = tmp.path().join("link.py");
        #[cfg(unix)]
        {
            std::os::unix::fs::symlink(&target, &link).expect("create symlink");
            let root = tmp.path().canonicalize().expect("canonicalize path");
            assert!(validate_file(&link, &root).is_none());
        }
    }

    /// `..` traversal that escapes the repo root is rejected.
    #[test]
    fn test_validate_file_rejects_outside_boundary() {
        let parent = TempDir::new().expect("create temp dir");
        let repo_root = parent.path().join("repo");
        fs::create_dir(&repo_root).expect("create repo dir");
        let escaped = parent.path().join("secret.py");
        fs::write(&escaped, "password = 'hunter2'").expect("write outside file");
        let canonical_root = repo_root.canonicalize().expect("canonicalize path");
        let sneaky = repo_root.join("..").join("secret.py");
        assert!(validate_file(&sneaky, &canonical_root).is_none());
    }
}