use crate::config::{glob_match, ExcludeConfig};
use anyhow::{Context, Result};
use ignore::WalkBuilder;
use std::path::{Path, PathBuf};
/// File extensions (without the leading dot) that `collect_file_list` keeps.
///
/// The comparison against a path's extension is an exact, case-sensitive
/// string match, so e.g. `"PY"` is not treated as supported.
pub(crate) const SUPPORTED_EXTENSIONS: &[&str] = &[
    // Python
    "py", "pyi",
    // TypeScript / JavaScript
    "ts", "tsx", "js", "jsx", "mjs",
    // Rust, Go, Java
    "rs", "go", "java",
    // C / C++
    "c", "h", "cpp", "hpp", "cc",
    // C#, Kotlin, Ruby, PHP, Swift
    "cs", "kt", "kts", "rb", "php", "swift",
];
/// Largest file size in bytes (2 MiB) that `validate_file` accepts; files whose
/// metadata reports a strictly greater length are skipped with a warning.
const MAX_ANALYSIS_FILE_BYTES: u64 = 2 * 1024 * 1024;
/// Checks whether `path` is safe to analyze and, if so, returns its canonical form.
///
/// A path is rejected (with a `tracing` warning and a `None` result) when:
/// - its metadata cannot be read,
/// - it is itself a symbolic link,
/// - it is larger than `MAX_ANALYSIS_FILE_BYTES`,
/// - it cannot be canonicalized, or
/// - its canonical form does not lie under `repo_canonical`.
///
/// `repo_canonical` is compared against the canonicalized `path`, so callers
/// are expected to pass an already-canonical repository root.
fn validate_file(path: &Path, repo_canonical: &Path) -> Option<PathBuf> {
    // `symlink_metadata` inspects the link itself instead of following it,
    // which is what makes the symlink check below possible.
    let metadata = match std::fs::symlink_metadata(path) {
        Ok(metadata) => metadata,
        Err(err) => {
            tracing::warn!("Cannot stat file {}: {}", path.display(), err);
            return None;
        }
    };

    if metadata.file_type().is_symlink() {
        tracing::warn!("Skipping symlink: {}", path.display());
        return None;
    }

    let size_bytes = metadata.len();
    if size_bytes > MAX_ANALYSIS_FILE_BYTES {
        tracing::warn!(
            "Skipping oversized file: {} ({:.1}MB exceeds {}MB limit)",
            path.display(),
            size_bytes as f64 / (1024.0 * 1024.0),
            MAX_ANALYSIS_FILE_BYTES / (1024 * 1024),
        );
        return None;
    }

    // Canonicalizing resolves `..` components and symlinked ancestors, so the
    // prefix test below is made against the real on-disk location.
    let resolved = match path.canonicalize() {
        Ok(resolved) => resolved,
        Err(err) => {
            tracing::warn!("Cannot canonicalize {}: {}", path.display(), err);
            return None;
        }
    };

    if resolved.starts_with(repo_canonical) {
        Some(resolved)
    } else {
        tracing::warn!(
            "Skipping file outside repository boundary: {} (resolves to {})",
            path.display(),
            resolved.display(),
        );
        None
    }
}
pub fn collect_file_list(repo_path: &Path, exclude: &ExcludeConfig) -> Result<Vec<PathBuf>> {
let repo_canonical = repo_path.canonicalize().with_context(|| {
format!(
"Cannot canonicalize repository path: {}",
repo_path.display()
)
})?;
let effective = exclude.effective_patterns();
let mut files = Vec::new();
let walker = WalkBuilder::new(repo_path)
.hidden(true)
.git_ignore(true)
.git_global(false)
.git_exclude(true)
.build();
for entry in walker.filter_map(|e| e.ok()) {
let path = entry.path();
if !path.is_file() {
continue;
}
let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
continue;
};
if !SUPPORTED_EXTENSIONS.contains(&ext) {
continue;
}
if let Ok(rel) = path.strip_prefix(repo_path) {
let rel_str = rel.to_string_lossy();
if effective.iter().any(|p| glob_match(p, &rel_str)) {
continue;
}
}
if let Some(validated) = validate_file(path, &repo_canonical) {
files.push(validated);
}
}
files.sort();
Ok(files)
}
/// Unit tests for `validate_file`, each run against a throwaway temp directory.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Creates `path` with exactly `len` bytes without building the contents
    /// in memory.
    fn create_file_of_len(path: &Path, len: u64) {
        let file = fs::File::create(path).expect("create sized file");
        file.set_len(len).expect("set file length");
    }

    #[test]
    fn test_validate_file_accepts_normal_file() {
        let dir = TempDir::new().expect("create temp dir");
        let file = dir.path().join("test.py");
        fs::write(&file, "print('hello')").expect("write test file");
        let repo_canonical = dir.path().canonicalize().expect("canonicalize path");
        assert!(validate_file(&file, &repo_canonical).is_some());
    }

    #[test]
    fn test_validate_file_rejects_nonexistent() {
        let dir = TempDir::new().expect("create temp dir");
        let repo_canonical = dir.path().canonicalize().expect("canonicalize path");
        let fake = dir.path().join("nope.py");
        assert!(validate_file(&fake, &repo_canonical).is_none());
    }

    #[test]
    fn test_validate_file_rejects_oversized() {
        let dir = TempDir::new().expect("create temp dir");
        let file = dir.path().join("big.py");
        // Derived from the constant so the test tracks any change to the limit.
        create_file_of_len(&file, MAX_ANALYSIS_FILE_BYTES + 1);
        let repo_canonical = dir.path().canonicalize().expect("canonicalize path");
        assert!(validate_file(&file, &repo_canonical).is_none());
    }

    #[test]
    fn test_validate_file_accepts_file_at_size_limit() {
        let dir = TempDir::new().expect("create temp dir");
        let file = dir.path().join("limit.py");
        // The limit is inclusive: only files strictly larger are rejected.
        create_file_of_len(&file, MAX_ANALYSIS_FILE_BYTES);
        let repo_canonical = dir.path().canonicalize().expect("canonicalize path");
        assert!(validate_file(&file, &repo_canonical).is_some());
    }

    #[test]
    #[cfg(unix)]
    fn test_validate_file_rejects_symlink() {
        let dir = TempDir::new().expect("create temp dir");
        let real = dir.path().join("real.py");
        fs::write(&real, "x = 1").expect("write real file");
        let link = dir.path().join("link.py");
        std::os::unix::fs::symlink(&real, &link).expect("create symlink");
        let repo_canonical = dir.path().canonicalize().expect("canonicalize path");
        assert!(validate_file(&link, &repo_canonical).is_none());
    }

    #[test]
    fn test_validate_file_rejects_outside_boundary() {
        let parent = TempDir::new().expect("create temp dir");
        let repo = parent.path().join("repo");
        fs::create_dir(&repo).expect("create repo dir");
        let outside = parent.path().join("secret.py");
        fs::write(&outside, "password = 'hunter2'").expect("write outside file");
        let repo_canonical = repo.canonicalize().expect("canonicalize path");
        // `repo/../secret.py` exists but resolves to a sibling of the repo.
        let traversal_path = repo.join("..").join("secret.py");
        assert!(validate_file(&traversal_path, &repo_canonical).is_none());
    }
}