repotoire 0.7.1

Graph-powered code analysis CLI. 110 detectors for security, architecture, bus factor, and code quality.
Documentation
//! File collection and discovery for the analyze command.
//!
//! Provides `collect_file_list` (used by `engine/stages/collect.rs`) and the
//! `validate_file` helper that enforces symlink/boundary/size policy.

use crate::config::{glob_match, ExcludeConfig};

use anyhow::{Context, Result};
use ignore::WalkBuilder;
use std::path::{Path, PathBuf};

/// File extensions (written without the leading dot) that are eligible for
/// analysis, grouped by language.
pub(crate) const SUPPORTED_EXTENSIONS: &[&str] = &[
    // Python
    "py", "pyi",
    // TypeScript
    "ts", "tsx",
    // JavaScript
    "js", "jsx", "mjs",
    // Rust
    "rs",
    // Go
    "go",
    // Java
    "java",
    // C
    "c", "h",
    // C++
    "cpp", "hpp", "cc",
    // C#
    "cs",
    // Kotlin
    "kt", "kts",
    // Ruby
    "rb",
    // PHP
    "php",
    // Swift
    "swift",
];

/// Maximum file size, in bytes, accepted for analysis (2 MiB).
///
/// `validate_file` skips files whose size is strictly greater than this value,
/// so a file of exactly this size is still accepted.
///
/// NOTE(review): the original comment says this matches the parser guardrail;
/// that limit is defined outside this file, so confirm the two stay in sync.
const MAX_ANALYSIS_FILE_BYTES: u64 = 2 * 1024 * 1024;

/// Apply the symlink / size / repository-boundary policy to a single path.
///
/// The checks run in this order, and the first failing one logs a warning and
/// yields `None`:
///
/// 1. The path must be stat-able and must not itself be a symlink.
/// 2. Its size must not exceed `MAX_ANALYSIS_FILE_BYTES`.
/// 3. Its canonical form must live underneath `repo_canonical`.
///
/// `repo_canonical` is expected to already be canonicalized by the caller.
/// On success the canonicalized path is returned.
fn validate_file(path: &Path, repo_canonical: &Path) -> Option<PathBuf> {
    // `symlink_metadata` does not follow links, so it describes the entry itself.
    let meta = match std::fs::symlink_metadata(path) {
        Ok(meta) => meta,
        Err(e) => {
            tracing::warn!("Cannot stat file {}: {}", path.display(), e);
            return None;
        }
    };

    if meta.file_type().is_symlink() {
        tracing::warn!("Skipping symlink: {}", path.display());
        return None;
    }

    let size = meta.len();
    if size > MAX_ANALYSIS_FILE_BYTES {
        tracing::warn!(
            "Skipping oversized file: {} ({:.1}MB exceeds {}MB limit)",
            path.display(),
            size as f64 / (1024.0 * 1024.0),
            MAX_ANALYSIS_FILE_BYTES / (1024 * 1024),
        );
        return None;
    }

    // Resolving the path collapses `..` components and intermediate links, so
    // the prefix test below compares real locations.
    let resolved = match path.canonicalize() {
        Ok(resolved) => resolved,
        Err(e) => {
            tracing::warn!("Cannot canonicalize {}: {}", path.display(), e);
            return None;
        }
    };

    if resolved.starts_with(repo_canonical) {
        Some(resolved)
    } else {
        tracing::warn!(
            "Skipping file outside repository boundary: {} (resolves to {})",
            path.display(),
            resolved.display(),
        );
        None
    }
}

/// Walk `repo_path` and return the sorted, canonicalized list of files to analyze.
///
/// This is the quick collection path: a plain directory walk with no
/// incremental checking. The walker is configured to skip hidden entries and
/// to honor `.gitignore` and git exclude files, but not the global gitignore.
///
/// A walked entry is kept only if it:
/// - is a file with an extension listed in `SUPPORTED_EXTENSIONS`,
/// - does not match any of `exclude`'s effective patterns (matched against
///   the path relative to `repo_path`), and
/// - passes `validate_file`.
///
/// Entries the walker reports as errors are dropped without being logged.
///
/// # Errors
///
/// Returns an error if `repo_path` cannot be canonicalized.
pub fn collect_file_list(repo_path: &Path, exclude: &ExcludeConfig) -> Result<Vec<PathBuf>> {
    let repo_canonical = repo_path.canonicalize().with_context(|| {
        format!(
            "Cannot canonicalize repository path: {}",
            repo_path.display()
        )
    })?;
    let patterns = exclude.effective_patterns();

    let mut builder = WalkBuilder::new(repo_path);
    builder
        .hidden(true)
        .git_ignore(true)
        .git_global(false)
        .git_exclude(true);

    let mut files: Vec<PathBuf> = builder
        .build()
        .filter_map(Result::ok)
        .filter_map(|entry| -> Option<PathBuf> {
            let candidate = entry.path();
            if !candidate.is_file() {
                return None;
            }

            // Paths without a UTF-8 extension can never be in the supported set.
            let ext = candidate.extension().and_then(|e| e.to_str())?;
            if !SUPPORTED_EXTENSIONS.contains(&ext) {
                return None;
            }

            // Exclusion patterns apply to the repo-relative path; a path that
            // cannot be made relative is treated as not excluded.
            let is_excluded = match candidate.strip_prefix(repo_path) {
                Ok(rel) => {
                    let rel_str = rel.to_string_lossy();
                    patterns.iter().any(|p| glob_match(p, &rel_str))
                }
                Err(_) => false,
            };
            if is_excluded {
                return None;
            }

            validate_file(candidate, &repo_canonical)
        })
        .collect();

    // The walker does not promise a stable order between runs, so sort to keep
    // the result deterministic.
    files.sort();

    Ok(files)
}
#[cfg(test)]
mod tests {
    //! Unit tests for `validate_file`: one test per rejection rule, plus the
    //! accepting cases (ordinary file, file exactly at the size limit).

    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// The size limit as a `usize`, for building in-memory fixtures.
    ///
    /// Derived from `MAX_ANALYSIS_FILE_BYTES` so the size tests follow the
    /// constant instead of repeating its value.
    fn size_limit() -> usize {
        usize::try_from(MAX_ANALYSIS_FILE_BYTES).expect("size limit fits in usize")
    }

    /// A small regular file inside the repository root is accepted.
    #[test]
    fn test_validate_file_accepts_normal_file() {
        let dir = TempDir::new().expect("create temp dir");
        let file = dir.path().join("test.py");
        fs::write(&file, "print('hello')").expect("write test file");
        let repo_canonical = dir.path().canonicalize().expect("canonicalize path");
        assert!(validate_file(&file, &repo_canonical).is_some());
    }

    /// A path that does not exist cannot be stat-ed and is rejected.
    #[test]
    fn test_validate_file_rejects_nonexistent() {
        let dir = TempDir::new().expect("create temp dir");
        let repo_canonical = dir.path().canonicalize().expect("canonicalize path");
        let fake = dir.path().join("nope.py");
        assert!(validate_file(&fake, &repo_canonical).is_none());
    }

    /// The limit is inclusive: a file of exactly the maximum size is accepted.
    #[test]
    fn test_validate_file_accepts_file_at_size_limit() {
        let dir = TempDir::new().expect("create temp dir");
        let file = dir.path().join("limit.py");
        let data = vec![b'x'; size_limit()];
        fs::write(&file, &data).expect("write limit-sized file");
        let repo_canonical = dir.path().canonicalize().expect("canonicalize path");
        assert!(validate_file(&file, &repo_canonical).is_some());
    }

    /// One byte over the limit is rejected.
    #[test]
    fn test_validate_file_rejects_oversized() {
        let dir = TempDir::new().expect("create temp dir");
        let file = dir.path().join("big.py");
        let data = vec![b'x'; size_limit() + 1];
        fs::write(&file, &data).expect("write oversized file");
        let repo_canonical = dir.path().canonicalize().expect("canonicalize path");
        assert!(validate_file(&file, &repo_canonical).is_none());
    }

    /// A symlink is rejected even when its target is a valid in-repo file.
    #[test]
    #[cfg(unix)] // symlink creation here is Unix-only; Windows symlinks need admin
    fn test_validate_file_rejects_symlink() {
        let dir = TempDir::new().expect("create temp dir");
        let real = dir.path().join("real.py");
        fs::write(&real, "x = 1").expect("write real file");
        let link = dir.path().join("link.py");

        std::os::unix::fs::symlink(&real, &link).expect("create symlink");
        let repo_canonical = dir.path().canonicalize().expect("canonicalize path");
        assert!(validate_file(&link, &repo_canonical).is_none());
    }

    /// A `..` traversal that resolves outside the repository root is rejected.
    #[test]
    fn test_validate_file_rejects_outside_boundary() {
        let parent = TempDir::new().expect("create temp dir");
        let repo = parent.path().join("repo");
        fs::create_dir(&repo).expect("create repo dir");
        let outside = parent.path().join("secret.py");
        fs::write(&outside, "password = 'hunter2'").expect("write outside file");

        let repo_canonical = repo.canonicalize().expect("canonicalize path");
        let traversal_path = repo.join("..").join("secret.py");
        assert!(validate_file(&traversal_path, &repo_canonical).is_none());
    }
}