leindex 1.6.0

LeIndex MCP and semantic code search engine for AI tools and large codebases
use crate::phase::options::{DocsMode, PhaseOptions};
use anyhow::{Context, Result};
use std::path::{Path, PathBuf};

/// Files selected for a phase run.
#[derive(Debug, Clone, Default)]
pub struct CollectedFiles {
    /// Source-code files supported by leparse.
    pub code_files: Vec<PathBuf>,
    /// Optional docs files (markdown/text) when explicitly enabled.
    pub docs_files: Vec<PathBuf>,
}

/// Collect source files with optional docs based on options.
pub fn collect_files(root: &Path, options: &PhaseOptions) -> Result<CollectedFiles> {
    let mut collected = CollectedFiles::default();

    // All file extensions supported by the parse module.
    // Must stay in sync with crate::parse::grammar::LanguageId::from_extension.
    let code_exts = [
        "rs", "py", "js", "jsx", "mjs", "cjs", "ts", "tsx", "mts", "cts", // Main languages
        "go", "java", "cpp", "cc", "cxx", "c", "h", "hpp", // Systems languages
        "cs",  // C#
        "rb", "php", "lua", "scala", "sc", // Scripting languages
        "sh", "bash", // Shell
        "json", // Data
    ];

    // Optional focused-file mode (used by MCP when path points to a single file).
    if !options.focus_files.is_empty() {
        for raw_path in &options.focus_files {
            let candidate = if raw_path.is_absolute() {
                raw_path.clone()
            } else {
                root.join(raw_path)
            };

            if !candidate.is_file() {
                continue;
            }

            if let Some(ext) = candidate.extension().and_then(|e| e.to_str()) {
                let ext = ext.to_ascii_lowercase();
                if code_exts.contains(&ext.as_str()) {
                    collected.code_files.push(candidate.clone());
                } else if options.include_docs && include_docs_extension(&ext, options.docs_mode) {
                    collected.docs_files.push(candidate.clone());
                }
            }

            if collected.code_files.len() >= options.max_files {
                break;
            }
        }

        collected.code_files.sort();
        collected.code_files.dedup();
        collected.docs_files.sort();
        collected.docs_files.dedup();

        if !collected.code_files.is_empty() || !collected.docs_files.is_empty() {
            return Ok(collected);
        }
    }

    let mut walker = walkdir::WalkDir::new(root).into_iter();
    while let Some(entry) = walker.next() {
        let entry = match entry {
            Ok(e) => e,
            Err(_) => continue,
        };

        let path = entry.path();
        let file_name = entry.file_name().to_string_lossy();

        if entry.file_type().is_dir() {
            if should_skip_dir(&file_name) {
                walker.skip_current_dir();
                continue;
            }
            continue;
        }

        if !entry.file_type().is_file() {
            continue;
        }

        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            let ext = ext.to_ascii_lowercase();
            if code_exts.contains(&ext.as_str()) {
                collected.code_files.push(path.to_path_buf());
                if collected.code_files.len() >= options.max_files {
                    break;
                }
                continue;
            }

            if options.include_docs && include_docs_extension(&ext, options.docs_mode) {
                collected.docs_files.push(path.to_path_buf());
            }
        }

        if collected.code_files.len() >= options.max_files {
            break;
        }
    }

    Ok(collected)
}

/// Build (path, hash) inventory for freshness checks.
pub fn hash_inventory(paths: &[PathBuf]) -> Result<Vec<(PathBuf, String)>> {
    let mut out = Vec::with_capacity(paths.len());
    for path in paths {
        let bytes = std::fs::read(path)
            .with_context(|| format!("failed reading file for hashing: {}", path.display()))?;
        let hash = blake3::hash(&bytes).to_hex().to_string();
        out.push((path.clone(), hash));
    }
    out.sort_by(|a, b| a.0.cmp(&b.0));
    Ok(out)
}

/// Render a path relative to root when possible.
pub fn display_path(root: &Path, path: &Path) -> String {
    path.strip_prefix(root)
        .map(|p| p.display().to_string())
        .unwrap_or_else(|_| path.display().to_string())
}

fn should_skip_dir(file_name: &str) -> bool {
    // Must stay in sync with lepasserelle::leindex::ALWAYS_SKIP_DIRS
    // and config::ExclusionConfig defaults.
    matches!(
        file_name,
        // Version control
        ".git" | ".hg" | ".svn"
        // Build outputs
        | "target" | "build" | "dist" | "out" | ".next" | "coverage"
        // Package managers / dependencies
        | "node_modules" | "vendor" | "bower_components"
        // Python virtual environments & caches
        | ".venv" | "venv" | "env" | "__pycache__" | ".tox" | ".mypy_cache"
        | ".pytest_cache" | ".ruff_cache"
        // IDE / editor metadata
        | ".idea" | ".vscode"
        // Misc generated
        | ".leindex"
    )
}

fn include_docs_extension(ext: &str, mode: DocsMode) -> bool {
    (mode.include_markdown() && matches!(ext, "md" | "markdown"))
        || (mode.include_text() && matches!(ext, "txt" | "text"))
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    #[test]
    fn collect_files_respects_docs_gating_and_skips_dirs() {
        let dir = tempdir().expect("tempdir");
        std::fs::create_dir_all(dir.path().join("src")).expect("mkdir src");
        std::fs::create_dir_all(dir.path().join("target")).expect("mkdir target");

        std::fs::write(dir.path().join("src/lib.rs"), "pub fn f(){}\n").expect("write code");
        std::fs::write(dir.path().join("README.md"), "# Doc\n").expect("write readme");
        std::fs::write(dir.path().join("notes.txt"), "hello\n").expect("write text");
        std::fs::write(dir.path().join("target/ignored.rs"), "pub fn g(){}\n")
            .expect("write ignored");

        let no_docs = collect_files(
            dir.path(),
            &PhaseOptions {
                root: dir.path().to_path_buf(),
                include_docs: false,
                docs_mode: DocsMode::All,
                ..PhaseOptions::default()
            },
        )
        .expect("collect no docs");

        assert_eq!(no_docs.code_files.len(), 1);
        assert!(no_docs.docs_files.is_empty());

        let with_docs = collect_files(
            dir.path(),
            &PhaseOptions {
                root: dir.path().to_path_buf(),
                include_docs: true,
                docs_mode: DocsMode::All,
                ..PhaseOptions::default()
            },
        )
        .expect("collect docs");

        assert_eq!(with_docs.code_files.len(), 1);
        assert_eq!(with_docs.docs_files.len(), 2);
    }

    #[test]
    fn hash_inventory_and_display_path_work() {
        let dir = tempdir().expect("tempdir");
        let file = dir.path().join("src/lib.rs");
        std::fs::create_dir_all(file.parent().expect("parent")).expect("mkdir");
        std::fs::write(&file, "pub fn f(){}\n").expect("write");

        let inventory = hash_inventory(&[file.clone()]).expect("inventory");
        assert_eq!(inventory.len(), 1);
        assert_eq!(inventory[0].0, file);
        assert!(!inventory[0].1.is_empty());

        let rendered = display_path(dir.path(), &inventory[0].0);
        assert_eq!(rendered, "src/lib.rs");
    }

    #[test]
    fn collect_files_respects_max_files_limit() {
        let dir = tempdir().expect("tempdir");
        std::fs::create_dir_all(dir.path().join("src")).expect("mkdir src");
        std::fs::write(dir.path().join("src/a.rs"), "pub fn a(){}\n").expect("write a");
        std::fs::write(dir.path().join("src/b.rs"), "pub fn b(){}\n").expect("write b");

        let collected = collect_files(
            dir.path(),
            &PhaseOptions {
                root: dir.path().to_path_buf(),
                max_files: 1,
                ..PhaseOptions::default()
            },
        )
        .expect("collect limited");

        assert_eq!(collected.code_files.len(), 1);
    }

    #[test]
    fn docs_mode_filters_markdown_vs_text() {
        let dir = tempdir().expect("tempdir");
        std::fs::create_dir_all(dir.path().join("src")).expect("mkdir src");
        std::fs::write(dir.path().join("src/lib.rs"), "pub fn x(){}\n").expect("write code");
        std::fs::write(dir.path().join("README.md"), "# Doc\n").expect("write md");
        std::fs::write(dir.path().join("notes.txt"), "text\n").expect("write txt");

        let markdown_only = collect_files(
            dir.path(),
            &PhaseOptions {
                root: dir.path().to_path_buf(),
                include_docs: true,
                docs_mode: DocsMode::Markdown,
                ..PhaseOptions::default()
            },
        )
        .expect("collect markdown");
        assert_eq!(markdown_only.docs_files.len(), 1);

        let text_only = collect_files(
            dir.path(),
            &PhaseOptions {
                root: dir.path().to_path_buf(),
                include_docs: true,
                docs_mode: DocsMode::Text,
                ..PhaseOptions::default()
            },
        )
        .expect("collect text");
        assert_eq!(text_only.docs_files.len(), 1);
    }

    #[test]
    fn collect_files_uses_focus_files_when_provided() {
        let dir = tempdir().expect("tempdir");
        std::fs::create_dir_all(dir.path().join("src")).expect("mkdir src");
        let a = dir.path().join("src/a.rs");
        let b = dir.path().join("src/b.rs");
        std::fs::write(&a, "pub fn a(){}\n").expect("write a");
        std::fs::write(&b, "pub fn b(){}\n").expect("write b");

        let collected = collect_files(
            dir.path(),
            &PhaseOptions {
                root: dir.path().to_path_buf(),
                focus_files: vec![b.clone()],
                max_files: 1,
                ..PhaseOptions::default()
            },
        )
        .expect("collect focused");

        assert_eq!(collected.code_files, vec![b]);
    }

    #[test]
    fn should_skip_dir_covers_all_build_output_dirs() {
        // Version control
        assert!(should_skip_dir(".git"));
        assert!(should_skip_dir(".hg"));
        assert!(should_skip_dir(".svn"));
        // Build outputs
        assert!(should_skip_dir("target"));
        assert!(should_skip_dir("build"));
        assert!(should_skip_dir("dist"));
        assert!(should_skip_dir("out"));
        assert!(should_skip_dir(".next"));
        assert!(should_skip_dir("coverage"));
        // Package managers / dependencies
        assert!(should_skip_dir("node_modules"));
        assert!(should_skip_dir("vendor"));
        assert!(should_skip_dir("bower_components"));
        // Python virtual environments & caches
        assert!(should_skip_dir(".venv"));
        assert!(should_skip_dir("venv"));
        assert!(should_skip_dir("env"));
        assert!(should_skip_dir("__pycache__"));
        assert!(should_skip_dir(".tox"));
        assert!(should_skip_dir(".mypy_cache"));
        assert!(should_skip_dir(".pytest_cache"));
        assert!(should_skip_dir(".ruff_cache"));
        // IDE / editor metadata
        assert!(should_skip_dir(".idea"));
        assert!(should_skip_dir(".vscode"));
        // Misc generated
        assert!(should_skip_dir(".leindex"));
        // Ensure normal dirs are NOT skipped
        assert!(!should_skip_dir("src"));
        assert!(!should_skip_dir("lib"));
        assert!(!should_skip_dir("tests"));
        assert!(!should_skip_dir("docs"));
    }

    #[test]
    fn collect_files_skips_all_build_output_dirs() {
        let dir = tempdir().expect("tempdir");
        std::fs::create_dir_all(dir.path().join("src")).expect("mkdir src");
        std::fs::write(dir.path().join("src/lib.rs"), "pub fn f(){}\n").expect("write code");

        // Create files inside build output directories - they should be skipped
        for build_dir in &["target", "build", "dist", "out", ".next", "coverage"] {
            std::fs::create_dir_all(dir.path().join(build_dir)).expect("mkdir build_dir");
            std::fs::write(
                dir.path().join(format!("{}/generated.rs", build_dir)),
                "pub fn g(){}\n",
            )
            .expect("write generated");
        }

        let collected = collect_files(
            dir.path(),
            &PhaseOptions {
                root: dir.path().to_path_buf(),
                ..PhaseOptions::default()
            },
        )
        .expect("collect");

        assert_eq!(
            collected.code_files.len(),
            1,
            "only src/lib.rs should be collected"
        );
        assert!(
            collected.code_files[0].ends_with("src/lib.rs"),
            "collected file should be src/lib.rs, got: {:?}",
            collected.code_files[0]
        );
    }
}