ripvec-core 0.13.26

Semantic code search engine — GPU-accelerated ModernBERT embeddings, tree-sitter chunking, hybrid BM25+vector ranking
Documentation
//! Parallel directory traversal using the `ignore` crate.
//!
//! Respects `.gitignore` rules, skips hidden files, and applies optional
//! ripgrep type, extension, and gitignore-style filters. Uses
//! `build_parallel()` for multi-threaded file discovery.

use ignore::{WalkBuilder, gitignore::Gitignore};
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex};

/// Filters that control which files a directory walk yields.
///
/// The same set of options is shared by full indexing and incremental cache
/// diffing. The [`Default`] value applies no optional filters: no file type,
/// no excluded extensions, and no extra ignore patterns.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct WalkOptions {
    /// Name of a ripgrep file type to restrict the walk to
    /// (e.g. "rust", "python", "js"); `None` disables type filtering.
    pub file_type: Option<String>,
    /// Extensions of files to leave out. Entries may be written with or
    /// without a leading dot.
    pub exclude_extensions: Vec<String>,
    /// Extra `.gitignore`-style patterns, matched relative to the walk root.
    pub ignore_patterns: Vec<String>,
}

impl WalkOptions {
    /// Build options that carry only an optional file-type filter.
    ///
    /// `file_type` is copied into an owned `String`; every other field keeps
    /// its [`Default`] value.
    #[must_use]
    pub fn from_file_type(file_type: Option<&str>) -> Self {
        let file_type = file_type.map(str::to_owned);
        Self {
            file_type,
            ..Self::default()
        }
    }
}

/// Collect the files beneath `root` using a parallel directory walk.
///
/// This is a convenience wrapper around [`collect_files_with_options`] that
/// sets only the optional file-type filter; no extension exclusions or extra
/// ignore patterns are applied.
///
/// `.gitignore` rules are honored and hidden files and directories are
/// skipped. Files with unknown extensions are still collected — the chunking
/// phase decides whether to use tree-sitter (known extensions) or the
/// sliding-window fallback (unknown).
///
/// When `file_type` is `Some`, it names a type from ripgrep's built-in type
/// database (e.g. "rust", "python", "js") and only files of that type are
/// collected.
///
/// The returned paths are sorted.
#[must_use]
pub fn collect_files(root: &Path, file_type: Option<&str>) -> Vec<PathBuf> {
    let options = WalkOptions::from_file_type(file_type);
    collect_files_with_options(root, &options)
}

/// Walk a directory tree in parallel and collect file paths with explicit
/// include/exclude filters.
#[must_use]
pub fn collect_files_with_options(root: &Path, options: &WalkOptions) -> Vec<PathBuf> {
    let files = Arc::new(Mutex::new(Vec::new()));
    let excluded_extensions = Arc::new(normalized_extensions(&options.exclude_extensions));
    let ignore_matcher = build_ignore_matcher(root, &options.ignore_patterns).map(Arc::new);

    let mut builder = WalkBuilder::new(root);
    builder.hidden(true).git_ignore(true).git_global(true);

    if let Some(ft) = options.file_type.as_deref() {
        let mut types_builder = ignore::types::TypesBuilder::new();
        types_builder.add_defaults();
        types_builder.select(ft);
        if let Ok(types) = types_builder.build() {
            builder.types(types);
        }
    }

    builder.build_parallel().run(|| {
        let files = Arc::clone(&files);
        let excluded_extensions = Arc::clone(&excluded_extensions);
        let ignore_matcher = ignore_matcher.clone();
        Box::new(move |entry| {
            let Ok(entry) = entry else {
                return ignore::WalkState::Continue;
            };
            let Some(file_type) = entry.file_type() else {
                return ignore::WalkState::Continue;
            };
            let is_dir = file_type.is_dir();
            if ignore_matcher
                .as_ref()
                .is_some_and(|matcher| is_ignored(matcher, entry.path(), is_dir))
            {
                return if is_dir {
                    ignore::WalkState::Skip
                } else {
                    ignore::WalkState::Continue
                };
            }
            if !file_type.is_file() {
                return ignore::WalkState::Continue;
            }
            if has_excluded_extension(entry.path(), &excluded_extensions) {
                return ignore::WalkState::Continue;
            }
            // Skip known generated/binary files that add noise to the index
            if let Some(name) = entry.path().file_name().and_then(|n| n.to_str())
                && matches!(
                    name,
                    "Cargo.lock"
                        | "package-lock.json"
                        | "yarn.lock"
                        | "pnpm-lock.yaml"
                        | "poetry.lock"
                        | "Gemfile.lock"
                        | "go.sum"
                )
            {
                return ignore::WalkState::Continue;
            }
            if let Ok(mut files) = files.lock() {
                files.push(entry.into_path());
            }
            ignore::WalkState::Continue
        })
    });

    let mut files = Arc::try_unwrap(files)
        .ok()
        .and_then(|files| files.into_inner().ok())
        .unwrap_or_default();
    files.sort();
    files
}

/// Canonicalize user-supplied extensions into a lookup set.
///
/// Each entry has surrounding whitespace trimmed, all leading dots removed,
/// and is lowercased (ASCII only). Entries that end up empty are dropped, and
/// duplicates collapse because the result is a set.
fn normalized_extensions(extensions: &[String]) -> HashSet<String> {
    let mut normalized = HashSet::new();
    for raw in extensions {
        let bare = raw.trim().trim_start_matches('.');
        if bare.is_empty() {
            continue;
        }
        normalized.insert(bare.to_ascii_lowercase());
    }
    normalized
}

/// Report whether `path`'s extension is in `excluded_extensions`.
///
/// The extension is lowercased (ASCII only) before the lookup, so the set is
/// expected to hold lowercase entries without a leading dot. Paths with no
/// extension, or with an extension that is not valid UTF-8, are never
/// excluded.
fn has_excluded_extension(path: &Path, excluded_extensions: &HashSet<String>) -> bool {
    let Some(extension) = path.extension().and_then(|raw| raw.to_str()) else {
        return false;
    };
    excluded_extensions.contains(&extension.to_ascii_lowercase())
}

/// Compile the extra ignore patterns into a gitignore matcher rooted at `root`.
///
/// Patterns that fail to parse are logged as warnings and skipped; the
/// remaining patterns are still used. Returns `None` when `patterns` is
/// empty, when the matcher cannot be built, or when the built matcher reports
/// itself as empty.
fn build_ignore_matcher(root: &Path, patterns: &[String]) -> Option<Gitignore> {
    if patterns.is_empty() {
        return None;
    }

    let mut builder = ignore::gitignore::GitignoreBuilder::new(root);
    for pattern in patterns {
        if let Err(error) = builder.add_line(None, pattern) {
            tracing::warn!(pattern, %error, "invalid ripvec ignore pattern; skipping");
        }
    }

    let matcher = builder.build().ok()?;
    if matcher.is_empty() {
        None
    } else {
        Some(matcher)
    }
}

/// Report whether `matcher` ignores `path`, checking the path itself and then
/// its parent directories.
///
/// `is_dir` tells the matcher whether `path` refers to a directory.
fn is_ignored(matcher: &Gitignore, path: &Path, is_dir: bool) -> bool {
    let verdict = matcher.matched_path_or_any_parents(path, is_dir);
    verdict.is_ignore()
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Create `relative` beneath `root` with placeholder content, creating any
    /// missing parent directories first.
    fn write_file(root: &Path, relative: &str) {
        let target = root.join(relative);
        if let Some(parent_dir) = target.parent() {
            std::fs::create_dir_all(parent_dir).expect("create parent");
        }
        std::fs::write(target, "test").expect("write file");
    }

    /// Run the walker and return the results as root-relative strings that
    /// use `/` as the separator.
    fn collect_relative(root: &Path, options: &WalkOptions) -> Vec<String> {
        let mut relative_paths = Vec::new();
        for found in collect_files_with_options(root, options) {
            let relative = found.strip_prefix(root).expect("under root");
            relative_paths.push(relative.to_string_lossy().replace('\\', "/"));
        }
        relative_paths
    }

    /// Excluded extensions match regardless of case and of a leading dot.
    #[test]
    fn excludes_extensions_case_insensitively() {
        let dir = TempDir::new().expect("tempdir");
        for relative in ["src/main.rs", "logs/events.JSONL", "README.md"] {
            write_file(dir.path(), relative);
        }

        let options = WalkOptions {
            exclude_extensions: vec!["jsonl".to_string(), ".md".to_string()],
            ..WalkOptions::default()
        };
        let files = collect_relative(dir.path(), &options);

        assert_eq!(files, ["src/main.rs"]);
    }

    /// Directory patterns, globs, and negated patterns all take effect.
    #[test]
    fn excludes_gitignore_style_patterns() {
        let dir = TempDir::new().expect("tempdir");
        for relative in [
            "src/main.rs",
            "generated/schema.rs",
            "notes/keep.md",
            "notes/drop.md",
        ] {
            write_file(dir.path(), relative);
        }

        let options = WalkOptions {
            ignore_patterns: vec![
                "generated/".to_string(),
                "*.md".to_string(),
                "!notes/keep.md".to_string(),
            ],
            ..WalkOptions::default()
        };
        let files = collect_relative(dir.path(), &options);

        assert_eq!(files, ["notes/keep.md", "src/main.rs"]);
    }

    /// A relative walk root combined with ignore patterns must not panic.
    #[test]
    fn relative_roots_with_ignore_patterns_do_not_panic() {
        let dir = tempfile::Builder::new()
            .prefix("ripvec-walk-test-")
            .tempdir_in(".")
            .expect("tempdir in current directory");
        // Use only the final path component so the root is relative to the
        // current directory.
        let dir_name = dir.path().file_name().expect("tempdir file name");
        let root = PathBuf::from(dir_name);
        write_file(&root, "src/main.rs");
        write_file(&root, "notes/drop.md");

        let options = WalkOptions {
            ignore_patterns: vec!["*.md".to_string()],
            ..WalkOptions::default()
        };
        let files = collect_relative(&root, &options);

        assert_eq!(files, ["src/main.rs"]);
    }
}