// nornir 0.4.17
//
// Companion to cargo: dependency tracking, release gating, deploy, benchmarks, and documentation assembly. Project-agnostic.
// Documentation
//! Workspace walker that classifies files into [`super::Corpus`] buckets.
//!
//! Excludes hard-coded build/VCS/tooling dirs (`target/`, `.git/`,
//! `node_modules/`, Python virtualenvs & caches, editor dirs) plus nornir's
//! own machine state (`.nornir/warehouse/`, `.nornir/cache/`) — but keeps
//! `.nornir/*.md` doc templates, which are indexed as source. When a repo
//! scope is supplied, any top-level directory that isn't a configured
//! `[repo.*]` is pruned. Classification is by filename + extension.

use std::path::{Path, PathBuf};

use walkdir::{DirEntry, WalkDir};
use super::Corpus;

/// A file selected for indexing by [`walk_workspace`], together with the
/// bucket it was classified into.
pub struct Candidate {
    /// Path of the file exactly as yielded by the directory walk.
    pub path: PathBuf,
    /// Bucket assigned to the file from its name and extension.
    pub corpus: Corpus,
    /// Top-level workspace dir the file belongs to (e.g. `holger`, `znippy`).
    /// Empty when the file sits at the workspace root.
    pub repo: String,
}

/// Names that are skipped wherever they show up in the tree.
///
/// Everything listed here is generated or tool-owned rather than
/// source-of-truth: build output, VCS metadata, dependency caches, Python
/// virtualenvs/caches, and editor directories. Left in, they would swamp the
/// index (a `.venv`'s `site-packages` alone can dwarf the real workspace).
///
/// `.nornir/` is deliberately absent: its `*.md` files are editable doc
/// templates and are indexed as source. Only the machine state beneath it —
/// `.nornir/warehouse/` and `.nornir/cache/` — is dropped, and that is handled
/// separately in [`is_excluded`] so the iceberg warehouse's avro/json/parquet
/// and transient caches stay out of the index.
const HARD_EXCLUDES: &[&str] = &[
    // Build output, VCS metadata, JS dependencies.
    "target",
    ".git",
    "node_modules",
    // Python virtualenvs and bytecode.
    ".venv",
    "venv",
    "site-packages",
    "__pycache__",
    // Python tool caches.
    ".mypy_cache",
    ".pytest_cache",
    ".ruff_cache",
    ".tox",
    // Editor dirs, other VCS metadata, Gradle state.
    ".idea",
    ".vscode",
    ".svn",
    ".hg",
    ".gradle",
];

/// Collect every indexable file beneath `root` along with its classification.
///
/// Symlinks are not followed, and entries rejected by `is_excluded` are pruned
/// before the walk descends into them. Entries the walker fails to read are
/// skipped silently, as are files that `classify` does not recognise.
///
/// `repo_scope` restricts which top-level directories are entered: when it is
/// non-empty, only directories named in it are descended into (files directly
/// in `root` are still considered). An empty scope walks every top-level
/// directory — the legacy behaviour for callers without `[repo.*]` config.
pub fn walk_workspace(root: &Path, repo_scope: &[String]) -> Vec<Candidate> {
    WalkDir::new(root)
        .follow_links(false)
        .into_iter()
        .filter_entry(|entry| !is_excluded(entry, repo_scope))
        // Unreadable entries are dropped rather than aborting the walk.
        .filter_map(Result::ok)
        .filter(|entry| entry.file_type().is_file())
        .filter_map(|entry| {
            let file = entry.path();
            let corpus = classify(file)?;
            Some(Candidate {
                path: file.to_path_buf(),
                corpus,
                repo: first_segment_under(root, file).unwrap_or_default(),
            })
        })
        .collect()
}

/// Decide whether the walker should drop `entry` (and, for a directory,
/// everything beneath it).
///
/// Returns `true` when any of the following holds:
/// - the entry's name is listed in [`HARD_EXCLUDES`];
/// - the entry is a directory named `warehouse` or `cache` whose immediate
///   parent is `.nornir`;
/// - `repo_scope` is non-empty and the entry is a depth-1 directory whose name
///   is not in the scope.
///
/// The walk root itself (depth 0) is never excluded: the caller asked for that
/// directory explicitly, so a workspace that happens to live in a folder called
/// `target`, `venv`, … must still be walked. Entries whose name is not valid
/// UTF-8 are never excluded either, since they cannot match any rule.
fn is_excluded(entry: &DirEntry, repo_scope: &[String]) -> bool {
    // `filter_entry` also calls the predicate for the root entry; pruning it
    // would silently empty the whole walk.
    if entry.depth() == 0 {
        return false;
    }
    let Some(name) = entry.file_name().to_str() else {
        return false;
    };

    if HARD_EXCLUDES.iter().any(|h| *h == name) {
        return true;
    }

    let is_dir = entry.file_type().is_dir();

    // Keep `.nornir/*.md` (doc templates, indexed as source) but prune
    // nornir's machine state: `.nornir/warehouse/` (the iceberg warehouse —
    // avro/json/parquet) and `.nornir/cache/` (transient). Match by the
    // immediate parent so unrelated dirs named `warehouse`/`cache`
    // elsewhere are unaffected.
    if is_dir && matches!(name, "warehouse" | "cache") {
        let parent_name = entry
            .path()
            .parent()
            .and_then(|p| p.file_name())
            .and_then(|s| s.to_str());
        if parent_name == Some(".nornir") {
            return true;
        }
    }

    // Scope to configured repos: at depth 1, prune any directory that isn't a
    // configured repo. Root-level files (also depth 1, but not dirs) and
    // everything deeper inside a kept repo are unaffected.
    if !repo_scope.is_empty()
        && entry.depth() == 1
        && is_dir
        && !repo_scope.iter().any(|r| r == name)
    {
        return true;
    }

    false
}

/// Name of the top-level directory under `root` that contains `p`.
///
/// Returns `None` when `p` is not located under `root`, when `p` is `root`
/// itself, or when `p` sits directly in `root` (it then belongs to no
/// top-level directory). Non-UTF-8 names are converted lossily.
fn first_segment_under(root: &Path, p: &Path) -> Option<String> {
    let rel = p.strip_prefix(root).ok()?;
    let mut parts = rel.components();
    let head = parts.next()?;
    // A single remaining component is the entry's own name, not a directory
    // it lives in — so something directly in `root` has no top-level segment.
    parts.next()?;
    Some(head.as_os_str().to_string_lossy().into_owned())
}

/// Pick the [`Corpus`] bucket for `p`, or `None` if the file is not indexed.
///
/// Matching is case-insensitive. A handful of well-known file names are
/// recognised first and win over their extension; every other file is
/// bucketed purely by extension. Paths without a file name, and files with no
/// or an unrecognised extension, yield `None`.
fn classify(p: &Path) -> Option<Corpus> {
    let file_name = p.file_name()?.to_string_lossy().to_lowercase();

    // Special names take precedence over the extension-based fallback.
    match file_name.as_str() {
        "bench_history.jsonl" => return Some(Corpus::BenchHistory),
        "history.md" => return Some(Corpus::Changelog),
        "cargo.toml" | "nornir.toml" | "workspace.md" => return Some(Corpus::Config),
        other if other.starts_with("changelog") => return Some(Corpus::Changelog),
        _ => {}
    }

    let extension = p
        .extension()
        .map(|raw| raw.to_string_lossy().to_lowercase());
    let bucket = match extension.as_deref()? {
        "md" | "markdown" | "txt" | "rst" | "adoc" => Corpus::Docs,
        "rs" | "py" | "java" | "kt" | "scala" | "go" | "ts" | "tsx" | "js" | "sh" | "rb"
        | "c" | "cc" | "cpp" | "h" | "hpp" => Corpus::Code,
        "toml" | "yaml" | "yml" | "json" => Corpus::Config,
        _ => return None,
    };
    Some(bucket)
}