standarbuild-detect 0.3.0

Detect project kind (Rust, Node, Bun, Deno, Python, Lua, C/C++) AND workspace kind (Cargo, Npm/Pnpm/Yarn/Bun, Deno, Go, Lerna, Nx, Turborepo, Mira) in polyglot monorepos
Documentation
//! Recursive workspace scan: starting from a root directory, walk down up to
//! `max_depth` levels, run every registered [`crate::Detector`], and build a
//! [`DetectionResult`] containing both the projects and the workspace manifests
//! found.
//!
//! After the walk, [`reconcile_members`] cross-references projects and
//! workspaces: each project gets a `member_of` list of workspace roots that
//! declare it as a member (multi-entry when the same root has overlapping
//! workspace manifests, e.g. Cargo + Bazel claiming the same crate).

use std::collections::HashMap;
use std::path::{Path, PathBuf};

use crate::detector::{DetectorHit, DetectorRegistry};
use crate::kind::KindId;
use crate::workspace::WorkspaceKindId;

/// Single project entry in a [`DetectionResult`].
///
/// One entry is recorded per project-bearing detector hit, so a directory
/// matched by several detectors produces several entries that share the same
/// paths but differ in `kind`.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct ProjectInfo {
    /// Project kind (Rust / Node / Bun / ...). `KindId::UNKNOWN` marks a
    /// depth-1 directory that matched no detector and was surfaced because
    /// `DiscoverOptions::include_unknown_at_depth_one` is set.
    pub kind: KindId,
    /// Human label, derived from manifest `name` or directory basename.
    /// When several projects end up with the same label, it is replaced by
    /// the relative path with `/` turned into `-` (the scan root keeps its
    /// label).
    pub label: String,
    /// POSIX-style path relative to the scan root (`./packages/web`,
    /// `.` when the scan root itself is the project).
    pub rel_path: String,
    /// On-disk path of the project root as seen during the walk. It is
    /// absolute when the scan root handed to the scan is absolute; the walk
    /// itself does not canonicalize paths.
    #[cfg_attr(feature = "serde", serde(serialize_with = "crate::path_norm::serialize_path"))]
    pub absolute_path: PathBuf,
    /// Files / patterns that triggered detection. Empty for
    /// `KindId::UNKNOWN` entries.
    pub signals: Vec<String>,
    /// Roots of the workspaces that declare this project as a member,
    /// filled in after the walk by comparing `absolute_path` with each
    /// workspace's member paths for equality. Empty when the project is
    /// orphan (not part of any detected workspace). Multi-entry when
    /// overlapping workspaces claim the same project (rare; e.g. Cargo +
    /// Bazel).
    #[cfg_attr(feature = "serde", serde(serialize_with = "serialize_path_vec"))]
    pub member_of: Vec<PathBuf>,
}

/// Single workspace entry in a [`DetectionResult`].
///
/// One entry is recorded per workspace-bearing detector hit, so a directory
/// with several workspace manifests produces several entries with the same
/// `root`.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct WorkspaceInfo {
    /// Workspace kind (Cargo / Npm / Pnpm / ...).
    pub kind: WorkspaceKindId,
    /// On-disk path of the workspace root (where the manifest lives): the
    /// directory in which the detector hit was produced. It is absolute when
    /// the scan root handed to the scan is absolute; the walk itself does
    /// not canonicalize paths.
    #[cfg_attr(feature = "serde", serde(serialize_with = "crate::path_norm::serialize_path"))]
    pub root: PathBuf,
    /// Paths to member project roots, copied unchanged from the detector
    /// hit (per the original contract: absolute, in the order the manifest
    /// declares them). Projects are linked to this workspace only when their
    /// path is equal to one of these entries.
    #[cfg_attr(feature = "serde", serde(serialize_with = "serialize_path_vec"))]
    pub members: Vec<PathBuf>,
    /// Files / patterns that triggered detection.
    pub signals: Vec<String>,
}

/// Aggregated result of [`discover`] / [`discover_with`].
#[derive(Debug, Clone, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct DetectionResult {
    /// All projects found, in walk order. The walk is depth-first and
    /// pre-order: hits for the scan root come first, then each
    /// subdirectory (taken in file-name order) followed by all of its
    /// descendants before the next sibling is visited.
    pub projects: Vec<ProjectInfo>,
    /// All workspace manifests found, in the same walk order.
    pub workspaces: Vec<WorkspaceInfo>,
}

impl DetectionResult {
    /// Returns `true` when the scan recorded no project and no workspace.
    pub fn is_empty(&self) -> bool {
        // Destructure so that a newly added field has to be considered here.
        let Self { projects, workspaces } = self;
        projects.is_empty() && workspaces.is_empty()
    }
}

/// Strategy for choosing a project's `label`.
#[derive(Debug, Clone, Copy)]
pub enum LabelStrategy {
    /// Use the directory basename verbatim.
    Basename,
    /// For Rust / Node / Bun projects, read `name` from `Cargo.toml`
    /// (`[package]` table) or `package.json`. Falls back to the basename
    /// for other kinds, and also when the manifest cannot be read or
    /// parsed or carries no string `name`.
    PreferManifestName,
}

/// Tuning knobs for [`discover`] / [`discover_with`].
///
/// The [`Default`] value uses `max_depth: 4`, the built-in skip list,
/// dot-directory skipping, manifest-derived labels, and reports unknown
/// depth-1 directories.
#[derive(Debug, Clone)]
pub struct DiscoverOptions {
    /// Max recursion depth. `0` = only the scan root itself. Directories at
    /// exactly `max_depth` are still inspected; only their children are
    /// left out.
    pub max_depth: usize,
    /// Directory names to skip entirely (`target`, `node_modules`, ...).
    /// Matched by exact equality against the name of each subdirectory; the
    /// scan root itself is never filtered.
    pub skip_dirs: Vec<String>,
    /// Skip directories whose name starts with `.` (default: true).
    pub skip_dotdirs: bool,
    /// How to label projects.
    pub label_strategy: LabelStrategy,
    /// When `true`, also emit a `ProjectInfo { kind: UNKNOWN, ... }` for
    /// depth-1 directories that didn't match any detector. Useful when
    /// callers want to surface raw children even when they look empty.
    /// Skipped directories are never visited and therefore never reported
    /// this way (default: true).
    pub include_unknown_at_depth_one: bool,
}

impl Default for DiscoverOptions {
    fn default() -> Self {
        Self {
            max_depth: 4,
            skip_dirs: default_skip_dirs(),
            skip_dotdirs: true,
            label_strategy: LabelStrategy::PreferManifestName,
            include_unknown_at_depth_one: true,
        }
    }
}

/// Returns the built-in list of directory names that the walk never enters:
/// common dependency, build-output and virtual-environment folders.
fn default_skip_dirs() -> Vec<String> {
    const SKIPPED: &[&str] = &[
        "node_modules",
        "target",
        "dist",
        "build",
        "out",
        "__pycache__",
        ".venv",
        "venv",
    ];
    SKIPPED.iter().map(|&name| String::from(name)).collect()
}

/// Scans `base_dir` with the detectors that ship with the crate.
///
/// Shorthand for [`discover_with`] called with the registry returned by
/// [`DetectorRegistry::with_builtins`].
pub fn discover(base_dir: &Path, opts: &DiscoverOptions) -> DetectionResult {
    let builtins = DetectorRegistry::with_builtins();
    discover_with(base_dir, opts, &builtins)
}

/// Runs the recursive scan using the detectors in `registry`.
///
/// The walk starts at `base_dir` (depth 0) and records every detector hit.
/// Once it has finished, clashing project labels are disambiguated and each
/// project's `member_of` is filled in from the workspaces that were found.
/// Detectors registered on top of the built-ins (custom kinds) take part in
/// exactly the same way.
pub fn discover_with(
    base_dir: &Path,
    opts: &DiscoverOptions,
    registry: &DetectorRegistry,
) -> DetectionResult {
    let mut found = DetectionResult::default();
    visit(base_dir, base_dir, 0, opts, registry, &mut found);

    // Post-processing passes that need the complete result.
    dedupe_labels(&mut found.projects);
    reconcile_members(&mut found);

    found
}

/// Inspects `current` and then recurses into its subdirectories.
///
/// Detector hits for `current` are always recorded first. Descent stops once
/// `depth` has reached `opts.max_depth`, or when the directory cannot be
/// listed. Children are visited in file-name order; entries that are not
/// directories and names rejected by [`should_skip`] are ignored.
fn visit(
    base_dir: &Path,
    current: &Path,
    depth: usize,
    opts: &DiscoverOptions,
    registry: &DetectorRegistry,
    result: &mut DetectionResult,
) {
    let found = registry.detect(current);
    record_hits(base_dir, current, depth, opts, &found, result);

    if depth >= opts.max_depth {
        return;
    }
    // An unreadable directory simply contributes no children.
    let Ok(listing) = std::fs::read_dir(current) else {
        return;
    };

    // Entries that fail to read are dropped; the rest are ordered by name so
    // the walk order is deterministic.
    let mut children: Vec<_> = listing.flatten().collect();
    children.sort_by_cached_key(|child| child.file_name());

    // Lazily keep only the subdirectories worth descending into.
    let subdirs = children.into_iter().filter_map(|child| {
        let path = child.path();
        if !path.is_dir() {
            return None;
        }
        let name = child.file_name();
        if should_skip(&name.to_string_lossy(), opts) {
            None
        } else {
            Some(path)
        }
    });
    for subdir in subdirs {
        visit(base_dir, &subdir, depth + 1, opts, registry, result);
    }
}

fn record_hits(
    base_dir: &Path,
    current: &Path,
    depth: usize,
    opts: &DiscoverOptions,
    hits: &[DetectorHit],
    result: &mut DetectionResult,
) {
    if hits.is_empty() {
        // Optionally surface unknown depth-1 dirs as orphan projects.
        if depth == 1 && opts.include_unknown_at_depth_one {
            let (label, rel_path) = label_and_relpath(base_dir, current, &KindId::UNKNOWN, opts);
            result.projects.push(ProjectInfo {
                kind: KindId::UNKNOWN,
                label,
                rel_path,
                absolute_path: current.to_path_buf(),
                signals: Vec::new(),
                member_of: Vec::new(),
            });
        }
        return;
    }

    for hit in hits {
        match hit {
            DetectorHit::Project { kind, signals } => {
                let (label, rel_path) = label_and_relpath(base_dir, current, kind, opts);
                result.projects.push(ProjectInfo {
                    kind: kind.clone(),
                    label,
                    rel_path,
                    absolute_path: current.to_path_buf(),
                    signals: signals.clone(),
                    member_of: Vec::new(),
                });
            }
            DetectorHit::Workspace { kind, members, signals } => {
                result.workspaces.push(WorkspaceInfo {
                    kind: kind.clone(),
                    root: current.to_path_buf(),
                    members: members.clone(),
                    signals: signals.clone(),
                });
            }
            DetectorHit::Both {
                project_kind,
                workspace_kind,
                members,
                signals,
            } => {
                let (label, rel_path) =
                    label_and_relpath(base_dir, current, project_kind, opts);
                result.projects.push(ProjectInfo {
                    kind: project_kind.clone(),
                    label,
                    rel_path,
                    absolute_path: current.to_path_buf(),
                    signals: signals.clone(),
                    member_of: Vec::new(),
                });
                result.workspaces.push(WorkspaceInfo {
                    kind: workspace_kind.clone(),
                    root: current.to_path_buf(),
                    members: members.clone(),
                    signals: signals.clone(),
                });
            }
        }
    }
}

/// Fills in `member_of` for every project claimed by a detected workspace.
///
/// A project is matched when its `absolute_path` is equal to one of a
/// workspace's `members` paths. The resulting list holds one workspace root
/// per claim, in the order the workspaces were recorded; projects that no
/// workspace claims are left untouched.
fn reconcile_members(result: &mut DetectionResult) {
    // Index: member path -> roots of the workspaces that list it.
    let mut roots_by_member: HashMap<&Path, Vec<&Path>> = HashMap::new();
    for workspace in &result.workspaces {
        for member in &workspace.members {
            roots_by_member
                .entry(member.as_path())
                .or_default()
                .push(workspace.root.as_path());
        }
    }

    for project in &mut result.projects {
        let Some(roots) = roots_by_member.get(project.absolute_path.as_path()) else {
            continue;
        };
        project.member_of = roots.iter().map(|root| root.to_path_buf()).collect();
    }
}

/// Decides whether a subdirectory called `name` is excluded from the walk.
///
/// A name is excluded when dot-directory skipping is enabled and it starts
/// with `.`, or when it is equal to one of the entries in `opts.skip_dirs`.
fn should_skip(name: &str, opts: &DiscoverOptions) -> bool {
    let hidden = opts.skip_dotdirs && name.starts_with('.');
    hidden || opts.skip_dirs.iter().any(|skipped| skipped.as_str() == name)
}

/// Disambiguates projects that ended up with the same label.
///
/// Every project whose original label occurs more than once is relabelled
/// with its relative path, with the leading `./` removed and `/` replaced by
/// `-`. The scan root (`rel_path` of `.`) keeps its label.
fn dedupe_labels(projects: &mut [ProjectInfo]) {
    // Occurrence count per label, taken before any label is rewritten.
    let mut occurrences: HashMap<String, usize> = HashMap::new();
    for project in projects.iter() {
        *occurrences.entry(project.label.clone()).or_default() += 1;
    }

    let clashing = projects
        .iter_mut()
        .filter(|project| matches!(occurrences.get(&project.label), Some(&count) if count > 1));
    for project in clashing {
        let rel = project.rel_path.trim_start_matches("./");
        if rel.is_empty() || rel == "." {
            continue;
        }
        project.label = rel.replace('/', "-");
    }
}

/// Computes the display label and the POSIX-style relative path for `dir`.
///
/// * `base_dir` - scan root; `dir` is expected to be it or one of its
///   descendants.
/// * `dir` - directory in which the project was detected.
/// * `kind`, `opts` - forwarded to [`label_for`] to choose the label.
///
/// Returns `(label, rel_path)`. `rel_path` is `"."` when `dir` is the scan
/// root (or is not located under `base_dir`), and `"./<path>"` otherwise,
/// with backslashes replaced by `/`.
fn label_and_relpath(
    base_dir: &Path,
    dir: &Path,
    kind: &KindId,
    opts: &DiscoverOptions,
) -> (String, String) {
    // A basename that is not valid UTF-8 is converted lossily instead of
    // being discarded, so such a directory keeps a recognizable label that
    // agrees with how `rel_path` is rendered below. `"root"` is only used
    // when the path has no final component (e.g. `/` or a path ending in
    // `..`).
    let basename = dir
        .file_name()
        .map(|name| name.to_string_lossy().into_owned())
        .unwrap_or_else(|| "root".to_string());
    let rel = dir
        .strip_prefix(base_dir)
        .ok()
        .map(|p| p.to_string_lossy().replace('\\', "/"))
        .unwrap_or_default();
    let rel_path = if rel.is_empty() {
        ".".to_string()
    } else {
        format!("./{}", rel)
    };
    let label = label_for(&basename, dir, kind, opts.label_strategy);
    (label, rel_path)
}

/// Picks the label for a project of `kind` located in `dir`.
///
/// With [`LabelStrategy::Basename`] the result is always `basename`. With
/// [`LabelStrategy::PreferManifestName`] the manifest name is used for the
/// `"rust"` kind (`Cargo.toml`) and for the `"node"` / `"bun"` kinds
/// (`package.json`); `basename` is the fallback whenever no name can be read
/// and for every other kind.
fn label_for(basename: &str, dir: &Path, kind: &KindId, strategy: LabelStrategy) -> String {
    let from_manifest = match strategy {
        LabelStrategy::Basename => None,
        LabelStrategy::PreferManifestName => match kind.as_str() {
            "rust" => read_cargo_package_name(dir),
            "node" | "bun" => read_package_json_name(dir),
            _ => None,
        },
    };
    from_manifest.unwrap_or_else(|| basename.to_string())
}

/// Reads `package.name` from the `Cargo.toml` located in `dir`.
///
/// Returns `None` when the file cannot be read, is not valid TOML, or has no
/// string `name` inside a `package` table.
fn read_cargo_package_name(dir: &Path) -> Option<String> {
    let manifest = dir.join("Cargo.toml");
    let raw = std::fs::read_to_string(manifest).ok()?;
    let doc: toml::Value = toml::from_str(&raw).ok()?;
    let name = doc.get("package")?.get("name")?.as_str()?;
    Some(name.to_owned())
}

/// Reads the top-level `name` from the `package.json` located in `dir`.
///
/// Returns `None` when the file cannot be read, is not valid JSON, or has no
/// string `name` entry.
fn read_package_json_name(dir: &Path) -> Option<String> {
    let manifest = dir.join("package.json");
    let raw = std::fs::read_to_string(manifest).ok()?;
    let doc: serde_json::Value = serde_json::from_str(&raw).ok()?;
    let name = doc.get("name")?.as_str()?;
    Some(name.to_owned())
}

/// Serde `serialize_with` helper for path lists: emits a sequence in which
/// every path has been converted with `crate::path_norm::to_posix`.
#[cfg(feature = "serde")]
fn serialize_path_vec<S>(paths: &[PathBuf], s: S) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    use serde::ser::SerializeSeq;

    let mut out = s.serialize_seq(Some(paths.len()))?;
    paths
        .iter()
        .try_for_each(|path| out.serialize_element(&crate::path_norm::to_posix(path)))?;
    out.end()
}