//! alef-cli 0.9.1
//!
//! CLI for the alef polyglot binding generator.
use std::collections::{BTreeSet, HashSet};
use std::fs;
use std::path::{Path, PathBuf};
use tracing::info;

/// Substrings whose presence in a file's leading lines marks the file as alef output.
///
/// Matching is a plain, case-sensitive substring search, which is why both the
/// lower-case and upper-case spellings of the alef banner are listed.
///
/// NOTE(review): `"DO NOT EDIT"` does not mention alef, so output from other code
/// generators living in the same directory would presumably match as well — confirm
/// that this is intended.
const ALEF_HEADER_MARKERS: &[&str] = &[
    "auto-generated by alef",
    "AUTO-GENERATED by alef",
    "DO NOT EDIT",
];

/// Clean up orphan alef-generated files that are no longer in the current generation output.
///
/// Strategy: walk only the directories where the current run actually wrote files
/// (the parent dir of every entry in `current_gen_paths`). For each file in those
/// directories whose first lines contain an alef-generated header marker, if its
/// canonicalized absolute path is NOT in the current run's path set, delete it.
///
/// Walking only the parents of just-written files is what keeps the cleanup safe
/// when callers (e.g. `alef generate`) emit only a subset of categories: scaffold
/// dirs that the current run did not touch are never visited, so untouched files
/// in those dirs (e.g. user-customized package manifests) are preserved.
pub fn cleanup_orphaned_files(current_gen_paths: &HashSet<PathBuf>) -> anyhow::Result<usize> {
    if current_gen_paths.is_empty() {
        return Ok(0);
    }

    // Normalize current_gen_paths so the comparison below is consistent. canonicalize()
    // resolves `.` / `..` / symlinks. If a file does not exist (yet), fall back to the
    // raw absolute path. The set is what we compare against during the walk.
    let normalized: HashSet<PathBuf> = current_gen_paths
        .iter()
        .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()))
        .collect();

    // Collect the set of parent directories actually touched in this run.
    let touched_dirs: BTreeSet<PathBuf> = current_gen_paths
        .iter()
        .filter_map(|p| p.parent().map(|d| d.to_path_buf()))
        .collect();

    let mut removed_count = 0;
    let mut visited_dirs: HashSet<PathBuf> = HashSet::new();

    for dir in &touched_dirs {
        if !dir.exists() {
            continue;
        }
        let canonical_dir = dir.canonicalize().unwrap_or_else(|_| dir.clone());
        if !visited_dirs.insert(canonical_dir.clone()) {
            continue;
        }
        removed_count += cleanup_dir_recursive(&canonical_dir, &normalized, &touched_dirs)?;
    }

    Ok(removed_count)
}

/// Remove orphan alef-generated files found directly in `dir`, then recurse into
/// selected subdirectories.
///
/// A regular entry is deleted when it carries an alef header (see `has_alef_header`)
/// and its canonicalized path is absent from `normalized_gen_paths`.
///
/// A subdirectory is entered only when its canonicalized path is a prefix of (or
/// equal to) some entry of `touched_dirs` — that is, when the subdirectory is a
/// touched directory or lies on the way to one. Every other subtree is left alone.
///
/// Returns the number of files deleted in `dir` and in every subdirectory visited.
///
/// # Errors
///
/// Fails when a directory cannot be listed, when a directory entry cannot be read,
/// or when an orphan file cannot be removed.
fn cleanup_dir_recursive(
    dir: &Path,
    normalized_gen_paths: &HashSet<PathBuf>,
    touched_dirs: &BTreeSet<PathBuf>,
) -> anyhow::Result<usize> {
    let mut deleted = 0usize;

    for dir_entry in fs::read_dir(dir)? {
        let entry_path = dir_entry?.path();

        if entry_path.is_dir() {
            let resolved_dir = match entry_path.canonicalize() {
                Ok(resolved) => resolved,
                Err(_) => entry_path.clone(),
            };
            // `Path::starts_with` works on whole components and is also true for
            // identical paths, so one test covers "is touched" and "leads to a
            // touched directory" alike.
            let leads_to_touched = touched_dirs
                .iter()
                .any(|touched| touched.starts_with(&resolved_dir));
            if leads_to_touched {
                deleted += cleanup_dir_recursive(&entry_path, normalized_gen_paths, touched_dirs)?;
            }
        } else if has_alef_header(&entry_path)? {
            let resolved_file = match entry_path.canonicalize() {
                Ok(resolved) => resolved,
                Err(_) => entry_path.clone(),
            };
            // Anything this run produced is current; only leftovers get removed.
            if !normalized_gen_paths.contains(&resolved_file) {
                info!("Removing stale alef-generated file: {}", entry_path.display());
                fs::remove_file(&entry_path)?;
                deleted += 1;
            }
        }
    }

    Ok(deleted)
}

/// Check if a file has an alef-generated header marker.
///
/// Only the first few lines of the file are read and searched for any entry of
/// `ALEF_HEADER_MARKERS`; the rest of the file is never loaded, so large files in a
/// touched directory cost no more than small ones.
///
/// # Returns
///
/// * `Ok(true)` when one of the inspected lines contains a marker.
/// * `Ok(false)` when no marker is found, when the file cannot be opened or read,
///   or when one of the inspected lines is not valid UTF-8.
///
/// The function currently never returns `Err`; the `Result` wrapper is kept so the
/// signature callers rely on stays unchanged.
fn has_alef_header(path: &Path) -> anyhow::Result<bool> {
    /// Number of leading lines inspected for a marker.
    const HEADER_SCAN_LINES: usize = 5;

    let file = match fs::File::open(path) {
        Ok(f) => f,
        Err(_) => {
            // Unreadable files are treated as "not ours" rather than as an error.
            return Ok(false);
        }
    };

    // Stream just the header lines instead of loading the whole file. Fully
    // qualified `std::io` paths keep this function independent of the file's
    // `use` block.
    let reader = std::io::BufReader::new(file);
    let head = match std::io::BufRead::lines(reader)
        .take(HEADER_SCAN_LINES)
        .collect::<std::io::Result<Vec<String>>>()
    {
        Ok(lines) => lines,
        Err(_) => {
            // A read failure or non-UTF-8 bytes in the header: skip the file.
            return Ok(false);
        }
    };

    // Markers are single-line strings, so testing each line separately is the same
    // as searching the lines joined by newlines.
    let found = head
        .iter()
        .any(|line| ALEF_HEADER_MARKERS.iter().any(|marker| line.contains(*marker)));

    Ok(found)
}