alef-cli 0.16.54

CLI for the alef polyglot binding generator
use std::collections::{BTreeSet, HashSet};
use std::fs;
use std::path::{Path, PathBuf};
use tracing::info;

/// Reports whether the file at `path` carries alef's `alef:hash:` ownership marker.
///
/// The file is read as UTF-8 text and handed to `alef_core::hash::extract_hash`;
/// the result is `true` exactly when that helper finds a hash. Which part of the
/// file is inspected is decided by `extract_hash`, not here.
///
/// Any read failure (missing file, a directory, permission error, non-UTF-8
/// content) yields `false`, so unreadable files are never treated as alef-owned.
///
/// Loose markers such as `"auto-generated by alef"` or `"DO NOT EDIT"` are
/// deliberately not consulted: they show up in plenty of vendored files (cgo
/// headers, swig output, autoconf artefacts) that alef must never delete.
pub(crate) fn has_alef_hash(path: &Path) -> bool {
    match fs::read_to_string(path) {
        Ok(text) => alef_core::hash::extract_hash(&text).is_some(),
        Err(_) => false,
    }
}

/// Clean up orphan alef-generated files that are no longer in the current generation output.
///
/// Strategy: walk only the directories where the current run actually wrote files
/// (the parent dir of every entry in `current_gen_paths`). For each file in those
/// directories whose first lines contain an alef-generated header marker, if its
/// canonicalized absolute path is NOT in the current run's path set, delete it.
///
/// Walking only the parents of just-written files is what keeps the cleanup safe
/// when callers (e.g. `alef generate`) emit only a subset of categories: scaffold
/// dirs that the current run did not touch are never visited, so untouched files
/// in those dirs (e.g. user-customized package manifests) are preserved.
pub fn cleanup_orphaned_files(current_gen_paths: &HashSet<PathBuf>) -> anyhow::Result<usize> {
    if current_gen_paths.is_empty() {
        return Ok(0);
    }

    // Normalize current_gen_paths so the comparison below is consistent. canonicalize()
    // resolves `.` / `..` / symlinks. If a file does not exist (yet), fall back to the
    // raw absolute path. The set is what we compare against during the walk.
    let normalized: HashSet<PathBuf> = current_gen_paths
        .iter()
        .map(|p| p.canonicalize().unwrap_or_else(|_| p.clone()))
        .collect();

    // Collect the set of parent directories actually touched in this run.
    // Canonicalize so cross-platform path resolution (e.g. macOS /tmp vs.
    // /private/tmp symlinks) does not silently break the descend-check
    // comparisons in `cleanup_dir_recursive`, which always sees canonicalized
    // subdirectory paths.
    let touched_dirs: BTreeSet<PathBuf> = current_gen_paths
        .iter()
        .filter_map(|p| p.parent().map(|d| d.canonicalize().unwrap_or_else(|_| d.to_path_buf())))
        .collect();

    let mut removed_count = 0;
    let mut visited_dirs: HashSet<PathBuf> = HashSet::new();

    for dir in &touched_dirs {
        if !dir.exists() {
            continue;
        }
        let canonical_dir = dir.canonicalize().unwrap_or_else(|_| dir.clone());
        if !visited_dirs.insert(canonical_dir.clone()) {
            continue;
        }
        removed_count += cleanup_dir_recursive(&canonical_dir, &normalized, &touched_dirs)?;
    }

    Ok(removed_count)
}

/// Sweep `dir` for stale alef-generated files and return how many were deleted.
///
/// Files: an entry is removed only when [`has_alef_hash`] accepts it AND its
/// canonicalized path is absent from `normalized_gen_paths`. The hash gate is the
/// safety net that keeps user-customised files and vendored artefacts alive.
///
/// Directories: a subdirectory is entered when its canonical path is a touched
/// dir, is an ancestor of a touched dir, or sits beneath a touched dir. The last
/// case is what catches orphans in subtrees a backend used to own but no longer
/// writes to (e.g. the kotlin-android backend dropped its `src/main/java/` Java
/// DTO emit and left stale alef-marked Java files behind).
///
/// Errors from listing a directory or removing a file are propagated.
fn cleanup_dir_recursive(
    dir: &Path,
    normalized_gen_paths: &HashSet<PathBuf>,
    touched_dirs: &BTreeSet<PathBuf>,
) -> anyhow::Result<usize> {
    let mut swept = 0;

    for listing in fs::read_dir(dir)? {
        let candidate = listing?.path();

        if !candidate.is_dir() {
            // Plain file (or anything that is not a directory): only alef-stamped
            // files that the current run did not produce are eligible for removal.
            if has_alef_hash(&candidate) {
                let resolved = candidate.canonicalize().unwrap_or_else(|_| candidate.clone());
                let still_generated = normalized_gen_paths.contains(&resolved);
                if !still_generated {
                    info!("Removing stale alef-generated file: {}", candidate.display());
                    fs::remove_file(&candidate)?;
                    swept += 1;
                }
            }
            continue;
        }

        // Directory: decide whether it is related to any touched dir. Both
        // `starts_with` checks are component-wise and succeed for equal paths,
        // so "is itself touched" is covered without a separate equality test.
        let resolved_sub = candidate.canonicalize().unwrap_or_else(|_| candidate.clone());
        let mut related = false;
        for touched in touched_dirs {
            if touched.starts_with(&resolved_sub) || resolved_sub.starts_with(touched) {
                related = true;
                break;
            }
        }

        if related {
            swept += cleanup_dir_recursive(&candidate, normalized_gen_paths, touched_dirs)?;
        }
    }

    Ok(swept)
}

#[cfg(test)]
mod tests {
    use super::{cleanup_orphaned_files, has_alef_hash};
    use std::collections::HashSet;
    use std::fs;

    /// Placeholder 64-character hex digest standing in for a real alef hash in
    /// the fixtures below. Its value is irrelevant — ownership detection only
    /// cares that the `alef:hash:` prefix is present.
    const TEST_HASH: &str = "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef";

    /// An alef-stamped file that the current run did not emit is deleted, while
    /// the stamped file that was emitted stays.
    #[test]
    fn cleanup_removes_orphan_with_alef_hash_header() {
        let sandbox = tempfile::tempdir().expect("tempdir");
        let kotlin_pkg = sandbox.path().join("packages/kotlin/src/main/kotlin/dev/demo");
        fs::create_dir_all(&kotlin_pkg).expect("create package dir");

        let alef_header = format!("// alef:hash:{TEST_HASH}\n\n");
        let kept = kotlin_pkg.join("GraphQLRouteConfig.kt");
        let orphan = kotlin_pkg.join("DefaultClient.kt");
        fs::write(&kept, format!("{alef_header}class GraphQLRouteConfig\n")).expect("write current file");
        fs::write(&orphan, format!("{alef_header}class DefaultClient\n")).expect("write stale file");

        let mut generated = HashSet::new();
        generated.insert(kept.clone());

        let removed = cleanup_orphaned_files(&generated).expect("cleanup");

        assert_eq!(removed, 1);
        assert!(kept.exists());
        assert!(!orphan.exists());
    }

    /// Loose markers such as "Generated by alef" or "DO NOT EDIT" without the
    /// `alef:hash:` line must NOT trigger deletion — they appear in vendored cgo
    /// headers, swig output, and other consumer-managed files.
    #[test]
    fn cleanup_preserves_file_with_loose_marker_but_no_hash() {
        let sandbox = tempfile::tempdir().expect("tempdir");
        let include_dir = sandbox.path().join("packages/go/include");
        fs::create_dir_all(&include_dir).expect("create dir");

        // A vendored cgo header: carries a "DO NOT EDIT" comment but no alef:hash.
        let cgo_header = include_dir.join("kreuzcrawl.h");
        fs::write(
            &cgo_header,
            "// DO NOT EDIT — generated by cgo. See CGO_ENABLED.\n#ifndef KREUZCRAWL_H\n#define KREUZCRAWL_H\n#endif\n",
        )
        .expect("write vendored header");

        // An alef-owned sibling, so the directory IS part of touched_dirs.
        let generated_go = include_dir.join("bindings.go");
        fs::write(&generated_go, format!("// alef:hash:{TEST_HASH}\npackage main\n")).expect("write alef file");

        let mut generated = HashSet::new();
        generated.insert(generated_go.clone());

        let removed = cleanup_orphaned_files(&generated).expect("cleanup");

        assert_eq!(removed, 0, "vendored file without alef:hash must not be deleted");
        assert!(cgo_header.exists(), "vendored cgo header must survive");
        assert!(generated_go.exists(), "current alef file must survive");
    }

    /// `has_alef_hash` must return true only when the `alef:hash:` line is present.
    #[test]
    fn has_alef_hash_detects_hash_line() {
        let sandbox = tempfile::tempdir().expect("tempdir");

        let stamped = sandbox.path().join("with_hash.rs");
        fs::write(&stamped, format!("// alef:hash:{TEST_HASH}\nfn main() {{}}\n")).expect("write");

        let unstamped = sandbox.path().join("without_hash.rs");
        fs::write(
            &unstamped,
            "// auto-generated by alef\n// DO NOT EDIT\nfn main() {}\n",
        )
        .expect("write");

        assert!(has_alef_hash(&stamped), "must detect alef:hash: line");
        assert!(!has_alef_hash(&unstamped), "must not match loose markers");
    }

    /// Regression: orphans in a sibling subtree of a touched directory must be
    /// swept. Models the kotlin-android case: older versions wrote Java DTOs
    /// into `src/main/java/`, later versions stopped doing so but kept writing
    /// Kotlin to `src/main/kotlin/`. The stale Java files sit in a subtree the
    /// current run never writes to, yet they are descendants of the package
    /// root, which IS touched (via `build.gradle.kts` etc.).
    #[test]
    fn cleanup_removes_orphan_in_sibling_subtree_of_touched_dir() {
        let sandbox = tempfile::tempdir().expect("tempdir");
        let package_root = sandbox.path().join("packages/kotlin-android");

        let kotlin_dir = package_root.join("src/main/kotlin/dev/demo/android");
        fs::create_dir_all(&kotlin_dir).expect("create kotlin dir");
        let java_dir = package_root.join("src/main/java/dev/demo");
        fs::create_dir_all(&java_dir).expect("create java dir");

        let alef_header = format!("// alef:hash:{TEST_HASH}\n");

        // Files the current run emits.
        let gradle_script = package_root.join("build.gradle.kts");
        fs::write(&gradle_script, format!("{alef_header}plugins {{}}\n")).expect("write build.gradle.kts");
        let kotlin_bridge = kotlin_dir.join("DemoBridge.kt");
        fs::write(&kotlin_bridge, format!("{alef_header}object DemoBridge\n")).expect("write bridge.kt");

        // Stale alef output left behind in the abandoned Java subtree.
        let orphan_java = java_dir.join("CrawlEngineHandle.java");
        fs::write(
            &orphan_java,
            format!("{alef_header}public class CrawlEngineHandle {{}}\n"),
        )
        .expect("write stale java");

        // Hand-written file in the same orphan subtree — must survive.
        let handwritten_java = java_dir.join("UserCode.java");
        fs::write(&handwritten_java, "// hand-written\npublic class UserCode {}\n").expect("write user java");

        let mut generated = HashSet::new();
        generated.insert(gradle_script.clone());
        generated.insert(kotlin_bridge.clone());

        let removed = cleanup_orphaned_files(&generated).expect("cleanup");

        assert_eq!(removed, 1, "exactly the alef-marked orphan must be removed");
        assert!(gradle_script.exists(), "current build.gradle.kts must survive");
        assert!(kotlin_bridge.exists(), "current bridge.kt must survive");
        assert!(!orphan_java.exists(), "stale java orphan must be removed");
        assert!(handwritten_java.exists(), "user-written java must survive (no alef hash)");
    }
}