nornir 0.4.29

Companion to cargo: dependency tracking, release gating, deploy, benchmarks, and documentation assembly. Project-agnostic.
//! AMD ROCm runtime-lib discovery + GPU probe for the ort embedder.
//!
//! The mirror image of [`super::cuda`] for AMD silicon. The ort
//! `MIGraphXExecutionProvider` / `ROCmExecutionProvider` are `dlopen`ed lazily
//! when the session is built, and they `NEED` the ROCm runtime libs
//! (`libamdhip64`, `libMIOpen`, `librocblas`, `libmigraphx_c`, …). glibc fixes
//! the loader search path at process start, so the robust fix — same as the
//! CUDA path — is to **`dlopen` the ROCm libs ourselves with `RTLD_GLOBAL`**
//! before the provider loads, so its `NEEDED` sonames resolve against the
//! already-resident globals.
//!
//! Discovery order (first dir that contains a given lib wins):
//!   1. `NORNIR_ROCM_LIBS` — explicit colon-separated dirs (highest priority).
//!   2. `$ROCM_PATH` / `$HIP_PATH` → `…/lib` (and `…/lib64`).
//!   3. Known system dirs: `/opt/rocm*/lib`, `/usr/lib/x86_64-linux-gnu`.
//!
//! Best-effort: anything missing just means ort falls back to CPU (no panic).
//! [`available`] is the runtime probe the embedder uses to decide whether to
//! even request the AMD GPU EP — when it returns `false` (no ROCm on the box)
//! the embedder requests CUDA-then-CPU exactly as before, so a non-AMD machine
//! is completely unaffected.
//!
//! Cargo feature: `embed-ort-rocm` (which also turns on `embed-ort`).

use std::path::{Path, PathBuf};
use std::sync::OnceLock;

/// ROCm libraries to preload, in leaf-first load order: a lib's own `NEEDED`
/// ROCm deps must already be global when it loads. `libamdhip64` (the HIP
/// runtime) underpins the rest, so it loads first; the MIGraphX umbrella loads
/// last.
///
/// `run` walks this list front to back, resolving each name through `find_lib`
/// (exact file name or a versioned variant) and recording the names that
/// loaded in [`RocmPreload::loaded`].
const ORDERED_SONAMES: &[&str] = &[
    "libamdhip64.so",   // HIP runtime (the AMD analogue of libcudart)
    "librocblas.so",    // BLAS
    "libhipblas.so",    // HIP BLAS shim
    "librocfft.so",     // FFT
    "libMIOpen.so",     // DNN primitives (the cuDNN analogue)
    "libmigraphx_c.so", // MIGraphX C API (the provider's graph compiler)
];

/// Outcome of a ROCm preload attempt (for logging / diagnostics).
///
/// `Default` is the "nothing found" state: no libs loaded, no dirs searched,
/// both flags false.
#[derive(Debug, Default, Clone)]
pub struct RocmPreload {
    /// Sonames successfully `dlopen`ed (kept resident for the process life).
    /// Entries are the names from the preload list, not the resolved paths.
    pub loaded: Vec<String>,
    /// Directories that were searched (only candidates that existed as
    /// directories at discovery time are kept).
    pub dirs: Vec<PathBuf>,
    /// True if the HIP runtime (`libamdhip64`) loaded.
    pub hip: bool,
    /// True if MIOpen (DNN) loaded.
    pub miopen: bool,
}

// Process-wide cache of the one-time preload outcome; filled on the first
// `ensure()` call and read-only afterwards.
// Loaded libraries are leaked on purpose: they must stay resident for the whole
// process so the provider can resolve against them.
static PRELOAD: OnceLock<RocmPreload> = OnceLock::new();

/// Run ROCm discovery + `dlopen` at most once and hand back the cached
/// outcome. Later calls return the same [`RocmPreload`] without repeating the
/// work, so it is fine to call this before every session build.
pub fn ensure() -> &'static RocmPreload {
    // Fast path: an earlier call already stored the outcome.
    if let Some(cached) = PRELOAD.get() {
        return cached;
    }
    PRELOAD.get_or_init(run)
}

/// Runtime probe: can the AMD ROCm GPU path be used on this machine?
///
/// Returns `true` only when all three hold: [`driver_present`] succeeds, the
/// HIP runtime was preloaded, and MIOpen was preloaded. When it returns
/// `false` the caller should not request the AMD EP — it would only fall back
/// to CPU. A missing ROCm install therefore degrades to CPU and never errors
/// (the `available()`-style probe G1 asks for).
pub fn available() -> bool {
    let preload = ensure();
    if !driver_present() {
        return false;
    }
    preload.hip && preload.miopen
}

fn run() -> RocmPreload {
    let dirs = candidate_dirs();
    let mut out = RocmPreload {
        dirs: dirs.clone(),
        ..Default::default()
    };
    for soname in ORDERED_SONAMES {
        if let Some(path) = find_lib(&dirs, soname) {
            if dlopen_global(&path) {
                out.loaded.push(soname.to_string());
                if soname.starts_with("libamdhip64") {
                    out.hip = true;
                } else if soname.starts_with("libMIOpen") {
                    out.miopen = true;
                }
            }
        }
    }
    out
}

/// Probe whether the AMD HIP runtime can be opened by its bare soname
/// (`libamdhip64.so`), i.e. whether the system loader can find a ROCm install
/// on its own. As with the NVIDIA driver, nornir can **detect** this but
/// cannot install it: the `amdgpu` kernel module + ROCm packages need root.
/// Only the *runtime* libs are discoverable via [`ensure`].
pub fn driver_present() -> bool {
    use libloading::os::unix::{Library, RTLD_NOW};

    let soname = std::ffi::OsStr::new("libamdhip64.so");
    // SAFETY: probing a well-known system soname; the handle is only tested for
    // success and dropped when this function returns.
    let probe = unsafe { Library::open(Some(soname), RTLD_NOW) };
    probe.is_ok()
}

/// Report label for a presence flag: `"found"` when set, `"MISSING"` otherwise.
fn yn(b: bool) -> &'static str {
    match b {
        true => "found",
        false => "MISSING",
    }
}

/// Preflight the ROCm GPU path: `(gpu_ready, human_report)`. Mirrors the CUDA
/// [`super::cuda::preflight`]; drives a future `nornir vector doctor --rocm`.
///
/// `gpu_ready` is exactly [`available`]. The report lists the HIP / MIOpen
/// status, the libs that loaded, the dirs that were searched, any preload libs
/// that did not load, a verdict line, and one remediation hint per detected
/// gap:
/// * loader cannot resolve `libamdhip64.so` → install ROCm;
/// * loader resolves it but the preload did not load HIP → point
///   `NORNIR_ROCM_LIBS` / `ROCM_PATH` at the ROCm lib dir;
/// * MIOpen did not load → install it or provide a matched lib set.
pub fn preflight() -> (bool, String) {
    let p = ensure();
    let driver = driver_present();
    let gpu_ready = available();

    let mut s = String::from("nornir ROCm preflight (embed-ort-rocm AMD GPU path)\n");
    s.push_str(&format!("  AMD HIP runtime libamdhip64 : {}\n", yn(driver && p.hip)));
    s.push_str(&format!("  MIOpen (ROCm DNN)           : {}\n", yn(p.miopen)));
    s.push_str(&format!(
        "  ROCm libs loaded           : {}\n",
        if p.loaded.is_empty() { "(none)".into() } else { p.loaded.join(", ") }
    ));
    s.push_str(&format!(
        "  dirs searched              : {}\n",
        p.dirs.iter().map(|d| d.display().to_string()).collect::<Vec<_>>().join(", ")
    ));
    // Preload-list entries that never made it into `loaded`.
    let missing: Vec<&str> = ORDERED_SONAMES
        .iter()
        .copied()
        .filter(|n| !p.loaded.iter().any(|l| l == n))
        .collect();
    if !missing.is_empty() {
        s.push_str(&format!("  runtime libs not found     : {}\n", missing.join(", ")));
    }
    s.push_str(&format!(
        "\n  verdict: AMD GPU embedding {}\n",
        if gpu_ready { "READY ✓" } else { "unavailable → CPU fallback" }
    ));
    if !driver {
        s.push_str(
            "  → install ROCm for your distro (e.g. `sudo apt install rocm-hip-runtime miopen-hip`, \
             or AMD's amdgpu-install). nornir can't install the kernel driver — it needs root + the \
             amdgpu module + (usually) a reboot.\n",
        );
    }
    // The loader can see HIP but the preload could not load it from any searched
    // dir: without this hint the report would say "unavailable" with no advice.
    if driver && !p.hip {
        s.push_str(
            "  → HIP runtime resolves via the system loader but was not loaded from the searched \
             dirs. Set NORNIR_ROCM_LIBS (or ROCM_PATH) to the ROCm lib dir that holds \
             libamdhip64.so.\n",
        );
    }
    if driver && !p.miopen {
        s.push_str(
            "  → ROCm runtime incomplete (no MIOpen). Install `miopen-hip` / `migraphx`, or put a \
             matched ROCm .so set in ONE dir and set NORNIR_ROCM_LIBS to it.\n",
        );
    }
    if gpu_ready {
        s.push_str("  → all set; embed-ort runs on the AMD GPU via MIGraphX/ROCm.\n");
    }
    (gpu_ready, s)
}

/// Build the ordered, de-duplicated list of directories to search for ROCm
/// libs. Only candidates that currently exist as directories are kept.
///
/// Order (earlier wins in `find_lib`):
///   1. `NORNIR_ROCM_LIBS` — explicit path list (platform path separator).
///   2. `$ROCM_PATH`, then `$HIP_PATH` → `<root>/lib`, `<root>/lib64`.
///      Empty values are ignored: joining `lib` onto an empty root would yield
///      a cwd-relative `lib` dir, which must never be searched implicitly.
///   3. Known system locations, including every `/opt/rocm-<version>/lib`
///      found on disk (newest version first).
fn candidate_dirs() -> Vec<PathBuf> {
    // Keep a candidate only if it is an existing directory not already listed.
    fn push_unique(dirs: &mut Vec<PathBuf>, candidate: PathBuf) {
        if candidate.is_dir() && !dirs.contains(&candidate) {
            dirs.push(candidate);
        }
    }

    let mut dirs: Vec<PathBuf> = Vec::new();

    // 1. Explicit override.
    if let Some(list) = std::env::var_os("NORNIR_ROCM_LIBS") {
        for dir in std::env::split_paths(&list) {
            push_unique(&mut dirs, dir);
        }
    }

    // 2. ROCM_PATH / HIP_PATH → <root>/lib(64).
    for key in ["ROCM_PATH", "HIP_PATH"] {
        let Some(root) = std::env::var_os(key).filter(|value| !value.is_empty()) else {
            continue;
        };
        let root = PathBuf::from(root);
        for sub in ["lib", "lib64"] {
            push_unique(&mut dirs, root.join(sub));
        }
    }

    // 3. Known system locations. The fixed entries keep their historical
    //    order; versioned installs discovered on disk slot in after them.
    for sys in [
        "/opt/nornir/rocm",
        "/opt/rocm/lib",
        "/opt/rocm-6.0.0/lib",
        "/opt/rocm-6.1.0/lib",
    ] {
        push_unique(&mut dirs, PathBuf::from(sys));
    }
    for dir in versioned_rocm_lib_dirs(Path::new("/opt")) {
        push_unique(&mut dirs, dir);
    }
    for sys in ["/opt/rocm/lib64", "/usr/lib/x86_64-linux-gnu"] {
        push_unique(&mut dirs, PathBuf::from(sys));
    }
    dirs
}

/// `<opt_root>/rocm-<version>/lib` for every `rocm-<n>[.<n>…]` entry under
/// `opt_root`, ordered newest version first. Unreadable roots yield nothing.
fn versioned_rocm_lib_dirs(opt_root: &Path) -> Vec<PathBuf> {
    let Ok(entries) = std::fs::read_dir(opt_root) else {
        return Vec::new();
    };
    let mut found: Vec<(Vec<u64>, PathBuf)> = entries
        .flatten()
        .filter_map(|entry| {
            let name = entry.file_name();
            let version = name.to_str()?.strip_prefix("rocm-")?;
            // Compare numerically so that 6.10 sorts above 6.2.
            let parts = version
                .split('.')
                .map(|part| part.parse::<u64>().ok())
                .collect::<Option<Vec<u64>>>()?;
            Some((parts, entry.path().join("lib")))
        })
        .collect();
    found.sort_by(|a, b| b.0.cmp(&a.0));
    found.into_iter().map(|(_, dir)| dir).collect()
}

/// Locate `soname` in `dirs`; the first directory with a usable match wins.
///
/// Within a directory the exact file name is preferred. Otherwise the
/// highest-versioned `soname.<n>[.<n>…]` variant is used (e.g. `libMIOpen.so`
/// → `libMIOpen.so.1.0`). Only purely numeric version suffixes count, so
/// look-alikes such as `libfoo.so.bak` or `libfoo.sound` are ignored, and the
/// choice does not depend on directory iteration order. Entries that do not
/// resolve to an existing file (dangling symlinks) are skipped.
///
/// Returns `None` when no directory holds a match; unreadable directories are
/// skipped.
fn find_lib(dirs: &[PathBuf], soname: &str) -> Option<PathBuf> {
    for dir in dirs {
        let exact = dir.join(soname);
        if exact.exists() {
            return Some(exact);
        }
        let Ok(entries) = std::fs::read_dir(dir) else {
            continue;
        };
        // Tuples order by version first, then by name as a stable tie-break.
        let newest = entries
            .flatten()
            .filter_map(|entry| {
                let name = entry.file_name().into_string().ok()?;
                let version = version_suffix(&name, soname)?;
                Some((version, name))
            })
            .filter(|(_, name)| dir.join(name).exists())
            .max();
        if let Some((_, name)) = newest {
            return Some(dir.join(name));
        }
    }
    None
}

/// Parse the numeric version that follows `soname.` in `file_name`
/// (`libfoo.so.6.1` → `[6, 1]`); `None` if the name is not such a variant.
fn version_suffix(file_name: &str, soname: &str) -> Option<Vec<u64>> {
    file_name
        .strip_prefix(soname)?
        .strip_prefix('.')?
        .split('.')
        .map(|part| {
            let numeric = !part.is_empty() && part.bytes().all(|b| b.is_ascii_digit());
            if numeric {
                part.parse::<u64>().ok()
            } else {
                None
            }
        })
        .collect()
}

/// `dlopen` the library at `path` with `RTLD_NOW | RTLD_GLOBAL` and keep it
/// resident. Returns `true` on success and `false` if the load failed (the
/// error itself is discarded — preloading is best-effort).
fn dlopen_global(path: &Path) -> bool {
    use libloading::os::unix::{Library, RTLD_GLOBAL, RTLD_NOW};

    let flags = RTLD_NOW | RTLD_GLOBAL;
    // SAFETY: loading a ROCm shared lib by absolute path; the handle is leaked
    // so it stays resident for the provider to resolve against.
    let Ok(handle) = (unsafe { Library::open(Some(path), flags) }) else {
        return false;
    };
    // Forget the handle on purpose so it is never dropped.
    std::mem::forget(handle);
    true
}

#[cfg(test)]
mod tests {
    use super::*;

    /// LAW (inject-and-assert): on a box with no ROCm, the probe must return a
    /// *sensible* answer — `available()` false and a non-empty preflight report
    /// whose verdict names CPU fallback. We can't inject an AMD GPU here, so we
    /// assert the no-GPU branch concretely; if a real GPU IS present, the
    /// inverse invariants are asserted instead.
    ///
    /// The searched-dir list is deliberately NOT required to be non-empty:
    /// discovery keeps only directories that exist, so a machine without ROCm
    /// and without `/usr/lib/x86_64-linux-gnu` legitimately searches nothing.
    #[test]
    fn probe_degrades_to_cpu_without_rocm() {
        let p = ensure();

        // Invariants of the preload record that hold on every machine.
        assert!(
            p.loaded.iter().all(|l| ORDERED_SONAMES.contains(&l.as_str())),
            "loaded entries must come from the preload list: {:?}",
            p.loaded
        );
        assert_eq!(
            p.hip,
            p.loaded.iter().any(|l| l.starts_with("libamdhip64")),
            "hip flag must track the loaded list"
        );
        assert_eq!(
            p.miopen,
            p.loaded.iter().any(|l| l.starts_with("libMIOpen")),
            "miopen flag must track the loaded list"
        );

        let (ready, report) = preflight();
        // available() and preflight() must agree.
        assert_eq!(ready, available(), "preflight verdict must match available()");
        assert!(report.contains("ROCm preflight"), "report header missing: {report}");
        assert!(report.contains("dirs searched"), "report must list searched dirs: {report}");

        if ready {
            // If a real AMD GPU IS present, the invariants still hold.
            assert!(driver_present() && p.hip && p.miopen, "available() implies hip+miopen+driver");
            assert!(report.contains("READY"), "ready box should say READY: {report}");
        } else {
            // The common CI/dev case: no AMD GPU. Everything degrades cleanly.
            assert!(report.contains("CPU fallback"), "verdict should name CPU fallback: {report}");
        }
    }
}