// nornir 0.4.42 - Docs.rs

//! Make the ort CUDA execution provider "just work" without a manual
//! `LD_LIBRARY_PATH`.
//!
//! The onnxruntime CUDA provider (`libonnxruntime_providers_cuda.so`) is
//! `dlopen`ed lazily when the session is built, and it `NEEDS` the CUDA
//! runtime libs (`libcudart`, `libcublas`, `libcudnn`, …). glibc fixes the
//! loader search path at process start, so setting `LD_LIBRARY_PATH` from
//! inside the process is ignored. The robust fix is to **`dlopen` the CUDA
//! libs ourselves with `RTLD_GLOBAL`** before the provider loads — then its
//! `NEEDED` sonames resolve against the already-loaded globals.
//!
//! Discovery order (first dir that contains a given lib wins):
//!   1. `NORNIR_CUDA_LIBS` — explicit colon-separated dirs (highest priority).
//!   2. Active Python env: `$VIRTUAL_ENV` / `$CONDA_PREFIX` →
//!      `…/site-packages/nvidia/*/lib` (pip `nvidia-*-cu12` wheels).
//!   3. `NORNIR_CUDA_SCAN_ROOTS` — colon-separated roots scanned for
//!      `**/site-packages/nvidia/*/lib` (one level of venvs).
//!   4. Known system dirs: `/opt/nornir/cuda` first, then ollama bundles,
//!      `/usr/local/cuda/lib64`, `/usr/local/cuda-12/lib64`, etc.
//!
//! Best-effort: anything missing just means ort falls back to CPU (no panic).
//!
//! Cargo feature: `embed-ort`.

use std::path::{Path, PathBuf};
use std::sync::OnceLock;

/// CUDA runtime sonames to preload, in leaf-first order: a lib's own `NEEDED`
/// CUDA deps must already be global when it loads.
///
/// cuDNN is intentionally not listed here: `run` loads it (split sub-libs
/// first, then the `libcudnn.so.9` umbrella) after every entry of this table
/// has been tried. `preflight` and `setup` also iterate this table, so its
/// entries double as the "expected runtime set" in reports.
const ORDERED_SONAMES: &[&str] = &[
    "libcudart.so.12",
    "libnvJitLink.so.12",
    "libcublasLt.so.12",
    "libcublas.so.12",
    "libcufft.so.11",
    "libcurand.so.10",
    "libcusparse.so.12",
    "libcusolver.so.11",
];

/// Outcome of a preload attempt (for logging / diagnostics).
///
/// Produced once by `run` and cached for the life of the process; callers get
/// a shared reference through [`ensure`].
#[derive(Debug, Default, Clone)]
pub struct CudaPreload {
    /// Sonames successfully `dlopen`ed (kept resident for the process life).
    /// Entries are the requested soname (e.g. `libcudart.so.12`), not the path
    /// of the file that satisfied it. cuDNN sub-libs are not recorded here,
    /// only the `libcudnn.so.9` umbrella.
    pub loaded: Vec<String>,
    /// Directories that were searched, in priority order. Only directories
    /// that existed at discovery time are listed.
    pub dirs: Vec<PathBuf>,
    /// True if the `libcudnn.so.9` umbrella lib was found and loaded.
    pub cudnn: bool,
}

// Process-wide record of the single preload attempt, filled on the first call
// to `ensure`. The libraries themselves are leaked on purpose (see
// `dlopen_global`): they must stay resident for the whole process so the
// provider can resolve against them.
static PRELOAD: OnceLock<CudaPreload> = OnceLock::new();

/// Discover + `dlopen` CUDA libs once. Idempotent; safe to call before every
/// session build.
///
/// The first call runs discovery and loading; every later call returns the
/// same cached [`CudaPreload`] without touching the filesystem again, so
/// environment changes made after the first call are not picked up.
pub fn ensure() -> &'static CudaPreload {
    // `OnceLock` guarantees `run` executes at most once per process.
    PRELOAD.get_or_init(run)
}

/// Locate a usable `libonnxruntime.so` for the **dynamic-load** ort backend (#9).
///
/// With `ort/load-dynamic` the binary links no onnxruntime; ort `dlopen`s it at
/// runtime from `$ORT_DYLIB_PATH` (or the bare soname). The runtime selector
/// uses this to decide whether the ort/GPU path is usable on this box at all:
/// `Some(path)` means onnxruntime is present and loadable, `None` means fall
/// back to the pure-Rust tract CPU embedder.
///
/// Candidates are tried in this order, and the first one that loads wins:
///   1. `$ORT_DYLIB_PATH` (explicit; ignored when empty or not loadable).
///   2. The dirs CUDA discovery uses ([`candidate_dirs`]) — so dropping a
///      `libonnxruntime.so` next to the CUDA libs in `/opt/nornir/cuda` is enough.
///   3. The bare soname `libonnxruntime.so` (system loader path).
///
/// Every candidate is verified with `loadable` before it is returned, so a
/// stale or broken file is skipped instead of being handed to ort.
pub fn onnxruntime_dylib() -> Option<PathBuf> {
    const SONAME: &str = "libonnxruntime.so";

    // 1. Explicit override — trusted only if it is non-empty and loads.
    let explicit = std::env::var_os("ORT_DYLIB_PATH")
        .map(PathBuf::from)
        .filter(|p| !p.as_os_str().is_empty() && loadable(p));
    if explicit.is_some() {
        return explicit;
    }

    // 2. Colocated runtime in one of the CUDA discovery dirs.
    let colocated = find_lib(&candidate_dirs(), SONAME).filter(|p| loadable(p));
    if colocated.is_some() {
        return colocated;
    }

    // 3. Let the system loader resolve the bare soname.
    let bare = PathBuf::from(SONAME);
    loadable(&bare).then_some(bare)
}

/// Report whether `path` is a real, loadable onnxruntime.
///
/// The file must both `dlopen` successfully and export `OrtGetApiBase`; a
/// same-named decoy without that symbol is rejected. A bare soname is resolved
/// by the loader. The probe handle is dropped before returning — ort re-loads
/// (and keeps) its own copy.
fn loadable(path: &Path) -> bool {
    use libloading::os::unix::{Library, RTLD_NOW};

    type OrtGetApiBase = unsafe extern "C" fn() -> *const std::ffi::c_void;

    // SAFETY: probing a shared object; the handle is dropped when the match
    // arm finishes.
    match unsafe { Library::open(Some(path.as_os_str()), RTLD_NOW) } {
        Err(_) => false,
        Ok(lib) => {
            // SAFETY: looking up a symbol on a live handle; it is never called,
            // only tested for existence.
            let exported = unsafe { lib.get::<OrtGetApiBase>(b"OrtGetApiBase") }.is_ok();
            exported
        }
    }
}

/// Probe + arm the dynamic-load ort path (#9).
///
/// Returns `true` when [`onnxruntime_dylib`] finds a usable onnxruntime, and
/// `false` otherwise. On `false` the environment is left untouched and the
/// caller must use the tract CPU embedder and never touch ort (whose `api()`
/// would panic if the dylib is absent).
///
/// When the discovered path is absolute, `ORT_DYLIB_PATH` is set to it so that
/// ort's loader picks exactly that file. A relative result (a relative
/// `ORT_DYLIB_PATH` supplied by the user, or the bare soname) leaves the
/// variable as it was. Each call re-runs the probe; repeating it yields the
/// same outcome as long as the environment and filesystem are unchanged.
pub fn arm_onnxruntime() -> bool {
    let Some(path) = onnxruntime_dylib() else {
        return false;
    };
    if path.is_absolute() {
        // Intended to run at the single-threaded selection point, before any
        // ort use, so nothing else is reading the environment concurrently.
        std::env::set_var("ORT_DYLIB_PATH", &path);
    }
    true
}

/// Perform the one-off discovery + preload that backs [`ensure`].
///
/// First every soname in `ORDERED_SONAMES` is looked up in the candidate dirs
/// and opened with `dlopen_global`; the ones that open are recorded. Then, if
/// some dir holds `libcudnn.so.9`, that dir's cuDNN sub-libs are opened (in
/// sorted path order, failures ignored) followed by the umbrella lib itself.
/// Nothing here fails hard: a lib that is missing or will not load is simply
/// left out of the result.
fn run() -> CudaPreload {
    let dirs = candidate_dirs();

    // Core runtime libs, in the table's dependency order.
    let mut loaded: Vec<String> = Vec::new();
    for &soname in ORDERED_SONAMES {
        let opened = find_lib(&dirs, soname).is_some_and(|path| dlopen_global(&path));
        if opened {
            loaded.push(soname.to_string());
        }
    }

    // cuDNN 9 is split: load its sub-libs (precompiled engines, ops, …) before
    // the umbrella `libcudnn.so.9`, all from the same dir.
    let mut cudnn = false;
    if let Some(dir) = dirs.iter().find(|d| d.join("libcudnn.so.9").exists()) {
        let mut subs: Vec<PathBuf> = Vec::new();
        if let Ok(entries) = std::fs::read_dir(dir) {
            for entry in entries.flatten() {
                let path = entry.path();
                let is_sub_lib = path.file_name().and_then(|n| n.to_str()).is_some_and(|n| {
                    n.starts_with("libcudnn") && n.contains(".so.") && !n.ends_with("libcudnn.so.9")
                });
                if is_sub_lib {
                    subs.push(path);
                }
            }
        }
        subs.sort();
        for sub in &subs {
            // Best-effort: a sub-lib that fails to open is not fatal.
            dlopen_global(sub);
        }
        if dlopen_global(&dir.join("libcudnn.so.9")) {
            loaded.push("libcudnn.so.9".to_string());
            cudnn = true;
        }
    }

    CudaPreload { loaded, dirs, cudnn }
}

/// Report whether the NVIDIA **userspace driver** (`libcuda.so.1`) is loadable.
///
/// This is the lib the GPU driver package installs alongside the kernel module.
/// nornir can **detect** it but **cannot install or embed it**: the kernel
/// driver needs root, a module matching the running kernel, and (usually) a
/// reboot — it is not a redistributable `.so` you can bundle in a binary. Only
/// the *runtime* libs (cudart/cublas/cuDNN) are redistributable and
/// discoverable via [`ensure`].
///
/// The probe opens the soname through the system loader and releases the
/// handle again before returning.
pub fn driver_present() -> bool {
    use libloading::os::unix::{Library, RTLD_NOW};

    let soname = std::ffi::OsStr::new("libcuda.so.1");
    // SAFETY: probing a well-known system soname; the handle is dropped when
    // `probe` goes out of scope at the end of this function.
    let probe = unsafe { Library::open(Some(soname), RTLD_NOW) };
    probe.is_ok()
}

/// Render a presence flag for the preflight report: `"found"` or `"MISSING"`.
fn yn(b: bool) -> &'static str {
    match b {
        true => "found",
        false => "MISSING",
    }
}

/// Preflight the CUDA GPU path and return `(gpu_ready, human_report)`.
///
/// Checks the NVIDIA driver ([`driver_present`]), the runtime libs that
/// [`ensure`] resolved, and cuDNN. `gpu_ready` is true iff the driver, a
/// `libcudart*` entry and cuDNN are all present — i.e. embed-ort should run on
/// the GPU. The report lists each check, the loaded libs, the searched dirs and
/// any sonames from `ORDERED_SONAMES` that were not loaded, then a verdict and
/// advice. Runtime-lib advice is only printed once the driver is present.
/// Drives `nornir vector doctor`.
pub fn preflight() -> (bool, String) {
    const DRIVER_ADVICE: &str =
        "  → install the NVIDIA GPU driver for your distro (e.g. `sudo apt install \
         nvidia-driver-XXX`, `sudo dnf install akmod-nvidia`, or NVIDIA's .run). nornir \
         can't install the kernel driver — it needs root + a matching kernel module + reboot.\n";
    const RUNTIME_ADVICE: &str =
        "  → CUDA runtime libs incomplete. Put a matched CUDA-12 / cuDNN-9 .so set \
         (libcudart.so.12, libcublas*, libcudnn.so.9 + its sub-libs) in ONE dir and set \
         NORNIR_CUDA_LIBS to it (e.g. /opt/nornir/cuda). See `.nornir/vector.md`.\n";
    const READY_NOTE: &str = "  → all set; embed-ort runs on the GPU.\n";

    let preload = ensure();
    let has_driver = driver_present();
    let has_cudart = preload.loaded.iter().any(|name| name.starts_with("libcudart"));
    let has_cudnn = preload.cudnn;
    let gpu_ready = has_driver && has_cudart && has_cudnn;

    // Pre-render the list-valued fields of the report.
    let loaded_list = if preload.loaded.is_empty() {
        String::from("(none)")
    } else {
        preload.loaded.join(", ")
    };
    let searched: Vec<String> = preload.dirs.iter().map(|d| d.display().to_string()).collect();
    let mut missing: Vec<&str> = Vec::new();
    for &soname in ORDERED_SONAMES {
        if !preload.loaded.iter().any(|l| l.as_str() == soname) {
            missing.push(soname);
        }
    }
    let verdict = if gpu_ready { "READY ✓" } else { "unavailable → CPU fallback" };

    let mut report = String::from("nornir CUDA preflight (embed-ort GPU path)\n");
    report += &format!("  NVIDIA driver libcuda.so.1 : {}\n", yn(has_driver));
    report += &format!("  libcudart (CUDA runtime)   : {}\n", yn(has_cudart));
    report += &format!("  cuDNN 9                    : {}\n", yn(has_cudnn));
    report += &format!("  CUDA libs loaded           : {}\n", loaded_list);
    report += &format!("  dirs searched              : {}\n", searched.join(", "));
    if !missing.is_empty() {
        report += &format!("  runtime libs not found     : {}\n", missing.join(", "));
    }
    report += &format!("\n  verdict: GPU embedding {}\n", verdict);

    // Advice: exactly one of the first two applies when not ready.
    if !has_driver {
        report.push_str(DRIVER_ADVICE);
    } else if !(has_cudnn && has_cudart) {
        report.push_str(RUNTIME_ADVICE);
    }
    if gpu_ready {
        report.push_str(READY_NOTE);
    }

    (gpu_ready, report)
}

/// `nornir vector setup-cuda` — pin a complete CUDA runtime set into `target`
/// (conventionally `/opt/nornir/cuda`, the built-in search dir) by **copying**
/// the libs from wherever discovery finds them on this box (pip `nvidia-*`
/// wheels in a venv, an ollama bundle, a system toolkit). After this, the GPU
/// works for every nornir process with no env — and survives the source venv
/// being deleted.
///
/// Returns `(copied, missing)` soname lists. `copied` also names libs that
/// were already in place in `target`: because `target` may itself be one of
/// the discovery dirs, a lib can resolve to the very file it would be copied
/// to. Such files are left untouched — `std::fs::copy` onto the same file
/// would truncate it — which makes re-running setup safe.
///
/// # Errors
///
/// Fails if `target` cannot be created or if copying a discovered lib fails.
/// Creating the plain-soname alias is best-effort and never fails the call.
///
/// Downloading from NVIDIA's redist CDN (for boxes with no local source) is a
/// planned follow-up.
pub fn setup(target: &Path) -> anyhow::Result<(Vec<String>, Vec<String>)> {
    use anyhow::Context;

    // Copy `src` to `dst` unless both already refer to the same file (same
    // device + inode, so symlinks and hard links are covered too). Copying a
    // file onto itself would truncate it to zero bytes.
    fn copy_distinct(src: &Path, dst: &Path) -> std::io::Result<()> {
        use std::os::unix::fs::MetadataExt;
        if let (Ok(a), Ok(b)) = (std::fs::metadata(src), std::fs::metadata(dst)) {
            if a.dev() == b.dev() && a.ino() == b.ino() {
                return Ok(());
            }
        }
        std::fs::copy(src, dst).map(|_| ())
    }

    let dirs = candidate_dirs();
    std::fs::create_dir_all(target)
        .with_context(|| format!("create {} (need root? try sudo)", target.display()))?;
    let mut copied = Vec::new();
    let mut missing = Vec::new();

    // The ordered runtime set + cuDNN (umbrella + sub-libs from the same dir).
    for soname in ORDERED_SONAMES {
        match find_lib(&dirs, soname) {
            Some(src) => {
                let dst = target.join(src.file_name().unwrap_or_default());
                copy_distinct(&src, &dst)
                    .with_context(|| format!("copy {} -> {}", src.display(), dst.display()))?;
                // The plain `.so.N` name the loader asks for, alongside any
                // fully-versioned file name we copied. Best-effort: the
                // versioned copy above is the one that must succeed.
                let alias = target.join(soname);
                if !alias.exists() {
                    std::fs::copy(&src, &alias).ok();
                }
                copied.push(soname.to_string());
            }
            None => missing.push(soname.to_string()),
        }
    }
    if let Some(dir) = dirs.iter().find(|d| d.join("libcudnn.so.9").exists()) {
        // Counts every cuDNN file now present in `target`, including ones that
        // were already there and therefore skipped by `copy_distinct`.
        let mut n = 0usize;
        for e in std::fs::read_dir(dir).into_iter().flatten().flatten() {
            let name = e.file_name();
            let name = name.to_string_lossy();
            if name.starts_with("libcudnn") && name.contains(".so") {
                copy_distinct(&e.path(), &target.join(e.file_name()))
                    .with_context(|| format!("copy {}", e.path().display()))?;
                n += 1;
            }
        }
        copied.push(format!("libcudnn.so.9 (+{n} files)"));
    } else {
        missing.push("libcudnn.so.9".to_string());
    }
    Ok((copied, missing))
}

/// Build the prioritized list of directories to search for CUDA libs.
///
/// Sources, highest priority first: `NORNIR_CUDA_LIBS`, the active Python env
/// (`VIRTUAL_ENV`, then `CONDA_PREFIX`), `NORNIR_CUDA_SCAN_ROOTS`, and a fixed
/// set of system locations. Only paths that are existing directories are kept,
/// and a path that was already added is not added again.
fn candidate_dirs() -> Vec<PathBuf> {
    // Append `dir` if it is an existing directory not yet in `found`.
    fn add(found: &mut Vec<PathBuf>, dir: PathBuf) {
        if dir.is_dir() && !found.contains(&dir) {
            found.push(dir);
        }
    }

    // nornir's own conventional CUDA dir comes first (drop a matched CUDA-12 /
    // cuDNN-9 .so set there and the GPU "just works", no env), then ollama
    // bundles + a system CUDA toolkit.
    const SYSTEM_DIRS: [&str; 7] = [
        "/opt/nornir/cuda",
        "/usr/local/lib/ollama/cuda_v12",
        "/usr/local/lib/ollama/cuda_v13",
        "/usr/local/cuda/lib64",
        "/usr/local/cuda-12/lib64",
        "/opt/cuda/lib64",
        "/usr/lib/x86_64-linux-gnu",
    ];

    let mut found: Vec<PathBuf> = Vec::new();

    // 1. Explicit override.
    if let Some(list) = std::env::var_os("NORNIR_CUDA_LIBS") {
        std::env::split_paths(&list).for_each(|dir| add(&mut found, dir));
    }

    // 2. Active Python env → pip nvidia wheels.
    let env_roots = ["VIRTUAL_ENV", "CONDA_PREFIX"]
        .iter()
        .filter_map(|key| std::env::var_os(key));
    for root in env_roots {
        for dir in nvidia_pkg_dirs(Path::new(&root)) {
            add(&mut found, dir);
        }
    }

    // 3. Extra scan roots (each scanned for venvs one level deep).
    if let Some(list) = std::env::var_os("NORNIR_CUDA_SCAN_ROOTS") {
        let scanned = std::env::split_paths(&list).flat_map(|root| scan_root_for_nvidia(&root));
        for dir in scanned {
            add(&mut found, dir);
        }
    }

    // 4. Known system locations.
    for sys in SYSTEM_DIRS {
        add(&mut found, PathBuf::from(sys));
    }

    found
}

/// Collect `<root>/{lib,lib64}/<any>/site-packages/nvidia/<pkg>/lib` dirs for a
/// venv/conda root.
///
/// `<any>` is every entry directly under `lib` / `lib64` (in practice the
/// `python3.x` dir); no name filter is applied. Unreadable or missing levels
/// are skipped silently, and only `lib` entries that are real directories are
/// returned. Results for `lib` precede those for `lib64`.
fn nvidia_pkg_dirs(root: &Path) -> Vec<PathBuf> {
    ["lib", "lib64"]
        .iter()
        // <root>/lib, <root>/lib64 — skip whichever cannot be listed.
        .filter_map(|libname| std::fs::read_dir(root.join(libname)).ok())
        .flat_map(|interpreters| interpreters.flatten())
        // <interp>/site-packages/nvidia — one sub-dir per nvidia wheel.
        .filter_map(|interp| std::fs::read_dir(interp.path().join("site-packages/nvidia")).ok())
        .flat_map(|packages| packages.flatten())
        .map(|pkg| pkg.path().join("lib"))
        .filter(|lib| lib.is_dir())
        .collect()
}

/// Treat `root` itself and each of its immediate sub-directories as a
/// candidate venv root, and gather the nvidia wheel `lib` dirs of all of them.
///
/// Hits for `root` come first, followed by those of its sub-directories in
/// directory-listing order. An unreadable `root` contributes only its own hits.
fn scan_root_for_nvidia(root: &Path) -> Vec<PathBuf> {
    let mut found = nvidia_pkg_dirs(root);
    let children = std::fs::read_dir(root)
        .into_iter()
        .flatten()
        .flatten()
        .map(|entry| entry.path())
        .filter(|path| path.is_dir());
    for child in children {
        found.extend(nvidia_pkg_dirs(&child));
    }
    found
}

/// Locate `soname` in the first directory of `dirs` that provides it.
///
/// Directories are tried in order. Within a directory an entry named exactly
/// `soname` wins. Otherwise only genuinely versioned variants are accepted:
/// `soname` followed by one or more `.<digits>` components (e.g.
/// `libcudart.so.12` → `libcudart.so.12.8.90`). Look-alikes such as
/// `libcudart.so.120` or `libcudart.so.12.bak` are ignored. When a directory
/// holds several variants the numerically highest version is returned, so the
/// result does not depend on directory-listing order.
///
/// Returns `None` when no directory provides the lib; unreadable directories
/// are skipped.
fn find_lib(dirs: &[PathBuf], soname: &str) -> Option<PathBuf> {
    // Numeric version components that follow `soname` in `name`, or `None`
    // when `name` is not a `soname.<n>[.<n>…]` variant.
    fn version_suffix(name: &str, soname: &str) -> Option<Vec<u64>> {
        name.strip_prefix(soname)?
            .strip_prefix('.')?
            .split('.')
            .map(|part| {
                // Reject empty parts and anything `parse` would tolerate
                // beyond plain digits (e.g. a leading `+`).
                if part.is_empty() || !part.bytes().all(|b| b.is_ascii_digit()) {
                    return None;
                }
                part.parse::<u64>().ok()
            })
            .collect()
    }

    for dir in dirs {
        let exact = dir.join(soname);
        if exact.exists() {
            return Some(exact);
        }
        let Ok(entries) = std::fs::read_dir(dir) else {
            continue;
        };
        // e.g. libcudart.so.12 → match libcudart.so.12.8.90
        let newest = entries
            .flatten()
            .filter_map(|entry| {
                let name = entry.file_name();
                let version = version_suffix(name.to_str()?, soname)?;
                Some((version, entry.path()))
            })
            .max();
        if let Some((_, path)) = newest {
            return Some(path);
        }
    }
    None
}

/// `dlopen` `path` with `RTLD_NOW | RTLD_GLOBAL` and keep it loaded forever.
///
/// Returns `true` if the library opened. On success the handle is deliberately
/// leaked so the library is never unloaded and its symbols stay globally
/// visible for libs loaded later. On failure the error is discarded and
/// `false` is returned.
fn dlopen_global(path: &Path) -> bool {
    use libloading::os::unix::{Library, RTLD_GLOBAL, RTLD_NOW};

    let flags = RTLD_NOW | RTLD_GLOBAL;
    // SAFETY: loading a CUDA shared lib from a discovered path; the handle is
    // leaked so it stays resident for the provider to resolve against.
    let opened = unsafe { Library::open(Some(path), flags) };
    // Forgetting the handle skips its destructor, i.e. no `dlclose`.
    opened.map(std::mem::forget).is_ok()
}