nornir 0.4.20

Companion to cargo: dependency tracking, release gating, deploy, benchmarks, and documentation assembly. Project-agnostic.
Documentation
//! Make the ort CUDA execution provider "just work" without a manual
//! `LD_LIBRARY_PATH`.
//!
//! The onnxruntime CUDA provider (`libonnxruntime_providers_cuda.so`) is
//! `dlopen`ed lazily when the session is built, and it `NEEDS` the CUDA
//! runtime libs (`libcudart`, `libcublas`, `libcudnn`, …). glibc fixes the
//! loader search path at process start, so setting `LD_LIBRARY_PATH` from
//! inside the process is ignored. The robust fix is to **`dlopen` the CUDA
//! libs ourselves with `RTLD_GLOBAL`** before the provider loads — then its
//! `NEEDED` sonames resolve against the already-loaded globals.
//!
//! Discovery order (first dir that contains a given lib wins):
//!   1. `NORNIR_CUDA_LIBS` — explicit colon-separated dirs (highest priority).
//!   2. Active Python env: `$VIRTUAL_ENV` / `$CONDA_PREFIX` →
//!      `…/site-packages/nvidia/*/lib` (pip `nvidia-*-cu12` wheels).
//!   3. `NORNIR_CUDA_SCAN_ROOTS` — colon-separated roots scanned for
//!      `**/site-packages/nvidia/*/lib` (one level of venvs).
//!   4. Known system dirs: nornir's own `/opt/nornir/cuda` first, then ollama
//!      bundles, `/usr/local/cuda/lib64`, `/usr/local/cuda-12/lib64`, etc.
//!
//! Best-effort: anything missing just means ort falls back to CPU (no panic).
//!
//! Cargo feature: `embed-ort`.

use std::path::{Path, PathBuf};
use std::sync::OnceLock;

/// CUDA runtime sonames in leaf-first load order: a lib's own `NEEDED` CUDA
/// deps must already be global when it loads, so the list is walked top to
/// bottom (e.g. `libcublasLt` is listed before `libcublas`).
///
/// cuDNN is intentionally not part of this list: `run` loads the `libcudnn*`
/// sub-libs and the `libcudnn.so.9` umbrella in a separate step afterwards.
const ORDERED_SONAMES: &[&str] = &[
    "libcudart.so.12",
    "libnvJitLink.so.12",
    "libcublasLt.so.12",
    "libcublas.so.12",
    "libcufft.so.11",
    "libcurand.so.10",
    "libcusparse.so.12",
    "libcusolver.so.11",
];

/// Outcome of a preload attempt (for logging / diagnostics).
///
/// Produced once by `run` and cached for the process; `preflight` renders it
/// into a human-readable report.
#[derive(Debug, Default, Clone)]
pub struct CudaPreload {
    /// Sonames successfully `dlopen`ed (kept resident for the process life):
    /// entries of `ORDERED_SONAMES`, plus `libcudnn.so.9` when the cuDNN
    /// umbrella loaded. The cuDNN sub-libs are loaded too but not recorded here.
    pub loaded: Vec<String>,
    /// Directories that were searched, in priority order. Only directories
    /// that existed at discovery time are listed.
    pub dirs: Vec<PathBuf>,
    /// True if the cuDNN umbrella lib (`libcudnn.so.9`) was found and loaded.
    pub cudnn: bool,
}

// Process-wide cache of the one-time preload outcome; filled by `ensure` on
// first use. The loaded libraries themselves are leaked on purpose (see
// `dlopen_global`): they must stay resident for the whole process so the
// provider can resolve against them.
static PRELOAD: OnceLock<CudaPreload> = OnceLock::new();

/// Discover + `dlopen` CUDA libs once. Idempotent; safe to call before every
/// session build.
pub fn ensure() -> &'static CudaPreload {
    PRELOAD.get_or_init(run)
}

/// Perform the actual discovery and preload: `dlopen` every lib from
/// `ORDERED_SONAMES` that can be found, then the cuDNN family, and report what
/// ended up loaded. Nothing here fails hard; a lib that is missing or refuses
/// to load is simply absent from the result.
fn run() -> CudaPreload {
    let search_dirs = candidate_dirs();

    // Plain runtime libs, walked in dependency order. A soname is recorded only
    // when it was both located and successfully loaded.
    let mut loaded: Vec<String> = ORDERED_SONAMES
        .iter()
        .copied()
        .filter(|soname| find_lib(&search_dirs, soname).is_some_and(|lib| dlopen_global(&lib)))
        .map(String::from)
        .collect();

    // cuDNN 9 is split: its sub-libs (precompiled engines, ops, …) are loaded
    // before the umbrella `libcudnn.so.9`, all taken from the same directory.
    let mut cudnn = false;
    let cudnn_dir = search_dirs
        .iter()
        .find(|d| d.join("libcudnn.so.9").exists());
    if let Some(dir) = cudnn_dir {
        let mut sub_libs: Vec<PathBuf> = Vec::new();
        if let Ok(entries) = std::fs::read_dir(dir) {
            for entry in entries.flatten() {
                let path = entry.path();
                let wanted = match path.file_name().and_then(|n| n.to_str()) {
                    Some(name) => {
                        name.starts_with("libcudnn")
                            && name.contains(".so.")
                            && !name.ends_with("libcudnn.so.9")
                    }
                    None => false,
                };
                if wanted {
                    sub_libs.push(path);
                }
            }
        }
        // Sorted so the load order does not depend on directory iteration order.
        sub_libs.sort();
        for lib in &sub_libs {
            // Best-effort: a sub-lib that fails to load is not fatal.
            dlopen_global(lib);
        }

        let umbrella = dir.join("libcudnn.so.9");
        if dlopen_global(&umbrella) {
            loaded.push("libcudnn.so.9".to_string());
            cudnn = true;
        }
    }

    CudaPreload {
        loaded,
        dirs: search_dirs,
        cudnn,
    }
}

/// Report whether the NVIDIA **userspace driver** (`libcuda.so.1`) can be
/// loaded — the lib the GPU driver package installs alongside the kernel module.
///
/// nornir can only **detect** it; it **cannot install or embed it**: the kernel
/// driver needs root, a module matching the running kernel, and (usually) a
/// reboot, so it is not a redistributable `.so` you can bundle in a binary. Only
/// the *runtime* libs (cudart/cublas/cuDNN) are redistributable and discoverable
/// via [`ensure`].
pub fn driver_present() -> bool {
    use libloading::os::unix::{Library, RTLD_NOW};
    let soname = std::ffi::OsStr::new("libcuda.so.1");
    // SAFETY: probing a well-known system soname; the handle (if any) is not
    // kept and is dropped when this function returns.
    let probe = unsafe { Library::open(Some(soname), RTLD_NOW) };
    probe.is_ok()
}

/// Render a presence flag for the preflight report: `"found"` when `b` is
/// true, `"MISSING"` otherwise.
fn yn(b: bool) -> &'static str {
    match b {
        true => "found",
        false => "MISSING",
    }
}

/// Preflight the CUDA GPU path and return `(gpu_ready, human_report)`.
///
/// Looks at three things — the NVIDIA driver, the runtime libs [`ensure`]
/// managed to load, and cuDNN — and renders them into a multi-line report that
/// ends with concrete advice for whatever is missing. `gpu_ready` is true iff
/// embed-ort should run on the GPU (driver + cudart + cuDNN all present).
/// Drives `nornir vector doctor`.
pub fn preflight() -> (bool, String) {
    let preload = ensure();
    let has_driver = driver_present();
    let has_cudart = preload
        .loaded
        .iter()
        .any(|name| name.starts_with("libcudart"));
    let has_cudnn = preload.cudnn;
    let gpu_ready = has_driver && has_cudart && has_cudnn;

    let loaded_list = if preload.loaded.is_empty() {
        String::from("(none)")
    } else {
        preload.loaded.join(", ")
    };
    let searched = preload
        .dirs
        .iter()
        .map(|d| d.display().to_string())
        .collect::<Vec<_>>()
        .join(", ");
    // Runtime sonames that were expected but never made it into `loaded`.
    let not_found: Vec<&str> = ORDERED_SONAMES
        .iter()
        .copied()
        .filter(|want| preload.loaded.iter().all(|have| have != want))
        .collect();

    // Each part carries its own trailing newline; they are concatenated as-is.
    let mut parts: Vec<String> = vec![
        String::from("nornir CUDA preflight (embed-ort GPU path)\n"),
        format!("  NVIDIA driver libcuda.so.1 : {}\n", yn(has_driver)),
        format!("  libcudart (CUDA runtime)   : {}\n", yn(has_cudart)),
        format!("  cuDNN 9                    : {}\n", yn(has_cudnn)),
        format!("  CUDA libs loaded           : {}\n", loaded_list),
        format!("  dirs searched              : {}\n", searched),
    ];
    if !not_found.is_empty() {
        parts.push(format!(
            "  runtime libs not found     : {}\n",
            not_found.join(", ")
        ));
    }

    let verdict = if gpu_ready {
        "READY ✓"
    } else {
        "unavailable → CPU fallback"
    };
    parts.push(format!("\n  verdict: GPU embedding {}\n", verdict));

    // Exactly one advice line applies: a missing driver takes precedence over
    // incomplete runtime libs, and with neither problem the GPU path is ready.
    let advice: &str = if !has_driver {
        "  → install the NVIDIA GPU driver for your distro (e.g. `sudo apt install \
         nvidia-driver-XXX`, `sudo dnf install akmod-nvidia`, or NVIDIA's .run). nornir \
         can't install the kernel driver — it needs root + a matching kernel module + reboot.\n"
    } else if !(has_cudnn && has_cudart) {
        "  → CUDA runtime libs incomplete. Put a matched CUDA-12 / cuDNN-9 .so set \
         (libcudart.so.12, libcublas*, libcudnn.so.9 + its sub-libs) in ONE dir and set \
         NORNIR_CUDA_LIBS to it (e.g. /opt/nornir/cuda). See `.nornir/vector.md`.\n"
    } else {
        "  → all set; embed-ort runs on the GPU.\n"
    };
    parts.push(advice.to_string());

    (gpu_ready, parts.concat())
}

/// `nornir vector setup-cuda` — pin a complete CUDA runtime set into
/// `/opt/nornir/cuda` (the built-in search dir) by **copying** the libs from
/// wherever discovery finds them on this box (pip `nvidia-*` wheels in a venv,
/// an ollama bundle, a system toolkit). After this, the GPU works for every
/// nornir process with no env — and survives the source venv being deleted.
/// Returns `(copied, missing)` soname lists. Downloading from NVIDIA's redist
/// CDN (for boxes with no local source) is a planned follow-up.
pub fn setup(target: &Path) -> anyhow::Result<(Vec<String>, Vec<String>)> {
    use anyhow::Context;
    let dirs = candidate_dirs();
    std::fs::create_dir_all(target)
        .with_context(|| format!("create {} (need root? try sudo)", target.display()))?;
    let mut copied = Vec::new();
    let mut missing = Vec::new();

    // The ordered runtime set + cuDNN (umbrella + sub-libs from the same dir).
    for soname in ORDERED_SONAMES {
        match find_lib(&dirs, soname) {
            Some(src) => {
                let dst = target.join(src.file_name().unwrap_or_default());
                std::fs::copy(&src, &dst)
                    .with_context(|| format!("copy {} -> {}", src.display(), dst.display()))?;
                // The plain `.so.N` name the loader asks for, alongside any
                // fully-versioned file name we copied.
                let alias = target.join(soname);
                if !alias.exists() {
                    std::fs::copy(&src, &alias).ok();
                }
                copied.push(soname.to_string());
            }
            None => missing.push(soname.to_string()),
        }
    }
    if let Some(dir) = dirs.iter().find(|d| d.join("libcudnn.so.9").exists()) {
        let mut n = 0usize;
        for e in std::fs::read_dir(dir).into_iter().flatten().flatten() {
            let name = e.file_name();
            let name = name.to_string_lossy();
            if name.starts_with("libcudnn") && name.contains(".so") {
                std::fs::copy(e.path(), target.join(e.file_name()))
                    .with_context(|| format!("copy {}", e.path().display()))?;
                n += 1;
            }
        }
        copied.push(format!("libcudnn.so.9 (+{n} files)"));
    } else {
        missing.push("libcudnn.so.9".to_string());
    }
    Ok((copied, missing))
}

/// Build the ordered list of directories to search for CUDA libs.
///
/// Candidates are gathered from four sources in priority order, then reduced
/// to existing directories with duplicates removed (first occurrence wins).
fn candidate_dirs() -> Vec<PathBuf> {
    let mut candidates: Vec<PathBuf> = Vec::new();

    // 1. Explicit override.
    if let Some(list) = std::env::var_os("NORNIR_CUDA_LIBS") {
        candidates.extend(std::env::split_paths(&list));
    }

    // 2. Active Python env → pip nvidia wheels.
    for key in ["VIRTUAL_ENV", "CONDA_PREFIX"] {
        if let Some(root) = std::env::var_os(key) {
            candidates.extend(nvidia_pkg_dirs(Path::new(&root)));
        }
    }

    // 3. Extra scan roots (each scanned for venvs one level deep).
    if let Some(list) = std::env::var_os("NORNIR_CUDA_SCAN_ROOTS") {
        for root in std::env::split_paths(&list) {
            candidates.extend(scan_root_for_nvidia(&root));
        }
    }

    // 4. Known system locations: nornir's own conventional CUDA dir first (drop a
    //    matched CUDA-12 / cuDNN-9 .so set here and the GPU "just works", no env),
    //    then ollama bundles + a system CUDA toolkit.
    let system_dirs = [
        "/opt/nornir/cuda",
        "/usr/local/lib/ollama/cuda_v12",
        "/usr/local/lib/ollama/cuda_v13",
        "/usr/local/cuda/lib64",
        "/usr/local/cuda-12/lib64",
        "/opt/cuda/lib64",
        "/usr/lib/x86_64-linux-gnu",
    ];
    candidates.extend(system_dirs.iter().copied().map(PathBuf::from));

    // Keep only real directories, dropping repeats while preserving priority.
    let mut dirs: Vec<PathBuf> = Vec::with_capacity(candidates.len());
    for dir in candidates {
        if dir.is_dir() && !dirs.contains(&dir) {
            dirs.push(dir);
        }
    }
    dirs
}

/// Collect `<root>/{lib,lib64}/<entry>/site-packages/nvidia/<pkg>/lib` for a
/// venv/conda root (typically `<entry>` is a `python*` dir). Every entry under
/// `lib` / `lib64` is probed; only `lib` paths that are existing directories
/// are returned. Unreadable or absent levels are skipped silently.
fn nvidia_pkg_dirs(root: &Path) -> Vec<PathBuf> {
    ["lib", "lib64"]
        .iter()
        // <root>/lib, <root>/lib64 — whichever can be listed.
        .filter_map(|libname| std::fs::read_dir(root.join(libname)).ok())
        .flat_map(|interpreters| interpreters.flatten())
        // <entry>/site-packages/nvidia — the pip `nvidia-*` wheel namespace.
        .filter_map(|entry| std::fs::read_dir(entry.path().join("site-packages/nvidia")).ok())
        .flat_map(|packages| packages.flatten())
        .map(|package| package.path().join("lib"))
        .filter(|lib| lib.is_dir())
        .collect()
}

/// Treat `root` itself and each of its immediate subdirectories as a candidate
/// venv root, returning every nvidia package `lib` dir found beneath them.
/// Results for `root` come first, followed by those of its subdirectories.
fn scan_root_for_nvidia(root: &Path) -> Vec<PathBuf> {
    let mut found = nvidia_pkg_dirs(root);
    let Ok(children) = std::fs::read_dir(root) else {
        return found;
    };
    let subdirs = children
        .flatten()
        .map(|child| child.path())
        .filter(|path| path.is_dir());
    for venv in subdirs {
        found.extend(nvidia_pkg_dirs(&venv));
    }
    found
}

/// Locate `soname` in `dirs`, honouring their priority order.
///
/// Within each directory an exact `soname` entry wins; otherwise a
/// fully-versioned `soname.<minor>…` file is accepted (e.g. `libcudart.so.12`
/// → `libcudart.so.12.8.90`). The version suffix must start with a `.`, so
/// unrelated names that merely share the prefix (`libcudart.so.120`) are not
/// matched. When several versioned files qualify, the lexicographically
/// smallest path is chosen so the result does not depend on directory
/// iteration order.
///
/// Returns `None` when no directory contains a match; unreadable directories
/// are skipped.
fn find_lib(dirs: &[PathBuf], soname: &str) -> Option<PathBuf> {
    dirs.iter().find_map(|dir| {
        let exact = dir.join(soname);
        if exact.exists() {
            return Some(exact);
        }
        let entries = std::fs::read_dir(dir).ok()?;
        entries
            .flatten()
            .filter(|entry| {
                entry
                    .file_name()
                    .to_str()
                    .and_then(|name| name.strip_prefix(soname))
                    .is_some_and(|suffix| suffix.starts_with('.'))
            })
            .map(|entry| entry.path())
            .min()
    })
}

/// `dlopen` the library at `path` with `RTLD_NOW | RTLD_GLOBAL` and keep it
/// loaded for the rest of the process. Returns `true` on success, `false` if
/// the library could not be opened (the error itself is discarded).
fn dlopen_global(path: &Path) -> bool {
    use libloading::os::unix::{Library, RTLD_GLOBAL, RTLD_NOW};
    let flags = RTLD_NOW | RTLD_GLOBAL;
    // SAFETY: loading a CUDA shared lib by absolute path; the handle is leaked
    // so it stays resident for the provider to resolve against.
    let opened = unsafe { Library::open(Some(path), flags) };
    // Forgetting the handle skips its destructor, so the library is never closed.
    opened.map(std::mem::forget).is_ok()
}