nornir 0.4.42

Companion to cargo: dependency tracking, release gating, deploy, benchmarks, and documentation assembly. Project-agnostic.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
//! Make the ort CUDA execution provider "just work" without a manual
//! `LD_LIBRARY_PATH`.
//!
//! The onnxruntime CUDA provider (`libonnxruntime_providers_cuda.so`) is
//! `dlopen`ed lazily when the session is built, and it `NEEDS` the CUDA
//! runtime libs (`libcudart`, `libcublas`, `libcudnn`, …). glibc fixes the
//! loader search path at process start, so setting `LD_LIBRARY_PATH` from
//! inside the process is ignored. The robust fix is to **`dlopen` the CUDA
//! libs ourselves with `RTLD_GLOBAL`** before the provider loads — then its
//! `NEEDED` sonames resolve against the already-loaded globals.
//!
//! Discovery order (first dir that contains a given lib wins):
//!   1. `NORNIR_CUDA_LIBS` — explicit colon-separated dirs (highest priority).
//!   2. Active Python env: `$VIRTUAL_ENV` / `$CONDA_PREFIX` →
//!      `…/site-packages/nvidia/*/lib` (pip `nvidia-*-cu12` wheels).
//!   3. `NORNIR_CUDA_SCAN_ROOTS` — colon-separated roots scanned for
//!      `**/site-packages/nvidia/*/lib` (one level of venvs).
//!   4. Known system dirs: nornir's own `/opt/nornir/cuda` first, then ollama
//!      bundles, `/usr/local/cuda*/lib64`, etc.
//!
//! Best-effort: anything missing just means ort falls back to CPU (no panic).
//!
//! Cargo feature: `embed-ort`.

use std::path::{Path, PathBuf};
use std::sync::OnceLock;

/// Leaf-first load order: a lib's own `NEEDED` CUDA deps must already be
/// global when it loads.
///
/// cuDNN is deliberately *not* listed here: `run` loads it (sub-libs first,
/// then the `libcudnn.so.9` umbrella) in a separate step after this list has
/// been processed, so it always comes last.
///
/// Each entry is the major-versioned soname that is looked up via `find_lib`
/// in the candidate directories.
const ORDERED_SONAMES: &[&str] = &[
    "libcudart.so.12",
    "libnvJitLink.so.12",
    "libcublasLt.so.12",
    "libcublas.so.12",
    "libcufft.so.11",
    "libcurand.so.10",
    "libcusparse.so.12",
    "libcusolver.so.11",
];

/// Outcome of a preload attempt (for logging / diagnostics).
///
/// Produced once by `run` and cached for the process lifetime; callers obtain
/// it through [`ensure`].
#[derive(Debug, Default, Clone)]
pub struct CudaPreload {
    /// Sonames successfully `dlopen`ed (kept resident for the process life).
    /// Holds entries of `ORDERED_SONAMES` plus `libcudnn.so.9` when the cuDNN
    /// umbrella loaded; individual cuDNN sub-libs are not recorded.
    pub loaded: Vec<String>,
    /// Directories that were searched (the result of `candidate_dirs`, i.e.
    /// only directories that existed at discovery time).
    pub dirs: Vec<PathBuf>,
    /// True if a cuDNN lib was found and loaded.
    pub cudnn: bool,
}

// Process-wide cache of the one-and-only preload attempt; filled lazily by
// `ensure`. Loaded libraries are leaked on purpose: they must stay resident
// for the whole process so the provider can resolve against them.
static PRELOAD: OnceLock<CudaPreload> = OnceLock::new();

/// Discover + `dlopen` CUDA libs once. Idempotent; safe to call before every
/// session build.
///
/// The first call executes `run` and stores its report in `PRELOAD`; every
/// later call returns that same cached report without touching the filesystem
/// or the loader again.
pub fn ensure() -> &'static CudaPreload {
    // `OnceLock::get_or_init` runs the initializer at most once.
    PRELOAD.get_or_init(run)
}

/// Locate a usable `libonnxruntime.so` for the **dynamic-load** ort backend (#9).
///
/// With `ort/load-dynamic` nothing is linked at build time; ort `dlopen`s the
/// runtime from `$ORT_DYLIB_PATH` (or the bare soname). The runtime selector
/// uses the result to decide whether the ort/GPU path is usable at all:
/// `Some(path)` means onnxruntime is present and loadable, `None` means the
/// caller should fall back to the pure-Rust tract CPU embedder.
///
/// Candidates are tried in this order, and each one must pass [`loadable`]
/// before it is returned (a stale or broken file is rejected rather than
/// handed to ort):
///
///   1. `$ORT_DYLIB_PATH`, if set and non-empty.
///   2. The first `libonnxruntime.so` found by [`find_lib`] in the CUDA
///      discovery dirs ([`candidate_dirs`]) — so a runtime dropped next to the
///      CUDA libs in `/opt/nornir/cuda` is picked up.
///   3. The bare soname `libonnxruntime.so`, resolved by the system loader.
pub fn onnxruntime_dylib() -> Option<PathBuf> {
    std::env::var_os("ORT_DYLIB_PATH")
        .map(PathBuf::from)
        // Explicit override: honoured only when it is non-empty and loads.
        .filter(|explicit| !explicit.as_os_str().is_empty() && loadable(explicit))
        // Colocated runtime in one of the CUDA discovery directories.
        .or_else(|| {
            find_lib(&candidate_dirs(), "libonnxruntime.so").filter(|colocated| loadable(colocated))
        })
        // Last resort: let the system loader resolve the bare soname.
        .or_else(|| {
            let bare = PathBuf::from("libonnxruntime.so");
            loadable(&bare).then_some(bare)
        })
}

/// Check that `path` is a real onnxruntime shared object.
///
/// Returns `true` only when the file can be `dlopen`ed *and* exports the
/// `OrtGetApiBase` symbol, which filters out same-named decoys. A bare soname
/// is resolved by the system loader. The probe handle is dropped before
/// returning; ort loads (and keeps) its own copy later.
fn loadable(path: &Path) -> bool {
    use libloading::os::unix::{Library, RTLD_NOW};

    // Signature used only to type the symbol lookup; the function is never called.
    type ApiBaseFn = unsafe extern "C" fn() -> *const std::ffi::c_void;

    // SAFETY: probing a shared object; the handle is dropped at the end of the
    // match arm below.
    match unsafe { Library::open(Some(path.as_os_str()), RTLD_NOW) } {
        Err(_) => false,
        Ok(handle) => {
            // SAFETY: looking up a symbol on a live handle; only its existence
            // is tested.
            let probe = unsafe { handle.get::<ApiBaseFn>(b"OrtGetApiBase") };
            probe.is_ok()
        }
    }
}

/// Probe + arm the dynamic-load ort path (#9).
///
/// If [`onnxruntime_dylib`] finds a usable onnxruntime, `ORT_DYLIB_PATH` is set
/// to the location that was just verified (so ort's loader picks exactly that
/// file) and `true` is returned. Otherwise the environment is left untouched
/// and `false` is returned — the caller must then use the tract CPU embedder
/// and never touch ort (whose `api()` would panic if the dylib is absent).
///
/// The variable is exported for *every* successful probe, not only for
/// absolute paths: if discovery fell through to the bare soname because a
/// pre-existing `ORT_DYLIB_PATH` was stale, leaving that stale value in place
/// would make ort load the broken file even though this function reported
/// success. A relative path that has a directory part is made absolute first,
/// so ort resolves the same file the probe loaded; a bare soname is exported
/// as-is and stays subject to the system loader's search.
///
/// Idempotent and cheap to call from the selector.
pub fn arm_onnxruntime() -> bool {
    let Some(found) = onnxruntime_dylib() else {
        return false;
    };

    // A single-component path is a bare soname for the system loader and must
    // not be turned into a filesystem path.
    let is_bare_soname = found.components().count() <= 1;
    let verified = if found.is_absolute() || is_bare_soname {
        found
    } else {
        // Best-effort: keep the relative form if it cannot be resolved.
        std::fs::canonicalize(&found).unwrap_or(found)
    };

    // NOTE: mutating the environment is only sound while no other thread reads
    // it; this is expected to run at the single-threaded selection point,
    // before any ort use.
    std::env::set_var("ORT_DYLIB_PATH", &verified);
    true
}

/// Perform the actual discovery + preload; called once through [`ensure`].
///
/// Steps:
///   1. Collect the candidate directories.
///   2. For each soname in `ORDERED_SONAMES` (leaf-first), locate it with
///      `find_lib` and `dlopen` it globally; successes are recorded in
///      `loaded`.
///   3. cuDNN 9 is split into sub-libs: in the first directory that contains
///      `libcudnn.so.9`, load every `libcudnn_*` sub-lib (sorted by path, so
///      the order is deterministic) and only then the umbrella itself.
///
/// Everything is best-effort: a lib that is missing or fails to load is simply
/// not recorded.
fn run() -> CudaPreload {
    const CUDNN_UMBRELLA: &str = "libcudnn.so.9";

    let dirs = candidate_dirs();

    let mut loaded = Vec::new();
    for soname in ORDERED_SONAMES {
        if let Some(path) = find_lib(&dirs, soname) {
            if dlopen_global(&path) {
                loaded.push(soname.to_string());
            }
        }
    }

    let mut cudnn = false;
    if let Some(dir) = dirs.iter().find(|d| d.join(CUDNN_UMBRELLA).exists()) {
        let mut subs: Vec<PathBuf> = std::fs::read_dir(dir)
            .into_iter()
            .flatten()
            .flatten()
            .map(|entry| entry.path())
            .filter(|p| {
                p.file_name().and_then(|n| n.to_str()).is_some_and(|n| {
                    // Exclude every spelling of the umbrella (`libcudnn.so.9`
                    // and fully-versioned `libcudnn.so.9.x.y`): it must load
                    // after the sub-libs, not among them.
                    n.starts_with("libcudnn")
                        && n.contains(".so.")
                        && !n.starts_with("libcudnn.so")
                })
            })
            .collect();
        subs.sort();
        for sub in &subs {
            // Sub-lib failures are tolerated; the umbrella load below decides.
            dlopen_global(sub);
        }
        if dlopen_global(&dir.join(CUDNN_UMBRELLA)) {
            loaded.push(CUDNN_UMBRELLA.to_string());
            cudnn = true;
        }
    }

    CudaPreload { loaded, dirs, cudnn }
}

/// Report whether the NVIDIA **userspace driver** (`libcuda.so.1`) can be loaded.
///
/// That library is installed by the GPU driver package alongside the kernel
/// module. nornir can **detect** it but **cannot install or embed it**: the
/// kernel driver needs root, a module matching the running kernel, and
/// (usually) a reboot — it is not a redistributable `.so` that could be bundled
/// in a binary. Only the *runtime* libs (cudart/cublas/cuDNN) are
/// redistributable and discoverable via [`ensure`].
pub fn driver_present() -> bool {
    use libloading::os::unix::{Library, RTLD_NOW};

    let soname = std::ffi::OsStr::new("libcuda.so.1");
    // SAFETY: probing a well-known system soname; the handle is dropped as
    // soon as the match arm finishes.
    match unsafe { Library::open(Some(soname), RTLD_NOW) } {
        Ok(_handle) => true,
        Err(_) => false,
    }
}

/// Render a presence flag for the preflight report: `"found"` or `"MISSING"`.
fn yn(b: bool) -> &'static str {
    match b {
        true => "found",
        false => "MISSING",
    }
}

/// Preflight the CUDA GPU path and describe the result.
///
/// Returns `(gpu_ready, human_report)`. The check covers the NVIDIA driver
/// ([`driver_present`]), the runtime libs resolved by [`ensure`], and cuDNN;
/// the report ends with concrete advice for whatever is missing. `gpu_ready`
/// is true iff embed-ort should run on the GPU, i.e. driver, cudart and cuDNN
/// are all present. Drives `nornir vector doctor`.
pub fn preflight() -> (bool, String) {
    use std::fmt::Write as _;

    let report = ensure();
    let has_driver = driver_present();
    let has_cudart = report.loaded.iter().any(|lib| lib.starts_with("libcudart"));
    let has_cudnn = report.cudnn;
    let gpu_ready = has_driver && has_cudart && has_cudnn;

    let loaded_summary = if report.loaded.is_empty() {
        String::from("(none)")
    } else {
        report.loaded.join(", ")
    };
    let searched: Vec<String> = report.dirs.iter().map(|dir| dir.display().to_string()).collect();
    // Ordered runtime sonames that did not end up in the loaded set.
    let not_loaded: Vec<&str> = ORDERED_SONAMES
        .iter()
        .copied()
        .filter(|wanted| report.loaded.iter().all(|have| have != wanted))
        .collect();
    let verdict = if gpu_ready { "READY ✓" } else { "unavailable → CPU fallback" };

    // Writing into a `String` cannot fail, so the `fmt::Result`s are discarded.
    let mut text = String::from("nornir CUDA preflight (embed-ort GPU path)\n");
    let _ = writeln!(text, "  NVIDIA driver libcuda.so.1 : {}", yn(has_driver));
    let _ = writeln!(text, "  libcudart (CUDA runtime)   : {}", yn(has_cudart));
    let _ = writeln!(text, "  cuDNN 9                    : {}", yn(has_cudnn));
    let _ = writeln!(text, "  CUDA libs loaded           : {}", loaded_summary);
    let _ = writeln!(text, "  dirs searched              : {}", searched.join(", "));
    if !not_loaded.is_empty() {
        let _ = writeln!(text, "  runtime libs not found     : {}", not_loaded.join(", "));
    }
    let _ = writeln!(text, "\n  verdict: GPU embedding {}", verdict);

    // Advice section: one hint per missing prerequisite.
    if !has_driver {
        text.push_str(
            "  → install the NVIDIA GPU driver for your distro (e.g. `sudo apt install \
             nvidia-driver-XXX`, `sudo dnf install akmod-nvidia`, or NVIDIA's .run). nornir \
             can't install the kernel driver — it needs root + a matching kernel module + reboot.\n",
        );
    }
    if has_driver && !(has_cudnn && has_cudart) {
        text.push_str(
            "  → CUDA runtime libs incomplete. Put a matched CUDA-12 / cuDNN-9 .so set \
             (libcudart.so.12, libcublas*, libcudnn.so.9 + its sub-libs) in ONE dir and set \
             NORNIR_CUDA_LIBS to it (e.g. /opt/nornir/cuda). See `.nornir/vector.md`.\n",
        );
    }
    if gpu_ready {
        text.push_str("  → all set; embed-ort runs on the GPU.\n");
    }

    (gpu_ready, text)
}

/// `nornir vector setup-cuda` — pin a complete CUDA runtime set into `target`
/// (conventionally `/opt/nornir/cuda`, the built-in search dir) by **copying**
/// the libs from wherever discovery finds them on this box (pip `nvidia-*`
/// wheels in a venv, an ollama bundle, a system toolkit). After this, the GPU
/// works for every nornir process with no env — and survives the source venv
/// being deleted.
///
/// For each entry of `ORDERED_SONAMES` the discovered file is copied under its
/// own file name and, best-effort, also under the plain `.so.N` soname the
/// loader asks for. cuDNN is copied as a whole: every `libcudnn*.so*` file
/// from the first directory that contains `libcudnn.so.9`.
///
/// A library that is discovered *inside `target` itself* (re-running setup
/// after the original source is gone) is left in place and counted as copied.
/// It must not be copied onto itself: `std::fs::copy` truncates the
/// destination before reading, which would wipe the file.
///
/// Returns `(copied, missing)` soname lists. Downloading from NVIDIA's redist
/// CDN (for boxes with no local source) is a planned follow-up.
///
/// # Errors
///
/// Fails if `target` cannot be created or a library cannot be copied into it
/// (typically a permissions problem — try `sudo`).
pub fn setup(target: &Path) -> anyhow::Result<(Vec<String>, Vec<String>)> {
    use anyhow::Context;

    // True when both paths exist and resolve (through symlinks) to one file.
    fn same_file(a: &Path, b: &Path) -> bool {
        match (std::fs::canonicalize(a), std::fs::canonicalize(b)) {
            (Ok(a), Ok(b)) => a == b,
            _ => false,
        }
    }

    let dirs = candidate_dirs();
    std::fs::create_dir_all(target)
        .with_context(|| format!("create {} (need root? try sudo)", target.display()))?;
    let mut copied = Vec::new();
    let mut missing = Vec::new();

    // The ordered runtime set.
    for soname in ORDERED_SONAMES {
        let Some(src) = find_lib(&dirs, soname) else {
            missing.push(soname.to_string());
            continue;
        };
        let dst = target.join(src.file_name().unwrap_or_default());
        if !same_file(&src, &dst) {
            std::fs::copy(&src, &dst)
                .with_context(|| format!("copy {} -> {}", src.display(), dst.display()))?;
        }
        // The plain `.so.N` name the loader asks for, alongside any
        // fully-versioned file name we copied. Best-effort by design.
        let alias = target.join(soname);
        if !alias.exists() {
            std::fs::copy(&src, &alias).ok();
        }
        copied.push(soname.to_string());
    }

    // cuDNN: umbrella + sub-libs, all from the same directory.
    if let Some(dir) = dirs.iter().find(|d| d.join("libcudnn.so.9").exists()) {
        let mut n = 0usize;
        for entry in std::fs::read_dir(dir).into_iter().flatten().flatten() {
            let file_name = entry.file_name();
            let name = file_name.to_string_lossy();
            if name.starts_with("libcudnn") && name.contains(".so") {
                let src = entry.path();
                let dst = target.join(&file_name);
                if !same_file(&src, &dst) {
                    std::fs::copy(&src, &dst)
                        .with_context(|| format!("copy {}", src.display()))?;
                }
                n += 1;
            }
        }
        copied.push(format!("libcudnn.so.9 (+{n} files)"));
    } else {
        missing.push("libcudnn.so.9".to_string());
    }
    Ok((copied, missing))
}

/// Build the ordered, de-duplicated list of directories to search for CUDA libs.
///
/// Only directories that exist are kept, and earlier sources take priority:
///   1. `NORNIR_CUDA_LIBS` (explicit, path-list separated).
///   2. pip `nvidia-*` wheels of the active Python env (`VIRTUAL_ENV`, then
///      `CONDA_PREFIX`).
///   3. `NORNIR_CUDA_SCAN_ROOTS`, each root scanned for venvs one level deep.
///   4. Known system locations, nornir's own `/opt/nornir/cuda` first.
fn candidate_dirs() -> Vec<PathBuf> {
    // Append `dir` when it is an existing directory that is not listed yet.
    fn remember(found: &mut Vec<PathBuf>, dir: PathBuf) {
        if dir.is_dir() && !found.contains(&dir) {
            found.push(dir);
        }
    }

    // nornir's own conventional CUDA dir comes first (drop a matched CUDA-12 /
    // cuDNN-9 .so set there and the GPU "just works", no env), then ollama
    // bundles and a system CUDA toolkit.
    const SYSTEM_DIRS: [&str; 7] = [
        "/opt/nornir/cuda",
        "/usr/local/lib/ollama/cuda_v12",
        "/usr/local/lib/ollama/cuda_v13",
        "/usr/local/cuda/lib64",
        "/usr/local/cuda-12/lib64",
        "/opt/cuda/lib64",
        "/usr/lib/x86_64-linux-gnu",
    ];

    let mut found = Vec::new();

    // 1. Explicit override.
    if let Some(list) = std::env::var_os("NORNIR_CUDA_LIBS") {
        std::env::split_paths(&list).for_each(|dir| remember(&mut found, dir));
    }

    // 2. Active Python env → pip nvidia wheels.
    for var in ["VIRTUAL_ENV", "CONDA_PREFIX"] {
        let Some(prefix) = std::env::var_os(var) else {
            continue;
        };
        for dir in nvidia_pkg_dirs(Path::new(&prefix)) {
            remember(&mut found, dir);
        }
    }

    // 3. Extra scan roots (each scanned for venvs one level deep).
    if let Some(list) = std::env::var_os("NORNIR_CUDA_SCAN_ROOTS") {
        for dir in std::env::split_paths(&list).flat_map(|root| scan_root_for_nvidia(&root)) {
            remember(&mut found, dir);
        }
    }

    // 4. Known system locations.
    for dir in SYSTEM_DIRS {
        remember(&mut found, PathBuf::from(dir));
    }

    found
}

/// List the pip `nvidia-*` wheel library dirs under a venv/conda `root`.
///
/// Looks at `<root>/{lib,lib64}/<any entry>/site-packages/nvidia/<pkg>/lib`
/// and returns every such path that is an existing directory. Unreadable or
/// missing directories along the way are skipped silently.
fn nvidia_pkg_dirs(root: &Path) -> Vec<PathBuf> {
    ["lib", "lib64"]
        .iter()
        // Interpreter directories (e.g. `python3.x`) below each lib root.
        .filter_map(|libname| std::fs::read_dir(root.join(libname)).ok())
        .flat_map(|interpreters| interpreters.flatten())
        // The `nvidia` namespace package inside each interpreter's site-packages.
        .filter_map(|interpreter| {
            std::fs::read_dir(interpreter.path().join("site-packages/nvidia")).ok()
        })
        .flat_map(|packages| packages.flatten())
        // Keep only packages that actually ship a `lib` directory.
        .map(|package| package.path().join("lib"))
        .filter(|lib| lib.is_dir())
        .collect()
}

/// Collect nvidia wheel lib dirs for `root` itself and for each of its
/// immediate sub-directories, treating every one as a candidate venv root.
///
/// Results for `root` come first, followed by those of its children in
/// directory-listing order. An unreadable `root` yields only its own results.
fn scan_root_for_nvidia(root: &Path) -> Vec<PathBuf> {
    let mut found = nvidia_pkg_dirs(root);
    let Ok(children) = std::fs::read_dir(root) else {
        return found;
    };
    let venv_roots = children
        .flatten()
        .map(|entry| entry.path())
        .filter(|path| path.is_dir());
    for venv_root in venv_roots {
        found.extend(nvidia_pkg_dirs(&venv_root));
    }
    found
}

/// Locate `soname` in `dirs`; the first directory that has a match wins.
///
/// Within a directory the lookup is:
///   1. the exact name `soname`, if it exists;
///   2. otherwise a versioned variant `soname.<n>[.<n>…]` (e.g.
///      `libcudart.so.12` → `libcudart.so.12.8.90`). The tail must consist of
///      dot-separated decimal numbers, so look-alikes such as
///      `libcudart.so.123` or `libcudart.so.12.bak` are not mistaken for the
///      library. When several variants exist the highest version is chosen,
///      making the result independent of directory-listing order.
///
/// Entries that do not resolve to an existing file (dangling symlinks) are
/// skipped. Returns `None` if no directory contains a match.
fn find_lib(dirs: &[PathBuf], soname: &str) -> Option<PathBuf> {
    // Parse the `.<n>[.<n>…]` tail that follows the soname in a versioned
    // file name; `None` if the tail is empty or not purely numeric.
    fn version_tail(rest: &str) -> Option<Vec<u64>> {
        rest.strip_prefix('.')?
            .split('.')
            .map(|part| {
                if part.is_empty() || !part.bytes().all(|b| b.is_ascii_digit()) {
                    None
                } else {
                    part.parse::<u64>().ok()
                }
            })
            .collect()
    }

    for dir in dirs {
        let exact = dir.join(soname);
        if exact.exists() {
            return Some(exact);
        }
        let Ok(entries) = std::fs::read_dir(dir) else {
            continue;
        };
        let best = entries
            .flatten()
            .filter_map(|entry| {
                let name = entry.file_name();
                let version = version_tail(name.to_str()?.strip_prefix(soname)?)?;
                let path = entry.path();
                path.exists().then_some((version, path))
            })
            // Highest version first; the path breaks ties deterministically.
            .max()
            .map(|(_, path)| path);
        if best.is_some() {
            return best;
        }
    }
    None
}

/// `dlopen` `path` with `RTLD_NOW | RTLD_GLOBAL` and keep it loaded forever.
///
/// Returns `true` if the library loaded. On success the handle is leaked on
/// purpose so the library is never unloaded and later-loaded objects can
/// resolve against it.
fn dlopen_global(path: &Path) -> bool {
    use libloading::os::unix::{Library, RTLD_GLOBAL, RTLD_NOW};

    let flags = RTLD_NOW | RTLD_GLOBAL;
    // SAFETY: loading a CUDA shared lib from a discovered path; the handle is
    // leaked below so it stays resident for the provider to resolve against.
    let Ok(handle) = (unsafe { Library::open(Some(path), flags) }) else {
        return false;
    };
    // Dropping the handle would close the library again; forget it instead.
    std::mem::forget(handle);
    true
}