flodl 0.5.2

floDl — a flow-graph deep learning framework built on libtorch
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
//! CUDA utility functions — device queries, memory stats, hardware probing.

use std::ffi::CStr;

use flodl_sys as ffi;

use super::{check_err, Device, DType, Result, Tensor, TensorError, TensorOptions, LIVE_TENSOR_COUNT};
use std::sync::atomic::Ordering;

/// Returns true if CUDA is available.
///
/// On Linux, this also ensures CUDA libraries are loaded (they can be
/// dropped by the linker's `--as-needed` flag since no Rust code
/// directly references symbols in `libtorch_cuda.so`).
pub fn cuda_available() -> bool {
    // flodl_force_cuda_link references c10::cuda::device_count(),
    // creating a real symbol dependency on c10_cuda.so. This prevents
    // --as-needed from dropping CUDA libs. The call is cheap (no-op on
    // non-CUDA builds since the symbol resolves to a stub returning 0).
    unsafe { let _ = ffi::flodl_force_cuda_link(); }
    unsafe { ffi::flodl_cuda_is_available() != 0 }
}

/// Returns the number of CUDA devices.
pub fn cuda_device_count() -> i32 {
    unsafe { ffi::flodl_cuda_device_count() }
}

/// Query CUDA memory usage for a specific device.
/// Returns `(used_bytes, total_bytes)` or an error if CUDA is not available.
pub fn cuda_memory_info_idx(device_index: i32) -> Result<(u64, u64)> {
    let mut used: u64 = 0;
    let mut total: u64 = 0;
    check_err(unsafe { ffi::flodl_cuda_mem_info(device_index, &mut used, &mut total) })?;
    Ok((used, total))
}

/// Query CUDA memory usage for device 0.
/// Returns `(used_bytes, total_bytes)` or an error if CUDA is not available.
pub fn cuda_memory_info() -> Result<(u64, u64)> {
    cuda_memory_info_idx(0)
}

/// Query bytes reserved by the CUDA caching allocator on a specific device.
///
/// This is the Rust equivalent of `torch.cuda.memory_reserved()`. It can exceed
/// physical VRAM when unified memory spills to host RAM.
pub fn cuda_allocated_bytes_idx(device_index: i32) -> Result<u64> {
    let mut allocated: u64 = 0;
    check_err(unsafe { ffi::flodl_cuda_alloc_bytes(device_index, &mut allocated) })?;
    Ok(allocated)
}

/// Query bytes reserved by the CUDA caching allocator on device 0.
pub fn cuda_allocated_bytes() -> Result<u64> {
    cuda_allocated_bytes_idx(0)
}

/// Query bytes actively used by tensors on a specific device.
///
/// This is the Rust equivalent of `torch.cuda.memory_allocated()`. Unlike
/// `cuda_allocated_bytes` (which reports the allocator's total reservation),
/// this only counts sub-blocks currently backing live tensors.
pub fn cuda_active_bytes_idx(device_index: i32) -> Result<u64> {
    let mut active: u64 = 0;
    check_err(unsafe { ffi::flodl_cuda_active_bytes(device_index, &mut active) })?;
    Ok(active)
}

/// Query bytes actively used by tensors on device 0.
pub fn cuda_active_bytes() -> Result<u64> {
    cuda_active_bytes_idx(0)
}

/// Peak bytes allocated to tensors since last `cuda_reset_peak_stats()` on a specific device.
///
/// This is the Rust equivalent of `torch.cuda.max_memory_allocated()`.
pub fn cuda_peak_active_bytes_idx(device_index: i32) -> Result<u64> {
    let mut peak: u64 = 0;
    check_err(unsafe { ffi::flodl_cuda_peak_active_bytes(device_index, &mut peak) })?;
    Ok(peak)
}

/// Peak bytes allocated to tensors since last `cuda_reset_peak_stats()` on device 0.
pub fn cuda_peak_active_bytes() -> Result<u64> {
    cuda_peak_active_bytes_idx(0)
}

/// Peak bytes reserved by the CUDA caching allocator since last `cuda_reset_peak_stats()` on a specific device.
///
/// This is the Rust equivalent of `torch.cuda.max_memory_reserved()`.
pub fn cuda_peak_reserved_bytes_idx(device_index: i32) -> Result<u64> {
    let mut peak: u64 = 0;
    check_err(unsafe { ffi::flodl_cuda_peak_reserved_bytes(device_index, &mut peak) })?;
    Ok(peak)
}

/// Peak bytes reserved by the CUDA caching allocator since last `cuda_reset_peak_stats()` on device 0.
pub fn cuda_peak_reserved_bytes() -> Result<u64> {
    cuda_peak_reserved_bytes_idx(0)
}

/// Reset peak memory statistics for a specific device.
/// Equivalent to `torch.cuda.reset_peak_memory_stats()`.
pub fn cuda_reset_peak_stats_idx(device_index: i32) {
    unsafe { ffi::flodl_cuda_reset_peak_stats(device_index) }
}

/// Reset peak memory statistics for device 0.
pub fn cuda_reset_peak_stats() {
    cuda_reset_peak_stats_idx(0)
}

/// Release all unused cached memory from the CUDA caching allocator.
/// Equivalent to `torch.cuda.empty_cache()`.
pub fn cuda_empty_cache() {
    unsafe { ffi::flodl_cuda_empty_cache() }
}

/// Query GPU utilization percentage (0-100) via NVML.
/// Returns `None` if NVML is not available or the query fails.
pub fn cuda_utilization() -> Option<u32> {
    cuda_utilization_idx(0)
}

/// Query GPU utilization percentage for a specific device (0-100) via NVML.
pub fn cuda_utilization_idx(device_index: i32) -> Option<u32> {
    let val = unsafe { ffi::flodl_cuda_utilization(device_index) };
    if val >= 0 { Some(val as u32) } else { None }
}

/// Set the current CUDA device.
pub fn set_current_cuda_device(device_index: u8) {
    unsafe { ffi::flodl_set_current_device(device_index as i32) };
}

/// Get the current CUDA device index.
pub fn current_cuda_device() -> u8 {
    unsafe { ffi::flodl_get_current_device() as u8 }
}

/// Synchronize a CUDA device (wait for all pending work to complete).
pub fn cuda_synchronize(device_index: u8) {
    unsafe { ffi::flodl_cuda_synchronize(device_index as i32) };
}

/// Returns the GPU device name for the given index (e.g. "NVIDIA GeForce GTX 1060 6GB").
pub fn cuda_device_name_idx(device: i32) -> Option<String> {
    let mut buf = [0i8; 256];
    let err = unsafe { ffi::flodl_cuda_device_name(device, buf.as_mut_ptr(), 256) };
    if err.is_null() {
        let name = unsafe { CStr::from_ptr(buf.as_ptr()) }
            .to_string_lossy()
            .into_owned();
        Some(name)
    } else {
        unsafe { ffi::flodl_free_string(err) };
        None
    }
}

/// Returns the GPU device name for device 0 (e.g. "NVIDIA GeForce GTX 1060 6GB").
pub fn cuda_device_name() -> Option<String> {
    cuda_device_name_idx(0)
}

/// Information about a CUDA device.
#[derive(Debug, Clone)]
pub struct DeviceInfo {
    /// Device index (0-based).
    pub index: u8,
    /// Device name (e.g. "NVIDIA GeForce GTX 1060 6GB").
    pub name: String,
    /// Total device memory in bytes.
    pub total_memory: u64,
    /// Compute capability major version (e.g. 6 for sm_61).
    pub sm_major: u32,
    /// Compute capability minor version (e.g. 1 for sm_61).
    pub sm_minor: u32,
}

impl DeviceInfo {
    /// Compute capability as a string (e.g. "sm_61").
    pub fn sm_version(&self) -> String {
        format!("sm_{}{}", self.sm_major, self.sm_minor)
    }
}

/// Query compute capability (major, minor) for a CUDA device.
///
/// Returns `None` if CUDA is unavailable or the device index is invalid.
pub fn cuda_compute_capability(device_index: i32) -> Option<(u32, u32)> {
    let mut major: i32 = 0;
    let mut minor: i32 = 0;
    let err = unsafe {
        ffi::flodl_cuda_compute_capability(device_index, &mut major, &mut minor)
    };
    if err.is_null() {
        Some((major as u32, minor as u32))
    } else {
        unsafe { ffi::flodl_free_string(err) };
        None
    }
}

/// Enumerate all available CUDA devices.
pub fn cuda_devices() -> Vec<DeviceInfo> {
    let n = cuda_device_count();
    (0..n).filter_map(|i| {
        let name = cuda_device_name_idx(i)?;
        let total_memory = cuda_memory_info_idx(i).map(|(_, t)| t).unwrap_or(0);
        let (sm_major, sm_minor) = cuda_compute_capability(i).unwrap_or((0, 0));
        Some(DeviceInfo { index: i as u8, name, total_memory, sm_major, sm_minor })
    }).collect()
}

/// Probe whether a CUDA device can execute compute kernels under the
/// current libtorch build. Returns `Ok(())` if the device works, or an
/// error describing why it cannot (e.g. missing kernel image for sm_61).
pub fn probe_device(device: Device) -> Result<()> {
    let idx = match device {
        Device::CUDA(i) => i,
        Device::CPU => return Ok(()),
    };
    let opts = TensorOptions { dtype: DType::Float32, device };
    match Tensor::zeros(&[1], opts) {
        Ok(t) => {
            // Also try a simple op to verify kernels load
            let _ = t.add(&t)?;
            Ok(())
        }
        Err(e) => {
            let msg = format!("{}", e);
            if msg.contains("no kernel image") {
                let (sm_maj, sm_min) = cuda_compute_capability(idx as i32)
                    .unwrap_or((0, 0));
                let name = cuda_device_name_idx(idx as i32)
                    .unwrap_or_else(|| format!("CUDA({})", idx));
                let variant = recommended_cuda_variant(sm_maj);
                Err(TensorError::new(&format!(
                    "CUDA({}) {} (sm_{}{}) cannot run kernels in this libtorch build. \
                     Recommended: switch to libtorch {} \
                     (in Dockerfile, change the cu### variant)",
                    idx, name, sm_maj, sm_min, variant
                )))
            } else {
                Err(e)
            }
        }
    }
}

/// Return all CUDA devices that can run compute kernels, with warnings
/// for any excluded devices printed to stderr.
///
/// This is the primary entry point for multi-GPU setup. It probes each
/// GPU and returns only the working ones, giving actionable diagnostics
/// for any that fail.
pub fn usable_cuda_devices() -> Vec<Device> {
    if !cuda_available() {
        return vec![];
    }
    let devices = cuda_devices();
    let mut usable = Vec::new();

    for info in &devices {
        let dev = Device::CUDA(info.index);
        match probe_device(dev) {
            Ok(()) => usable.push(dev),
            Err(e) => {
                eprintln!("[flodl] WARNING: {}", e);
            }
        }
    }

    if usable.len() < devices.len() {
        let names: Vec<String> = usable.iter().map(|d| format!("{}", d)).collect();
        crate::verbose!(
            "[flodl] Proceeding with {}/{} devices: [{}]",
            usable.len(), devices.len(), names.join(", ")
        );
    }

    usable
}

/// Recommend the best libtorch CUDA variant for a given compute capability.
fn recommended_cuda_variant(sm_major: u32) -> &'static str {
    match sm_major {
        0..=6 => "cu126",  // Maxwell/Pascal (sm_50-sm_61): cu126 for broadest compat
        _ => "cu128",      // Volta+ (sm_70+): cu128 for best performance
    }
}

/// One-line hardware summary for dashboard headers.
///
/// Returns something like:
/// `"CPU: AMD Ryzen 9 5900X (64GB) | GPU: NVIDIA GeForce GTX 1060 (6GB)"`
pub fn hardware_summary() -> String {
    let cpu = cpu_model_name().unwrap_or_else(|| "Unknown CPU".into());
    let threads = cpu_thread_count();
    let ram = total_ram_gb();
    let mut s = format!("{} ({} threads, {}GB)", cpu, threads, ram);

    if cuda_available() {
        let n = cuda_device_count();
        for i in 0..n {
            if let Some(gpu) = cuda_device_name_idx(i) {
                let vram_str = cuda_memory_info_idx(i)
                    .map(|(_, total)| format!(" ({}GB)", total / (1024 * 1024 * 1024)))
                    .unwrap_or_default();
                let _ = std::fmt::Write::write_fmt(&mut s, format_args!(
                    " | {}{}", gpu, vram_str
                ));
            }
        }
    }
    s
}

/// Count logical CPU threads from /proc/cpuinfo (Linux).
fn cpu_thread_count() -> usize {
    std::fs::read_to_string("/proc/cpuinfo")
        .ok()
        .map(|s| s.lines().filter(|l| l.starts_with("processor")).count())
        .unwrap_or(1)
}

/// Read CPU model name from /proc/cpuinfo (Linux).
fn cpu_model_name() -> Option<String> {
    let info = std::fs::read_to_string("/proc/cpuinfo").ok()?;
    for line in info.lines() {
        if line.starts_with("model name") && let Some(val) = line.split(':').nth(1) {
            return Some(val.trim().to_string());
        }
    }
    None
}

/// Total physical RAM in GB (Linux).
fn total_ram_gb() -> u64 {
    std::fs::read_to_string("/proc/meminfo")
        .ok()
        .and_then(|s| {
            for line in s.lines() {
                if line.starts_with("MemTotal:") {
                    let kb: u64 = line.split_whitespace().nth(1)?.parse().ok()?;
                    return Some(kb / (1024 * 1024));
                }
            }
            None
        })
        .unwrap_or(0)
}

/// Enable or disable cuDNN benchmark mode.
///
/// When enabled, cuDNN will benchmark multiple convolution algorithms
/// on the first call and cache the fastest. Benefits fixed-size workloads
/// (FBRL, fixed image dims) with 5-10% speedup. Can hurt dynamic-shape
/// workloads due to warmup cost. Off by default — users opt in.
pub fn set_cudnn_benchmark(enable: bool) {
    unsafe { ffi::flodl_set_cudnn_benchmark(enable as i32) }
}

/// Seed all libtorch RNGs (CPU + CUDA) for reproducible tensor ops.
///
/// This sets the global seed for `Tensor::rand`, `Tensor::randn`,
/// dropout masks, and all other libtorch random operations.
/// Call before model creation and training for full reproducibility.
pub fn manual_seed(seed: u64) {
    unsafe { ffi::flodl_manual_seed(seed) }
}

/// Seed all CUDA device RNGs. No-op when built without CUDA.
///
/// Usually you want `manual_seed()` instead, which seeds both CPU
/// and CUDA. Use this only when you need to re-seed CUDA independently.
pub fn cuda_manual_seed_all(seed: u64) {
    unsafe { ffi::flodl_cuda_manual_seed_all(seed) }
}

/// Ask glibc to return free memory to the OS (Linux only).
///
/// Returns `true` if memory was actually released. Useful for
/// distinguishing allocator fragmentation from real leaks:
/// if RSS drops after calling this, the growth was fragmentation.
pub fn malloc_trim() -> bool {
    unsafe { ffi::flodl_malloc_trim() != 0 }
}

/// Number of live C++ Tensor handles (created but not yet dropped).
/// If this grows over time during training, there is a handle leak.
/// If it stays stable but RSS grows, the leak is inside libtorch.
pub fn live_tensor_count() -> u64 {
    LIVE_TENSOR_COUNT.load(Ordering::Relaxed)
}

/// Read current process RSS in kilobytes (Linux only).
/// Returns 0 on non-Linux or if /proc/self/statm is unreadable.
pub fn rss_kb() -> usize {
    std::fs::read_to_string("/proc/self/statm")
        .ok()
        .and_then(|s| s.split_whitespace().nth(1)?.parse::<usize>().ok())
        .map(|pages| pages * 4)
        .unwrap_or(0)
}