use std::ffi::CStr;
use flodl_sys as ffi;
use super::{check_err, Device, DType, Result, Tensor, TensorError, TensorOptions, LIVE_TENSOR_COUNT};
use std::sync::atomic::Ordering;
/// Reports whether the CUDA runtime is usable, per the flodl C layer.
pub fn cuda_available() -> bool {
    unsafe {
        // Touch the force-link symbol first so the CUDA objects are not
        // dropped by the linker, then ask the C layer for availability.
        let _ = ffi::flodl_force_cuda_link();
        ffi::flodl_cuda_is_available() != 0
    }
}
/// Number of CUDA devices visible to this process, as reported by the C layer.
pub fn cuda_device_count() -> i32 {
    unsafe { ffi::flodl_cuda_device_count() }
}
/// Memory usage for device `device_index` as a `(used, total)` pair in bytes.
///
/// # Errors
/// Propagates any error reported by the FFI call (e.g. invalid index).
pub fn cuda_memory_info_idx(device_index: i32) -> Result<(u64, u64)> {
    let (mut used, mut total) = (0u64, 0u64);
    let status = unsafe { ffi::flodl_cuda_mem_info(device_index, &mut used, &mut total) };
    check_err(status)?;
    Ok((used, total))
}
/// `(used, total)` memory in bytes for device 0; see [`cuda_memory_info_idx`].
pub fn cuda_memory_info() -> Result<(u64, u64)> {
    cuda_memory_info_idx(0)
}
/// Bytes currently allocated on device `device_index`
/// (allocator-level counter from the C layer).
///
/// # Errors
/// Propagates any error reported by the FFI call.
pub fn cuda_allocated_bytes_idx(device_index: i32) -> Result<u64> {
    let mut bytes = 0u64;
    let status = unsafe { ffi::flodl_cuda_alloc_bytes(device_index, &mut bytes) };
    check_err(status)?;
    Ok(bytes)
}
/// Allocated bytes on device 0; see [`cuda_allocated_bytes_idx`].
pub fn cuda_allocated_bytes() -> Result<u64> {
    cuda_allocated_bytes_idx(0)
}
/// Bytes in active use on device `device_index`
/// (allocator-level counter from the C layer).
///
/// # Errors
/// Propagates any error reported by the FFI call.
pub fn cuda_active_bytes_idx(device_index: i32) -> Result<u64> {
    let mut bytes = 0u64;
    let status = unsafe { ffi::flodl_cuda_active_bytes(device_index, &mut bytes) };
    check_err(status)?;
    Ok(bytes)
}
/// Active bytes on device 0; see [`cuda_active_bytes_idx`].
pub fn cuda_active_bytes() -> Result<u64> {
    cuda_active_bytes_idx(0)
}
/// High-water mark of active bytes on device `device_index` since the last
/// peak-stats reset.
///
/// # Errors
/// Propagates any error reported by the FFI call.
pub fn cuda_peak_active_bytes_idx(device_index: i32) -> Result<u64> {
    let mut high_water = 0u64;
    let status = unsafe { ffi::flodl_cuda_peak_active_bytes(device_index, &mut high_water) };
    check_err(status)?;
    Ok(high_water)
}
/// Peak active bytes on device 0; see [`cuda_peak_active_bytes_idx`].
pub fn cuda_peak_active_bytes() -> Result<u64> {
    cuda_peak_active_bytes_idx(0)
}
/// High-water mark of reserved bytes on device `device_index` since the last
/// peak-stats reset.
///
/// # Errors
/// Propagates any error reported by the FFI call.
pub fn cuda_peak_reserved_bytes_idx(device_index: i32) -> Result<u64> {
    let mut high_water = 0u64;
    let status = unsafe { ffi::flodl_cuda_peak_reserved_bytes(device_index, &mut high_water) };
    check_err(status)?;
    Ok(high_water)
}
/// Peak reserved bytes on device 0; see [`cuda_peak_reserved_bytes_idx`].
pub fn cuda_peak_reserved_bytes() -> Result<u64> {
    cuda_peak_reserved_bytes_idx(0)
}
/// Resets the peak-memory counters for device `device_index` in the C layer.
pub fn cuda_reset_peak_stats_idx(device_index: i32) {
    unsafe { ffi::flodl_cuda_reset_peak_stats(device_index) }
}
/// Resets peak-memory counters for device 0; see [`cuda_reset_peak_stats_idx`].
pub fn cuda_reset_peak_stats() {
    cuda_reset_peak_stats_idx(0)
}
/// Asks the C layer to release cached device memory
/// (presumably the caching allocator's "empty cache" — confirm in flodl-sys).
pub fn cuda_empty_cache() {
    unsafe { ffi::flodl_cuda_empty_cache() }
}
/// Utilization reading for device 0, or `None` when unavailable;
/// see [`cuda_utilization_idx`].
pub fn cuda_utilization() -> Option<u32> {
    cuda_utilization_idx(0)
}
/// Utilization reading for device `device_index`.
///
/// The C layer signals "unavailable" with a negative value, which is mapped
/// to `None` here.
pub fn cuda_utilization_idx(device_index: i32) -> Option<u32> {
    let raw = unsafe { ffi::flodl_cuda_utilization(device_index) };
    (raw >= 0).then_some(raw as u32)
}
/// Makes `device_index` the current CUDA device
/// (thread- vs process-scoped depends on the C layer — confirm in flodl-sys).
pub fn set_current_cuda_device(device_index: u8) {
    unsafe { ffi::flodl_set_current_device(device_index as i32) };
}
/// Index of the current CUDA device.
///
/// NOTE(review): the `as u8` cast silently truncates anything outside
/// 0..=255, including a possible negative error return from the C side —
/// verify the FFI contract before relying on this for error detection.
pub fn current_cuda_device() -> u8 {
    unsafe { ffi::flodl_get_current_device() as u8 }
}
/// Synchronizes device `device_index` (presumably blocks until queued GPU
/// work completes, a la `cudaDeviceSynchronize` — confirm in flodl-sys).
pub fn cuda_synchronize(device_index: u8) {
    unsafe { ffi::flodl_cuda_synchronize(device_index as i32) };
}
/// Queries the name of CUDA device `device`.
///
/// Returns `None` when the FFI call reports an error (the error string
/// returned by the C layer is freed, not surfaced).
pub fn cuda_device_name_idx(device: i32) -> Option<String> {
    // Capacity of the C string buffer, including the NUL terminator.
    const BUF_LEN: usize = 256;
    // Use `c_char` rather than hard-coded `i8`: C `char` maps to `u8` on
    // aarch64 Linux, so an `[i8; N]` buffer would not compile there.
    let mut buf = [0 as std::os::raw::c_char; BUF_LEN];
    let err = unsafe { ffi::flodl_cuda_device_name(device, buf.as_mut_ptr(), BUF_LEN as _) };
    if err.is_null() {
        // Success: the buffer now holds a NUL-terminated device name.
        let name = unsafe { CStr::from_ptr(buf.as_ptr()) }
            .to_string_lossy()
            .into_owned();
        Some(name)
    } else {
        // Non-null return is an owned error string; free it to avoid a leak.
        unsafe { ffi::flodl_free_string(err) };
        None
    }
}
/// Name of device 0, or `None` on error; see [`cuda_device_name_idx`].
pub fn cuda_device_name() -> Option<String> {
    cuda_device_name_idx(0)
}
/// Static description of one CUDA device, gathered at enumeration time
/// (see `cuda_devices`).
#[derive(Debug, Clone)]
pub struct DeviceInfo {
    // Zero-based CUDA device index.
    pub index: u8,
    // Human-readable device name as reported by the driver.
    pub name: String,
    // Total device memory in bytes (0 when the query failed).
    pub total_memory: u64,
    // Compute-capability major version (0 when the query failed).
    pub sm_major: u32,
    // Compute-capability minor version (0 when the query failed).
    pub sm_minor: u32,
}
impl DeviceInfo {
    /// Formats the compute capability in the conventional `sm_MN` form,
    /// e.g. `sm_86` for major 8, minor 6.
    pub fn sm_version(&self) -> String {
        format!("sm_{}{}", self.sm_major, self.sm_minor)
    }
}
/// Compute capability `(major, minor)` of device `device_index`, or `None`
/// when the FFI call reports an error (the error string is freed, not
/// surfaced).
pub fn cuda_compute_capability(device_index: i32) -> Option<(u32, u32)> {
    let (mut major, mut minor) = (0i32, 0i32);
    let err = unsafe {
        ffi::flodl_cuda_compute_capability(device_index, &mut major, &mut minor)
    };
    if !err.is_null() {
        // Non-null return is an owned error string; release it.
        unsafe { ffi::flodl_free_string(err) };
        return None;
    }
    Some((major as u32, minor as u32))
}
/// Enumerates every visible CUDA device.
///
/// Devices whose name cannot be queried are skipped entirely; failed memory
/// or compute-capability queries fall back to zeroed fields.
pub fn cuda_devices() -> Vec<DeviceInfo> {
    let count = cuda_device_count();
    let mut infos = Vec::new();
    for i in 0..count {
        let Some(name) = cuda_device_name_idx(i) else { continue };
        let total_memory = cuda_memory_info_idx(i).map(|(_, t)| t).unwrap_or(0);
        let (sm_major, sm_minor) = cuda_compute_capability(i).unwrap_or((0, 0));
        infos.push(DeviceInfo { index: i as u8, name, total_memory, sm_major, sm_minor });
    }
    infos
}
/// Smoke-tests `device` by allocating a one-element tensor and launching a
/// single add kernel.
///
/// CPU always passes. For CUDA this catches the "no kernel image" failure
/// mode — a libtorch build without kernels for the device's SM version —
/// and rewraps it with an actionable message naming the recommended
/// cu-variant.
///
/// # Errors
/// Returns the original tensor error, or the rewrapped diagnostic when the
/// message indicates a missing kernel image.
pub fn probe_device(device: Device) -> Result<()> {
    let idx = match device {
        Device::CUDA(i) => i,
        // Nothing to probe on CPU.
        Device::CPU => return Ok(()),
    };
    let opts = TensorOptions { dtype: DType::Float32, device };
    match Tensor::zeros(&[1], opts) {
        Ok(t) => {
            // Allocation can succeed even when kernels are missing, so force
            // one real kernel launch too.
            let _ = t.add(&t)?;
            Ok(())
        }
        Err(e) => {
            // The error is stringly typed at this layer, so sniff the
            // message text for the missing-kernel case.
            let msg = format!("{}", e);
            if msg.contains("no kernel image") {
                // Best-effort device details for the diagnostic; (0, 0) /
                // "CUDA(idx)" placeholders when the queries fail.
                let (sm_maj, sm_min) = cuda_compute_capability(idx as i32)
                    .unwrap_or((0, 0));
                let name = cuda_device_name_idx(idx as i32)
                    .unwrap_or_else(|| format!("CUDA({})", idx));
                let variant = recommended_cuda_variant(sm_maj);
                Err(TensorError::new(&format!(
                    "CUDA({}) {} (sm_{}{}) cannot run kernels in this libtorch build. \
                    Recommended: switch to libtorch {} \
                    (in Dockerfile, change the cu### variant)",
                    idx, name, sm_maj, sm_min, variant
                )))
            } else {
                Err(e)
            }
        }
    }
}
/// Probes every enumerated CUDA device and returns the ones that can
/// actually run kernels.
///
/// Devices that fail the probe are logged to stderr and dropped; when any
/// are dropped, a summary line listing the surviving devices is printed.
pub fn usable_cuda_devices() -> Vec<Device> {
    if !cuda_available() {
        // No CUDA runtime: nothing to probe.
        return Vec::new();
    }
    let all = cuda_devices();
    let mut working = Vec::new();
    for info in &all {
        let device = Device::CUDA(info.index);
        if let Err(err) = probe_device(device) {
            // Keep going — one bad device should not disable the rest.
            eprintln!("[flodl] WARNING: {}", err);
        } else {
            working.push(device);
        }
    }
    if working.len() < all.len() {
        let labels: Vec<String> = working.iter().map(|d| format!("{}", d)).collect();
        eprintln!(
            "[flodl] Proceeding with {}/{} devices: [{}]",
            working.len(), all.len(), labels.join(", ")
        );
    }
    working
}
/// Maps an SM major version to the libtorch cu-variant recommended for it:
/// Pascal (sm_6x) and older get cu126, everything newer gets cu128.
fn recommended_cuda_variant(sm_major: u32) -> &'static str {
    if sm_major <= 6 { "cu126" } else { "cu128" }
}
/// One-line human-readable summary of the host hardware: CPU model, thread
/// count, RAM, then each detected GPU (with VRAM where available), separated
/// by " | ".
pub fn hardware_summary() -> String {
    use std::fmt::Write as _;
    let cpu = cpu_model_name().unwrap_or_else(|| "Unknown CPU".into());
    let mut summary = format!(
        "{} ({} threads, {}GB)",
        cpu,
        cpu_thread_count(),
        total_ram_gb()
    );
    if cuda_available() {
        for idx in 0..cuda_device_count() {
            if let Some(gpu) = cuda_device_name_idx(idx) {
                // VRAM suffix is best-effort; omitted when the query fails.
                let vram = cuda_memory_info_idx(idx)
                    .map(|(_, total)| format!(" ({}GB)", total / (1024 * 1024 * 1024)))
                    .unwrap_or_default();
                // Writing to a String cannot fail; ignore the fmt::Result.
                let _ = write!(summary, " | {}{}", gpu, vram);
            }
        }
    }
    summary
}
/// Counts "processor" entries in /proc/cpuinfo; falls back to 1 when the
/// file cannot be read (e.g. non-Linux).
fn cpu_thread_count() -> usize {
    match std::fs::read_to_string("/proc/cpuinfo") {
        Ok(info) => info.lines().filter(|l| l.starts_with("processor")).count(),
        Err(_) => 1,
    }
}
/// CPU model string from the first usable "model name" line in
/// /proc/cpuinfo, or `None` when the file is unreadable or no such line has
/// a value after the ':'.
fn cpu_model_name() -> Option<String> {
    let info = std::fs::read_to_string("/proc/cpuinfo").ok()?;
    info.lines()
        .filter(|line| line.starts_with("model name"))
        .find_map(|line| line.split(':').nth(1))
        .map(|value| value.trim().to_string())
}
/// Total system RAM in whole gigabytes, parsed from the MemTotal line of
/// /proc/meminfo (reported there in KiB). Returns 0 when the file or the
/// field is missing or unparsable (e.g. non-Linux).
fn total_ram_gb() -> u64 {
    let Ok(meminfo) = std::fs::read_to_string("/proc/meminfo") else {
        return 0;
    };
    meminfo
        .lines()
        .find(|line| line.starts_with("MemTotal:"))
        .and_then(|line| line.split_whitespace().nth(1))
        .and_then(|kb| kb.parse::<u64>().ok())
        .map_or(0, |kb| kb / (1024 * 1024))
}
/// Enables or disables cuDNN benchmark mode via the C layer.
pub fn set_cudnn_benchmark(enable: bool) {
    unsafe { ffi::flodl_set_cudnn_benchmark(enable as i32) }
}
/// Seeds the C layer's random number generator
/// (presumably the global torch RNG — confirm in flodl-sys).
pub fn manual_seed(seed: u64) {
    unsafe { ffi::flodl_manual_seed(seed) }
}
/// Seeds the CUDA RNG on all devices via the C layer.
pub fn cuda_manual_seed_all(seed: u64) {
    unsafe { ffi::flodl_cuda_manual_seed_all(seed) }
}
/// Asks the C layer to return freed heap memory to the OS; `true` when the
/// underlying call reports that memory was actually released.
pub fn malloc_trim() -> bool {
    unsafe { ffi::flodl_malloc_trim() != 0 }
}
/// Current number of live `Tensor` instances, read from the shared atomic
/// counter maintained by the tensor module.
pub fn live_tensor_count() -> u64 {
    LIVE_TENSOR_COUNT.load(Ordering::Relaxed)
}
/// Resident set size of this process in KiB, read from field 1 of
/// /proc/self/statm (resident pages).
///
/// NOTE(review): the ×4 conversion assumes 4 KiB pages — true on stock
/// x86-64/aarch64 Linux, but not on kernels with larger page sizes; confirm
/// if those must be supported. Returns 0 when statm is unreadable
/// (non-Linux) or unparsable.
pub fn rss_kb() -> usize {
    let statm = match std::fs::read_to_string("/proc/self/statm") {
        Ok(contents) => contents,
        Err(_) => return 0,
    };
    statm
        .split_whitespace()
        .nth(1)
        .and_then(|field| field.parse::<usize>().ok())
        .map_or(0, |pages| pages * 4)
}