use std::collections::{BTreeMap, HashSet};
use std::sync::Mutex;
use super::device::GpuDeviceInfo;
use super::policy::DispatchPolicy;
use super::runtime::GpuRuntime;
pub(crate) fn log_cuda_enabled(device: &GpuDeviceInfo, policy: &DispatchPolicy) {
log::info!(
"[GPU] CUDA acceleration enabled\n device: {}\n measured: fp64={} GFLOP/s h2d={:.1} GB/s d2h={:.1} GB/s memory={}\n libraries: CUDA driver ready; cuBLAS/cuSOLVER/cuSPARSE load on first use\n dispatch: xtwx_rows>={} gemm>={}flop gemv>={}flop spmv_rows>={} spmv_nnz>={} chol_p>={} syevd_p>={} trsm>={}flop",
device,
format_float(device.peak_fp64_gflops()),
device.calibration.h2d_gb_s,
device.calibration.d2h_gb_s,
format_bytes(device.total_memory_bytes),
policy.xtwx_min_rows,
format_count(policy.gemm_min_flops),
format_count(policy.gemv_min_flops),
policy.spmv_min_rows,
format_count(policy.spmv_min_nnz as u64),
policy.chol_min_p,
policy.syevd_min_p,
format_count(policy.trsm_min_flops),
);
}
pub(crate) fn log_cuda_pool(devices: &[GpuDeviceInfo]) {
if devices.len() <= 1 {
return;
}
let summary = devices
.iter()
.map(|device| {
format!(
"{}:'{}' sm_{}{} {}SM {}",
device.ordinal,
device.name,
device.compute_capability_major,
device.compute_capability_minor,
device.sm_count,
format_bytes(device.total_memory_bytes),
)
})
.collect::<Vec<_>>()
.join("; ");
log::info!(
"[GPU] multi-device pool enabled | devices={} | policy_device={} | {}",
devices.len(),
devices[0].ordinal,
summary,
);
}
pub(crate) fn log_cuda_disabled(reason: &str) {
log::info!(
"[GPU] CUDA acceleration disabled\n backend: CPU (faer + Rayon)\n reason: {}",
reason
);
}
pub(crate) fn log_library_ready(library: &'static str, device: &GpuDeviceInfo) {
log_route(format!(
"[GPU] {library} ready | device={} '{}' | CUDA library handle initialized",
device.ordinal, device.name,
));
}
pub(crate) fn log_library_unavailable(library: &'static str, reason: &str) {
log_route(format!(
"[GPU] {library} unavailable | CPU route retained | reason={reason}"
));
}
pub(crate) fn log_policy_cpu(op: &'static str, shape: String, reason: String) {
activity::record_policy_decline(op);
if reason.starts_with("CUDA unavailable:") {
log_route(format!("[GPU] CPU route | op={op} | reason={reason}"));
} else {
log_route(format!(
"[GPU] CPU route | op={op} | shape={shape} | reason={reason}"
));
}
}
pub(crate) fn dispatch_decline_reason(policy_reason: String) -> String {
if let Some(cpu_reason) = GpuRuntime::global().cpu_reason() {
format!("CUDA unavailable: {cpu_reason}")
} else {
policy_reason
}
}
pub(crate) fn log_runtime_cpu(op: &'static str, backend: &'static str, shape: String) {
activity::record_runtime_decline(op);
if let Some(cpu_reason) = GpuRuntime::global().cpu_reason() {
log_route(format!(
"[GPU] CPU route | op={op} | backend={backend} | reason=CUDA unavailable: {cpu_reason}"
));
} else {
log_route(format!(
"[GPU] CPU route | op={op} | backend={backend} | shape={shape} | reason=device dispatch returned no result"
));
}
}
pub(crate) fn log_gpu_success(
op: &'static str,
backend: &'static str,
device: &GpuDeviceInfo,
shape: String,
flops: u64,
h2d_bytes: usize,
d2h_bytes: usize,
elapsed_s: f64,
) {
activity::record_success(op, flops, h2d_bytes, d2h_bytes, elapsed_s);
log_route(format!(
"[GPU] device route | op={op} | backend={backend} | device={} '{}' | shape={shape} | work={}flop | transfer=h2d:{} d2h:{} | elapsed={:.3}s",
device.ordinal,
device.name,
format_count(flops),
format_bytes(h2d_bytes),
format_bytes(d2h_bytes),
elapsed_s,
));
}
pub(crate) fn log_gpu_success_multi(
op: &'static str,
backend: &'static str,
devices: &[GpuDeviceInfo],
shape: String,
flops: u64,
h2d_bytes: usize,
d2h_bytes: usize,
elapsed_s: f64,
) {
if devices.len() == 1 {
log_gpu_success(
op,
backend,
&devices[0],
shape,
flops,
h2d_bytes,
d2h_bytes,
elapsed_s,
);
return;
}
activity::record_success(op, flops, h2d_bytes, d2h_bytes, elapsed_s);
let device_summary = devices
.iter()
.map(|device| format!("{} '{}'", device.ordinal, device.name))
.collect::<Vec<_>>()
.join(", ");
log_route(format!(
"[GPU] multi-device route | op={op} | backend={backend} | devices=[{device_summary}] | shape={shape} | work={}flop | transfer=h2d:{} d2h:{} | elapsed={:.3}s",
format_count(flops),
format_bytes(h2d_bytes),
format_bytes(d2h_bytes),
elapsed_s,
));
}
fn log_route(signature: String) {
static SEEN: Mutex<Option<HashSet<String>>> = Mutex::new(None);
let mut guard = match SEEN.lock() {
Ok(g) => g,
Err(poisoned) => poisoned.into_inner(),
};
let seen = guard.get_or_insert_with(HashSet::new);
if seen.insert(signature.clone()) {
log::info!("{signature}");
}
}
pub(crate) fn bytes_for_f64(len: usize) -> usize {
len.saturating_mul(std::mem::size_of::<f64>())
}
pub(crate) fn bytes_for_i32(len: usize) -> usize {
len.saturating_mul(std::mem::size_of::<i32>())
}
pub(crate) fn gemm_flops(m: usize, n: usize, k: usize) -> u64 {
(m as u64)
.saturating_mul(n as u64)
.saturating_mul(k.max(1) as u64)
.saturating_mul(2)
}
pub(crate) fn gemv_flops(rows: usize, cols: usize) -> u64 {
(rows as u64).saturating_mul(cols as u64).saturating_mul(2)
}
pub(crate) fn chol_flops(p: usize) -> u64 {
let p64 = p as u64;
p64.saturating_mul(p64).saturating_mul(p64) / 3
}
fn format_count(value: u64) -> String {
if value == u64::MAX {
return "never".to_string();
}
if value >= 1_000_000_000_000 {
format!("{:.2}T", value as f64 / 1_000_000_000_000.0)
} else if value >= 1_000_000_000 {
format!("{:.2}B", value as f64 / 1_000_000_000.0)
} else if value >= 1_000_000 {
format!("{:.2}M", value as f64 / 1_000_000.0)
} else if value >= 1_000 {
format!("{:.2}K", value as f64 / 1_000.0)
} else {
value.to_string()
}
}
fn format_bytes(bytes: usize) -> String {
const GIB: f64 = 1024.0 * 1024.0 * 1024.0;
const MIB: f64 = 1024.0 * 1024.0;
const KIB: f64 = 1024.0;
let bytes_f = bytes as f64;
if bytes_f >= GIB {
format!("{:.2}GiB", bytes_f / GIB)
} else if bytes_f >= MIB {
format!("{:.2}MiB", bytes_f / MIB)
} else if bytes_f >= KIB {
format!("{:.2}KiB", bytes_f / KIB)
} else {
format!("{bytes}B")
}
}
fn format_float(value: f64) -> String {
if value >= 100.0 {
format!("{value:.0}")
} else {
format!("{value:.1}")
}
}
pub fn gpu_activity_summary() -> Option<String> {
if !GpuRuntime::global().is_available() {
return None;
}
Some(activity::snapshot_summary())
}
pub fn flush_gpu_activity_summary() {
if let Some(summary) = gpu_activity_summary() {
log::info!("{summary}");
}
}
mod activity {
use super::{BTreeMap, Mutex, format_bytes, format_count};
use std::sync::atomic::{AtomicU64, Ordering};
static SUCCESS_COUNT: AtomicU64 = AtomicU64::new(0);
static POLICY_DECLINE_COUNT: AtomicU64 = AtomicU64::new(0);
static RUNTIME_DECLINE_COUNT: AtomicU64 = AtomicU64::new(0);
static TOTAL_FLOPS: AtomicU64 = AtomicU64::new(0);
static TOTAL_H2D_BYTES: AtomicU64 = AtomicU64::new(0);
static TOTAL_D2H_BYTES: AtomicU64 = AtomicU64::new(0);
static TOTAL_GPU_MICROS: AtomicU64 = AtomicU64::new(0);
static PER_OP: Mutex<Option<BTreeMap<&'static str, PerOpStats>>> = Mutex::new(None);
#[derive(Clone, Copy, Default)]
pub(super) struct PerOpStats {
pub success: u64,
pub policy_decline: u64,
pub runtime_decline: u64,
pub flops: u64,
pub gpu_micros: u64,
}
fn with_per_op<F: FnOnce(&mut BTreeMap<&'static str, PerOpStats>)>(f: F) {
let mut guard = match PER_OP.lock() {
Ok(g) => g,
Err(poisoned) => poisoned.into_inner(),
};
let map = guard.get_or_insert_with(BTreeMap::new);
f(map);
}
pub(super) fn record_success(
op: &'static str,
flops: u64,
h2d_bytes: usize,
d2h_bytes: usize,
elapsed_s: f64,
) {
SUCCESS_COUNT.fetch_add(1, Ordering::Relaxed);
TOTAL_FLOPS.fetch_add(flops, Ordering::Relaxed);
TOTAL_H2D_BYTES.fetch_add(h2d_bytes as u64, Ordering::Relaxed);
TOTAL_D2H_BYTES.fetch_add(d2h_bytes as u64, Ordering::Relaxed);
let micros = (elapsed_s * 1_000_000.0).max(0.0) as u64;
TOTAL_GPU_MICROS.fetch_add(micros, Ordering::Relaxed);
with_per_op(|map| {
let entry = map.entry(op).or_default();
entry.success += 1;
entry.flops = entry.flops.saturating_add(flops);
entry.gpu_micros = entry.gpu_micros.saturating_add(micros);
});
}
pub(super) fn record_policy_decline(op: &'static str) {
POLICY_DECLINE_COUNT.fetch_add(1, Ordering::Relaxed);
with_per_op(|map| {
map.entry(op).or_default().policy_decline += 1;
});
}
pub(super) fn record_runtime_decline(op: &'static str) {
RUNTIME_DECLINE_COUNT.fetch_add(1, Ordering::Relaxed);
with_per_op(|map| {
map.entry(op).or_default().runtime_decline += 1;
});
}
pub(super) fn snapshot_summary() -> String {
let success = SUCCESS_COUNT.load(Ordering::Relaxed);
let policy_decline = POLICY_DECLINE_COUNT.load(Ordering::Relaxed);
let runtime_decline = RUNTIME_DECLINE_COUNT.load(Ordering::Relaxed);
let flops = TOTAL_FLOPS.load(Ordering::Relaxed);
let h2d = TOTAL_H2D_BYTES.load(Ordering::Relaxed);
let d2h = TOTAL_D2H_BYTES.load(Ordering::Relaxed);
let gpu_micros = TOTAL_GPU_MICROS.load(Ordering::Relaxed);
let total = success + policy_decline + runtime_decline;
if total == 0 {
return "[GPU] activity summary: no dispatch sites reached".to_string();
}
let gpu_seconds = (gpu_micros as f64) / 1_000_000.0;
let measured_gflops = if gpu_seconds > 0.0 {
(flops as f64) / gpu_seconds / 1e9
} else {
0.0
};
let mut lines = Vec::new();
lines.push(format!(
"[GPU] activity summary | dispatched={} declined_policy={} declined_runtime={} | work={}flop in {:.3}s ({:.0} GFLOP/s effective) | transfer=h2d:{} d2h:{}",
success,
policy_decline,
runtime_decline,
format_count(flops),
gpu_seconds,
measured_gflops,
format_bytes(h2d as usize),
format_bytes(d2h as usize),
));
let snapshot: Vec<(&'static str, PerOpStats)> = {
let guard = match PER_OP.lock() {
Ok(g) => g,
Err(poisoned) => poisoned.into_inner(),
};
match guard.as_ref() {
Some(map) => map.iter().map(|(k, v)| (*k, *v)).collect(),
None => Vec::new(),
}
};
if !snapshot.is_empty() {
let mut sorted = snapshot;
sorted.sort_by(|a, b| b.1.success.cmp(&a.1.success).then_with(|| a.0.cmp(b.0)));
for (op, stats) in sorted {
let gpu_seconds = (stats.gpu_micros as f64) / 1_000_000.0;
lines.push(format!(
" {op:>32}: gpu={} cpu_policy={} cpu_runtime={} work={}flop t={:.3}s",
stats.success,
stats.policy_decline,
stats.runtime_decline,
format_count(stats.flops),
gpu_seconds,
));
}
}
lines.join("\n")
}
}
#[cfg(test)]
mod activity_tests {
use super::activity;
#[test]
fn empty_summary_short_circuits() {
let s = activity::snapshot_summary();
assert!(s.contains("activity summary") || s.contains("no dispatch"));
}
}