use std::collections::VecDeque;
use std::sync::{Mutex, OnceLock};
/// Upper bound on the number of entries held in the global kernel-stats ring.
/// Once this many entries are stored, `record` evicts the oldest one before
/// appending a new one.
const MAX_STATS: usize = 1024;
/// A single kernel statistics record, as stored by `record` and handed back
/// inside `KernelStatsSnapshot`.
///
/// NOTE(review): the visible code never assigns or reads the individual
/// fields, so the per-field notes below are inferred from the field names
/// only — confirm against the code that builds these records.
#[derive(Clone, Debug, Default)]
pub struct KernelStat {
/// Static label for the kernel (presumably its name/identifier).
pub name: &'static str,
/// Presumably a problem dimension — TODO confirm which one.
pub n: usize,
/// Presumably a problem dimension — TODO confirm which one.
pub p: usize,
/// Presumably a problem dimension — TODO confirm which one.
pub k: usize,
/// Presumably the number of non-zero entries involved — TODO confirm.
pub nnz: usize,
/// Estimated floating-point operation count (per the name) — TODO confirm.
pub flops_est: usize,
/// Estimated number of bytes moved (per the name) — TODO confirm.
pub bytes_est: usize,
/// CPU-side time in milliseconds (per the `_ms` suffix) — TODO confirm how
/// it is measured.
pub cpu_ms: f64,
/// GPU-side time in milliseconds when one is available, `None` otherwise —
/// TODO confirm when it is populated.
pub gpu_ms: Option<f64>,
}
/// An owned copy of the kernel-stats ring as produced by `snapshot`.
#[derive(Clone, Debug, Default)]
pub struct KernelStatsSnapshot {
/// Cloned records in ring order, i.e. oldest first (front of the ring to
/// back). Empty for a default-constructed snapshot.
pub stats: Vec<KernelStat>,
}
/// Process-wide storage for recorded kernel stats. It starts uninitialized and
/// is filled in lazily by `stats()`, which is the intended way to reach it.
static STATS: OnceLock<Mutex<VecDeque<KernelStat>>> = OnceLock::new();
/// Returns the process-wide kernel-stats ring, creating it on first use.
///
/// The ring is allocated once with room for `MAX_STATS` entries; every later
/// call returns the same mutex.
fn stats() -> &'static Mutex<VecDeque<KernelStat>> {
    /// Builds the empty, pre-sized ring used to initialize `STATS`.
    fn new_ring() -> Mutex<VecDeque<KernelStat>> {
        Mutex::new(VecDeque::with_capacity(MAX_STATS))
    }

    STATS.get_or_init(new_ring)
}
/// Appends `stat` to the global ring of kernel statistics.
///
/// The ring keeps at most `MAX_STATS` entries: when it is already full, the
/// oldest entry is evicted before the new one is stored. If the ring's mutex
/// is poisoned the stat is dropped without any error being reported.
pub fn record(stat: KernelStat) {
    let Ok(mut ring) = stats().lock() else {
        // Best-effort: a poisoned lock means this stat is simply not recorded.
        return;
    };

    if ring.len() == MAX_STATS {
        ring.pop_front();
    }
    ring.push_back(stat);
}
pub fn snapshot() -> KernelStatsSnapshot {
if let Ok(guard) = stats().lock() {
KernelStatsSnapshot {
stats: guard.iter().cloned().collect(),
}
} else {
KernelStatsSnapshot::default()
}
}
/// Removes every stat from the global ring.
///
/// If the ring's mutex is poisoned the call does nothing.
pub fn clear() {
    match stats().lock() {
        Ok(mut ring) => ring.clear(),
        // Best-effort: a poisoned lock leaves the ring as it is.
        Err(_) => {}
    }
}
use std::cell::RefCell;
/// Counters describing GPU execution activity, accumulated per thread through
/// the `telemetry_record_*` helpers and read back with `telemetry_snapshot`.
///
/// Every field starts at its `Default` value (zero / empty) and returns to it
/// on `telemetry_reset`.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct GpuExecutionTelemetry {
/// Running total of the byte counts passed to `telemetry_record_h2d`
/// (presumably host-to-device transfers, per the name).
pub h2d_bytes: usize,
/// Running total of the byte counts passed to `telemetry_record_d2h`
/// (presumably device-to-host transfers, per the name).
pub d2h_bytes: usize,
/// Number of `telemetry_record_factorization` calls.
pub factorization_count: usize,
/// Number of `telemetry_record_handle_creation` calls.
pub handle_creation_count: usize,
/// Number of `telemetry_record_kernel_launch` calls.
pub kernel_launch_count: usize,
/// Number of `telemetry_record_cpu_fallback` calls.
pub cpu_fallback_count: usize,
/// The reason given to each `telemetry_record_cpu_fallback` call, in call
/// order.
pub cpu_fallback_reasons: Vec<String>,
/// The `context_id` given to the most recent
/// `telemetry_record_handle_creation` call; earlier values are overwritten.
pub context_id: usize,
}
// Per-thread telemetry accumulator: each thread gets its own
// `GpuExecutionTelemetry`, initialized to `Default`, so values recorded on one
// thread are not visible from another. Access goes through `telemetry_with`.
thread_local! {
static EXECUTION_TELEMETRY: RefCell<GpuExecutionTelemetry> =
RefCell::new(GpuExecutionTelemetry::default());
}
/// Runs `f` with exclusive access to the calling thread's telemetry record and
/// returns whatever `f` produces.
///
/// # Panics
///
/// Panics if it is invoked again from inside `f` on the same thread, because
/// the underlying `RefCell` is already mutably borrowed at that point.
#[inline]
pub fn telemetry_with<R>(f: impl FnOnce(&mut GpuExecutionTelemetry) -> R) -> R {
    EXECUTION_TELEMETRY.with(|slot| {
        // The borrow lives for the duration of `f` and is released on return.
        let mut state = slot.borrow_mut();
        f(&mut state)
    })
}
/// Adds `bytes` to the calling thread's `h2d_bytes` total (host-to-device
/// transfer volume, per the naming).
///
/// The total saturates at `usize::MAX` rather than overflowing: a plain `+=`
/// would panic in debug builds and silently wrap in release builds, and a
/// telemetry counter should neither abort the computation nor jump back to a
/// small value.
#[inline]
pub fn telemetry_record_h2d(bytes: usize) {
    telemetry_with(|t| t.h2d_bytes = t.h2d_bytes.saturating_add(bytes));
}
/// Adds `bytes` to the calling thread's `d2h_bytes` total (device-to-host
/// transfer volume, per the naming).
///
/// The total saturates at `usize::MAX` rather than overflowing: a plain `+=`
/// would panic in debug builds and silently wrap in release builds, and a
/// telemetry counter should neither abort the computation nor jump back to a
/// small value.
#[inline]
pub fn telemetry_record_d2h(bytes: usize) {
    telemetry_with(|t| t.d2h_bytes = t.d2h_bytes.saturating_add(bytes));
}
/// Increments the calling thread's `factorization_count` by one.
#[inline]
pub fn telemetry_record_factorization() {
    telemetry_with(|state| {
        state.factorization_count += 1;
    });
}
#[inline]
pub fn telemetry_record_handle_creation(context_id: usize) {
telemetry_with(|t| {
t.handle_creation_count += 1;
t.context_id = context_id;
});
}
/// Increments the calling thread's `kernel_launch_count` by one.
#[inline]
pub fn telemetry_record_kernel_launch() {
    telemetry_with(|state| {
        let launches = &mut state.kernel_launch_count;
        *launches += 1;
    });
}
/// Records one CPU fallback on the calling thread together with its reason.
///
/// `cpu_fallback_count` goes up by one and `reason`, converted to an owned
/// `String`, is appended to `cpu_fallback_reasons`.
#[inline]
pub fn telemetry_record_cpu_fallback(reason: impl Into<String>) {
    telemetry_with(|state| {
        state.cpu_fallback_count += 1;
        let reason: String = reason.into();
        state.cpu_fallback_reasons.push(reason);
    });
}
#[must_use]
pub fn telemetry_snapshot() -> GpuExecutionTelemetry {
telemetry_with(|t| t.clone())
}
/// Resets the calling thread's telemetry record to its `Default` state
/// (all counters zero, no fallback reasons).
pub fn telemetry_reset() {
    let blank = GpuExecutionTelemetry::default();
    telemetry_with(|state| *state = blank);
}