pub(super) mod gpu_profile {
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;
use std::time::Instant;
static ENABLED: AtomicBool = AtomicBool::new(false);
static INITIALIZED: AtomicBool = AtomicBool::new(false);
pub struct GpuTimings {
pub wall_ms: f64,
pub cpu_encode_ms: f64,
pub gpu_exec_ms: f64,
}
static STATS: Mutex<Vec<GpuTimings>> = Mutex::new(Vec::new());
pub fn is_enabled() -> bool {
if !INITIALIZED.load(Ordering::Relaxed) {
INITIALIZED.store(true, Ordering::Relaxed);
if std::env::var("OXIBONSAI_PROFILE_GPU").is_ok() {
ENABLED.store(true, Ordering::Relaxed);
}
}
ENABLED.load(Ordering::Relaxed)
}
pub unsafe fn gpu_cmd_times(cmd_buf: &metal::CommandBufferRef) -> (f64, f64) {
let start: f64 = msg_send![cmd_buf, GPUStartTime];
let end: f64 = msg_send![cmd_buf, GPUEndTime];
(start, end)
}
pub fn record_and_print(
wall_start: Instant,
encode_end: Instant,
gpu_start: f64,
gpu_end: f64,
) {
let wall_end = Instant::now();
let wall_ms = wall_end.duration_since(wall_start).as_secs_f64() * 1000.0;
let cpu_encode_ms = encode_end.duration_since(wall_start).as_secs_f64() * 1000.0;
let gpu_exec_ms = (gpu_end - gpu_start) * 1000.0;
let overhead_ms = (wall_ms - cpu_encode_ms - gpu_exec_ms).max(0.0);
if let Ok(mut stats) = STATS.lock() {
let token_num = stats.len();
eprintln!(
"[GPU Profile] token={} wall={:.1}ms cpu_encode={:.1}ms gpu_exec={:.1}ms overhead={:.1}ms",
token_num, wall_ms, cpu_encode_ms, gpu_exec_ms, overhead_ms,
);
stats.push(GpuTimings {
wall_ms,
cpu_encode_ms,
gpu_exec_ms,
});
}
}
pub fn print_summary(model_size_bytes: u64) {
if let Ok(stats) = STATS.lock() {
if stats.is_empty() {
return;
}
let n = stats.len() as f64;
let avg_wall: f64 = stats.iter().map(|s| s.wall_ms).sum::<f64>() / n;
let avg_cpu: f64 = stats.iter().map(|s| s.cpu_encode_ms).sum::<f64>() / n;
let avg_gpu: f64 = stats.iter().map(|s| s.gpu_exec_ms).sum::<f64>() / n;
let avg_overhead = (avg_wall - avg_cpu - avg_gpu).max(0.0);
let gpu_bw = if avg_gpu > 0.0 {
(model_size_bytes as f64) / (avg_gpu / 1000.0) / 1e9
} else {
0.0
};
eprintln!(
"[GPU Profile Summary] tokens={} avg: wall={:.1}ms cpu={:.1}ms gpu={:.1}ms overhead={:.1}ms gpu_bw={:.1}GB/s",
stats.len() as u64, avg_wall, avg_cpu, avg_gpu, avg_overhead, gpu_bw,
);
}
}
}
pub fn print_gpu_profile_summary(model_size_bytes: u64) {
if gpu_profile::is_enabled() {
gpu_profile::print_summary(model_size_bytes);
}
}