vor 0.2.0

Cross-platform performance instrumentation with an in-app egui panel and live system and GPU metrics.
Documentation
//! macOS GPU backend. Both sources are `sudo`-free.
//!
//! - Utilization (`Device Utilization %`), SM (`Renderer Utilization %`)
//!   and memory in use (`In use system memory`) come from the
//!   `IOAccelerator` registry entry's `PerformanceStatistics`
//!   dictionary, re-read each poll.
//! - Power is ΔEnergy / Δt over the `GPU Energy` channel of the
//!   private IOReport framework's `Energy Model` group. IOReport
//!   ships only inside the dyld shared cache on recent macOS, so it
//!   is bound at runtime via `dlopen` rather than linked.
//!
//! PCIe throughput has no meaning on unified-memory Apple Silicon,
//! so that field stays zero.

use std::ffi::{CStr, c_char, c_void};
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Instant;

use core_foundation::array::{CFArrayGetCount, CFArrayGetValueAtIndex, CFArrayRef};
use core_foundation::base::{CFEqual, CFRelease, CFTypeRef, TCFType, kCFAllocatorDefault};
use core_foundation::dictionary::{CFDictionaryGetValue, CFDictionaryRef, CFMutableDictionaryRef};
use core_foundation::number::{CFNumberGetValue, CFNumberRef, kCFNumberSInt64Type};
use core_foundation::string::{CFString, CFStringRef};
use io_kit_sys::types::io_registry_entry_t;
use io_kit_sys::{
    IOIteratorNext, IOObjectRelease, IORegistryEntryCreateCFProperties,
    IOServiceGetMatchingServices, IOServiceMatching, kIOMasterPortDefault,
};

use super::GpuReading;

// Function-pointer signatures for the IOReport entry points that
// `IoReport::new` resolves with `dlsym`. IOReport is a private framework,
// so the argument meanings noted here are inferred only from how this
// file calls each function — TODO confirm against a known-good header.

/// Bound to `IOReportCopyChannelsInGroup`. Called here with a group
/// name, a null second string and three zero `u64`s; yields a channel
/// dictionary.
type CopyChannelsInGroup =
    unsafe extern "C" fn(CFStringRef, CFStringRef, u64, u64, u64) -> CFMutableDictionaryRef;
/// Bound to `IOReportCreateSubscription`. Called here with a null first
/// argument, the desired channels, an out-pointer that receives a
/// channel dictionary, `0`, and null; yields an opaque subscription.
type CreateSubscription = unsafe extern "C" fn(
    *const c_void,
    CFMutableDictionaryRef,
    *mut CFMutableDictionaryRef,
    u64,
    CFTypeRef,
) -> *const c_void;
/// Bound to `IOReportCreateSamples`. Called here with the subscription,
/// its channel dictionary, and null; yields a sample dictionary.
type CreateSamples =
    unsafe extern "C" fn(*const c_void, CFMutableDictionaryRef, CFTypeRef) -> CFDictionaryRef;
/// Bound to `IOReportCreateSamplesDelta`. Called here with the previous
/// sample, the current sample, and null; yields a delta dictionary.
type CreateSamplesDelta =
    unsafe extern "C" fn(CFDictionaryRef, CFDictionaryRef, CFTypeRef) -> CFDictionaryRef;
/// Shared shape of `IOReportChannelGetChannelName` and
/// `IOReportChannelGetUnitLabel`: channel dictionary in, string out.
type ChannelGetStr = unsafe extern "C" fn(CFDictionaryRef) -> CFStringRef;
/// Bound to `IOReportSimpleGetIntegerValue`. Called here with a channel
/// dictionary and `0` (meaning of the `i32` not visible here — TODO confirm).
type SimpleGetIntegerValue = unsafe extern "C" fn(CFDictionaryRef, i32) -> i64;

/// Runtime-bound IOReport state used to derive GPU power.
///
/// Keeps the resolved function pointers, the subscription with its
/// channel dictionary, and the previous sample plus its timestamp so
/// each poll can turn an energy delta into watts.
struct IoReport {
    /// Resolved `IOReportCreateSamples`.
    create_samples: CreateSamples,
    /// Resolved `IOReportCreateSamplesDelta`.
    create_delta: CreateSamplesDelta,
    /// Resolved `IOReportChannelGetChannelName`.
    channel_name: ChannelGetStr,
    /// Resolved `IOReportChannelGetUnitLabel`.
    channel_unit: ChannelGetStr,
    /// Resolved `IOReportSimpleGetIntegerValue`.
    simple_int: SimpleGetIntegerValue,
    /// Opaque handle returned by `IOReportCreateSubscription`.
    subscription: *const c_void,
    /// Channel dictionary written through the subscription call's
    /// out-pointer; passed back on every sample.
    channels: CFMutableDictionaryRef,
    /// Most recent sample; the baseline for the next delta.
    prev: CFDictionaryRef,
    /// When `prev` was taken.
    prev_at: Instant,
    /// Key `"IOReportChannels"` used to pull the channel array out of a delta.
    k_channels: CFString,
    /// Channel name `"GPU Energy"` that the delta is searched for.
    k_gpu_energy: CFString,
}

/// One read of the `IOAccelerator` PerformanceStatistics dict.
///
/// Each field is 0 when its key is absent from the dictionary.
struct AccelStats {
    /// Value of `Device Utilization %`.
    util: f32,
    /// Value of `Renderer Utilization %`.
    sm: f32,
    /// Value of `In use system memory`.
    mem_bytes: u64,
}

/// macOS GPU sampler: pairs an `IOAccelerator` registry entry (for
/// utilization and memory) with an [`IoReport`] subscription (for power).
pub(super) struct Sampler {
    /// Registry entry obtained from `first_accelerator`.
    accelerator: io_registry_entry_t,
    /// IOReport state used to compute GPU power.
    energy: IoReport,
    /// Key `"PerformanceStatistics"` in the accelerator's properties.
    k_perf: CFString,
    /// Key `"Device Utilization %"` inside the performance dictionary.
    k_device: CFString,
    /// Key `"Renderer Utilization %"` inside the performance dictionary.
    k_renderer: CFString,
    /// Key `"In use system memory"` inside the performance dictionary.
    k_mem: CFString,
}

impl Sampler {
    /// `Some` on every real Mac (there is always an `IOAccelerator`).
    /// The `Option` only matches the NVIDIA backend's signature so the
    /// shared collector handles both; a Mac with no accelerator is
    /// genuinely broken and still panics inside `first_accelerator`.
    pub(super) fn new() -> Option<Self> {
        let accelerator = unsafe { first_accelerator() };
        let energy = unsafe { IoReport::new() };
        Some(Self {
            accelerator,
            energy,
            k_perf: CFString::new("PerformanceStatistics"),
            k_device: CFString::new("Device Utilization %"),
            k_renderer: CFString::new("Renderer Utilization %"),
            k_mem: CFString::new("In use system memory"),
        })
    }

    pub(super) fn poll(&mut self) -> GpuReading {
        let AccelStats {
            util,
            sm,
            mem_bytes,
        } = self.accel_stats();
        let power_w = self.energy.gpu_power_w();
        GpuReading {
            util,
            sm,
            pcie_bps: 0,
            power_w,
            mem_bytes,
            // Temperature / clock have no clean public source on Apple
            // Silicon; left zero (the rows are cuda-gated anyway).
            temp_c: 0.0,
            clock_mhz: 0.0,
        }
    }

    fn accel_stats(&self) -> AccelStats {
        unsafe {
            let mut props: CFMutableDictionaryRef = std::ptr::null_mut();
            let kr = IORegistryEntryCreateCFProperties(
                self.accelerator,
                &mut props,
                kCFAllocatorDefault,
                0,
            );
            assert_eq!(kr, 0);
            let perf = CFDictionaryGetValue(props, self.k_perf.as_concrete_TypeRef().cast());
            assert!(!perf.is_null());
            let perf = perf as CFDictionaryRef;
            let stats = AccelStats {
                util: dict_i64(perf, &self.k_device) as f32,
                sm: dict_i64(perf, &self.k_renderer) as f32,
                mem_bytes: dict_i64(perf, &self.k_mem) as u64,
            };
            CFRelease(props.cast());
            stats
        }
    }
}

impl IoReport {
    /// Binds IOReport at runtime, subscribes to the `Energy Model`
    /// group, and takes the baseline sample for the first delta.
    ///
    /// # Panics
    /// If the library, any symbol, the channel list, the subscription,
    /// or the first sample cannot be obtained.
    ///
    /// # Safety
    /// NOTE(review): soundness rests on the type aliases in this file
    /// matching the real IOReport ABI, which cannot be checked here.
    unsafe fn new() -> Self {
        unsafe {
            // The handle is never `dlclose`d, so the resolved function
            // pointers stay valid for the life of the process.
            let lib = libc::dlopen(c"/usr/lib/libIOReport.dylib".as_ptr(), libc::RTLD_NOW);
            assert!(!lib.is_null());
            let copy_channels: CopyChannelsInGroup = load(lib, c"IOReportCopyChannelsInGroup");
            let create_subscription: CreateSubscription = load(lib, c"IOReportCreateSubscription");
            let create_samples: CreateSamples = load(lib, c"IOReportCreateSamples");
            let create_delta: CreateSamplesDelta = load(lib, c"IOReportCreateSamplesDelta");
            let channel_name: ChannelGetStr = load(lib, c"IOReportChannelGetChannelName");
            let channel_unit: ChannelGetStr = load(lib, c"IOReportChannelGetUnitLabel");
            let simple_int: SimpleGetIntegerValue = load(lib, c"IOReportSimpleGetIntegerValue");

            let group = CFString::new("Energy Model");
            // NOTE(review): `desired` comes from a "Copy"-named call and is
            // not released in this function — confirm whether the
            // subscription takes ownership or this is a one-off leak.
            let desired = copy_channels(group.as_concrete_TypeRef(), std::ptr::null(), 0, 0, 0);
            assert!(!desired.is_null());
            // `channels` is filled in through the out-pointer and is the
            // dictionary used for every later sample.
            let mut channels: CFMutableDictionaryRef = std::ptr::null_mut();
            let subscription = create_subscription(
                std::ptr::null(),
                desired,
                &mut channels,
                0,
                std::ptr::null(),
            );
            assert!(!subscription.is_null());
            assert!(!channels.is_null());
            // Baseline sample: the first `gpu_power_w` call diffs against it.
            let prev = create_samples(subscription, channels, std::ptr::null());
            assert!(!prev.is_null());
            // NOTE(review): nothing visible in this file releases
            // `subscription`, `channels` or `prev` when the value is
            // dropped — fine for a process-lifetime sampler, otherwise a leak.
            Self {
                create_samples,
                create_delta,
                channel_name,
                channel_unit,
                simple_int,
                subscription,
                channels,
                prev,
                prev_at: Instant::now(),
                k_channels: CFString::new("IOReportChannels"),
                k_gpu_energy: CFString::new("GPU Energy"),
            }
        }
    }

    /// Average GPU power in watts since the previous call (or since
    /// construction, on the first call).
    ///
    /// Takes a new sample, diffs it against the stored one, converts the
    /// `GPU Energy` channel's delta to joules and divides by the elapsed
    /// seconds. Returns 0 when no `GPU Energy` channel is present or no
    /// time has elapsed.
    ///
    /// # Panics
    /// If sampling or diffing fails, the delta has no `IOReportChannels`
    /// entry, or the channel's unit label is not recognised.
    fn gpu_power_w(&mut self) -> f32 {
        unsafe {
            let now = Instant::now();
            let dt = now.duration_since(self.prev_at).as_secs_f64();
            let current = (self.create_samples)(self.subscription, self.channels, std::ptr::null());
            assert!(!current.is_null());
            let delta = (self.create_delta)(self.prev, current, std::ptr::null());
            assert!(!delta.is_null());
            // The old baseline is done with once the delta exists;
            // `current` becomes the baseline for the next poll.
            CFRelease(self.prev.cast());
            self.prev = current;
            self.prev_at = now;

            // `chans` is borrowed from `delta` and must not outlive it.
            let chans = CFDictionaryGetValue(delta, self.k_channels.as_concrete_TypeRef().cast());
            assert!(!chans.is_null());
            let chans = chans as CFArrayRef;
            let mut joules = 0.0;
            for i in 0..CFArrayGetCount(chans) {
                let ch = CFArrayGetValueAtIndex(chans, i) as CFDictionaryRef;
                let name = (self.channel_name)(ch);
                // `CFEqual` yields 0 for "not equal": skip other channels.
                if CFEqual(name.cast(), self.k_gpu_energy.as_concrete_TypeRef().cast()) == 0 {
                    continue;
                }
                let raw = (self.simple_int)(ch, 0);
                // Get rule: the label is not owned by us, so the wrapper
                // takes its own reference for the duration of the copy.
                let unit = CFString::wrap_under_get_rule((self.channel_unit)(ch)).to_string();
                // Only the first matching channel is counted.
                joules = raw as f64 * joules_per(&unit);
                break;
            }
            CFRelease(delta.cast());
            // Checked after the baseline was advanced, so a zero interval
            // still moves the baseline forward.
            if dt <= 0.0 {
                return 0.0;
            }
            (joules / dt) as f32
        }
    }
}

/// Looks up the first `IOAccelerator` service in the I/O Registry.
///
/// The returned entry is handed to the caller un-released; the iterator
/// used to find it is released before returning.
///
/// # Panics
/// If the lookup fails, yields no iterator, or the iterator is empty.
///
/// # Safety
/// Calls straight into IOKit; the caller becomes responsible for the
/// returned registry entry.
unsafe fn first_accelerator() -> io_registry_entry_t {
    let class_name = c"IOAccelerator".as_ptr().cast::<c_char>();

    // SAFETY: `class_name` is a NUL-terminated literal and `services`
    // is a valid out-location for the iterator handle.
    let services = unsafe {
        let mut services = 0;
        let status = IOServiceGetMatchingServices(
            kIOMasterPortDefault,
            IOServiceMatching(class_name),
            &mut services,
        );
        assert_eq!(status, 0);
        services
    };
    assert_ne!(services, 0);

    // SAFETY: `services` was just asserted to be a non-zero iterator.
    let accelerator = unsafe { IOIteratorNext(services) };
    assert_ne!(accelerator, 0);

    // SAFETY: the iterator is no longer needed once the first entry is taken.
    unsafe { IOObjectRelease(services) };
    accelerator
}

/// Resolves `name` in `lib` and reinterprets the symbol address as `T`.
///
/// # Safety
/// `lib` must be a live handle from `dlopen`, and `T` must be a
/// function-pointer type whose signature matches the symbol's real ABI.
///
/// # Panics
/// If `T` is not pointer-sized or the symbol is not found.
unsafe fn load<T>(lib: *mut c_void, name: &CStr) -> T {
    // `transmute_copy` reads `size_of::<T>()` bytes from `&p` without any
    // size check, so a wider `T` would read past the local pointer.
    assert_eq!(
        std::mem::size_of::<T>(),
        std::mem::size_of::<*mut c_void>(),
        "IOReport symbol type must be pointer-sized: {name:?}"
    );
    // SAFETY: `name` is NUL-terminated, `p` is checked for null, and the
    // assertion above guarantees the copy stays within `p`.
    unsafe {
        let p = libc::dlsym(lib, name.as_ptr());
        assert!(!p.is_null(), "missing IOReport symbol: {name:?}");
        std::mem::transmute_copy(&p)
    }
}

/// Reads `key` from `dict` as a signed 64-bit integer.
///
/// A missing key yields 0: `PerformanceStatistics` keys differ between
/// GPU generations. The first miss (process-wide, whichever key it is)
/// is logged so a flat row can be told apart from an idle GPU.
///
/// # Panics
/// If the stored value cannot be converted by `CFNumberGetValue`.
///
/// # Safety
/// `dict` must be a valid dictionary, and the value under `key`, when
/// present, must be a `CFNumber`.
unsafe fn dict_i64(dict: CFDictionaryRef, key: &CFString) -> i64 {
    static WARNED: AtomicBool = AtomicBool::new(false);

    // SAFETY: the caller guarantees `dict` is valid; the key pointer is
    // borrowed from a live `CFString`.
    let v = unsafe { CFDictionaryGetValue(dict, key.as_concrete_TypeRef().cast()) };

    if !v.is_null() {
        let mut out: i64 = 0;
        // SAFETY: `v` is non-null and `out` is a writable `i64`, which
        // is what `kCFNumberSInt64Type` asks for.
        let ok = unsafe {
            CFNumberGetValue(
                v as CFNumberRef,
                kCFNumberSInt64Type,
                std::ptr::addr_of_mut!(out).cast(),
            )
        };
        assert!(ok);
        return out;
    }

    // Key absent: report only the first miss, then fall back to zero.
    let already_warned = WARNED.swap(true, Ordering::Relaxed);
    if !already_warned {
        tracing::warn!("IOAccelerator PerformanceStatistics missing {key}");
    }
    0
}

/// Joules represented by one count of an IOReport energy channel whose
/// unit label is `unit`.
///
/// Surrounding whitespace in the label is ignored, so a padded label
/// such as `"mJ "` resolves instead of panicking the poller.
///
/// # Panics
/// If the (trimmed) label is not one of `nJ`, `uJ`/`µJ`, `mJ` or `J`.
fn joules_per(unit: &str) -> f64 {
    match unit.trim() {
        "nJ" => 1e-9,
        "uJ" | "µJ" => 1e-6,
        "mJ" => 1e-3,
        "J" => 1.0,
        other => panic!("unexpected IOReport energy unit: {other}"),
    }
}