gpur 0.5.0 - Docs.rs

//! AMD backend. Linux: sysfs/amdgpu. Windows: ADLX (not yet implemented).

use super::GpuBackend;

pub fn probe() -> Option<Box<dyn GpuBackend>> {
    #[cfg(target_os = "linux")]
    if let Some(b) = linux_impl::probe() {
        return Some(b);
    }
    // TODO Windows: ADLX bindings.
    None
}

#[cfg(target_os = "linux")]
mod linux_impl {
    use crate::backend::linux::{
        self, card_name, cards_with_vendor, first_dir, pdev_of, read_trim, read_u64,
    };
    use crate::backend::{GpuBackend, GpuProcess, GpuSnapshot, ProcKind};
    use anyhow::Result;
    use std::collections::{HashMap, HashSet};
    use std::fs;
    use std::path::{Path, PathBuf};
    use std::time::Instant;

    const AMD_VENDOR: &str = "0x1002";

    pub fn probe() -> Option<Box<dyn GpuBackend>> {
        let devices = scan("/sys/class/drm");
        if devices.is_empty() {
            return None;
        }
        Some(Box::new(AmdBackend {
            devices,
            engine_state: HashMap::new(),
            last_procs: Vec::new(),
        }))
    }

    struct AmdDevice {
        name: String,
        dev: PathBuf,
        hwmon: Option<PathBuf>,
        /// PCI address ("0000:75:00.0"), matched against fdinfo drm-pdev.
        pdev: Option<String>,
        /// Critical edge temperature (°C), for the throttle heuristic.
        temp_crit_c: Option<f64>,
    }

    struct AmdBackend {
        devices: Vec<AmdDevice>,
        /// (pid, drm-client-id) -> (total engine ns, video engine ns) at last scan.
        engine_state: HashMap<(u32, u64), (u64, u64, Instant)>,
        /// Built during poll's fdinfo sweep, served by processes().
        last_procs: Vec<GpuProcess>,
    }

    /// VCN/media engines in amdgpu fdinfo naming.
    fn is_video_engine(name: &str) -> bool {
        name.starts_with("dec")
            || name.starts_with("enc")
            || name.starts_with("jpeg")
            || name.starts_with("vcn")
            || name.starts_with("vpe")
    }

    impl GpuBackend for AmdBackend {
        fn name(&self) -> &'static str {
            "amdgpu"
        }

        fn poll(&mut self) -> Result<Vec<GpuSnapshot>> {
            // One fdinfo sweep per poll: per-process rows + device video util
            // (sysfs has gpu_busy_percent but nothing for the VCN engines).
            let video_util = self.sweep_clients();
            Ok(self
                .devices
                .iter()
                .enumerate()
                .map(|(i, d)| sample(d, video_util.get(&i).copied()))
                .collect())
        }

        fn processes(&mut self) -> Vec<GpuProcess> {
            self.last_procs.clone()
        }
    }

    impl AmdBackend {
        /// Returns per-device video-engine utilization; refreshes last_procs.
        fn sweep_clients(&mut self) -> HashMap<usize, f64> {
            let pdev_to_gpu: HashMap<&str, usize> = self
                .devices
                .iter()
                .enumerate()
                .filter_map(|(i, d)| d.pdev.as_deref().map(|p| (p, i)))
                .collect();

            // (pid, gpu) -> aggregated stats across that process's DRM clients.
            let mut agg: HashMap<(u32, usize), (f64, u64, bool)> = HashMap::new();
            let mut video_util: HashMap<usize, f64> = HashMap::new();
            let mut seen_clients: HashSet<(u32, u64)> = HashSet::new();
            let now = Instant::now();

            for pid in linux::proc_pids() {
                for client in linux::drm_clients(pid, "amdgpu") {
                    let Some(&gpu) = client.pdev.as_deref().and_then(|p| pdev_to_gpu.get(p)) else {
                        continue;
                    };
                    // The same client shows once per duplicated fd — count once.
                    if !seen_clients.insert((pid, client.id)) {
                        continue;
                    }
                    let engine_ns = client.total_engine_ns();
                    let video_ns: u64 = client
                        .engine_ns
                        .iter()
                        .filter(|(k, _)| is_video_engine(k))
                        .map(|(_, v)| *v)
                        .sum();
                    let (util, vutil) = match self.engine_state.get(&(pid, client.id)) {
                        Some((prev_ns, prev_video, prev_at)) => {
                            let wall = now.duration_since(*prev_at).as_nanos() as f64;
                            if wall > 0.0 {
                                (
                                    (engine_ns.saturating_sub(*prev_ns) as f64 / wall * 100.0)
                                        .clamp(0.0, 100.0),
                                    (video_ns.saturating_sub(*prev_video) as f64 / wall * 100.0)
                                        .clamp(0.0, 100.0),
                                )
                            } else {
                                (0.0, 0.0)
                            }
                        }
                        None => (0.0, 0.0),
                    };
                    self.engine_state
                        .insert((pid, client.id), (engine_ns, video_ns, now));

                    *video_util.entry(gpu).or_default() += vutil;
                    let e = agg.entry((pid, gpu)).or_insert((0.0, 0, false));
                    e.0 += util;
                    e.1 += client.memory.get("vram").copied().unwrap_or(0);
                    e.2 |= client.engine_ns.get("gfx").copied().unwrap_or(0) > 0;
                }
            }

            // Forget clients that vanished so the map doesn't grow forever.
            self.engine_state.retain(|k, _| seen_clients.contains(k));

            self.last_procs = agg
                .into_iter()
                .map(|((pid, gpu_index), (util, vram, graphics))| GpuProcess {
                    pid,
                    gpu_index,
                    kind: if graphics {
                        ProcKind::Graphics
                    } else {
                        ProcKind::Compute
                    },
                    gpu_util_pct: Some(util.min(100.0)),
                    gpu_mem_bytes: vram,
                })
                .collect();
            video_util
        }
    }

    fn scan(drm: &str) -> Vec<AmdDevice> {
        cards_with_vendor(drm, AMD_VENDOR)
            .into_iter()
            .map(|(idx, dev)| {
                let name = card_name(&dev, idx, "1002", "AMD");
                let hwmon = first_dir(&dev.join("hwmon"));
                let pdev = pdev_of(&dev);
                let temp_crit_c = hwmon
                    .as_deref()
                    .and_then(|h| read_u64(&h.join("temp1_crit")))
                    .map(|v| v as f64 / 1000.0);
                AmdDevice {
                    name,
                    dev,
                    hwmon,
                    pdev,
                    temp_crit_c,
                }
            })
            .collect()
    }

    fn sample(d: &AmdDevice, video_util: Option<f64>) -> GpuSnapshot {
        let h = d.hwmon.as_deref();
        let temperature_c = hwmon_u64(h, "temp1_input").map(|v| v as f64 / 1000.0);
        let power_w = hwmon_u64(h, "power1_average")
            .or_else(|| hwmon_u64(h, "power1_input"))
            .map(|v| v as f64 / 1e6);
        let power_limit_w = hwmon_u64(h, "power1_cap")
            .filter(|v| *v > 0)
            .or_else(|| hwmon_u64(h, "power1_cap_default").filter(|v| *v > 0))
            .map(|v| v as f64 / 1e6);

        // Heuristic (amdgpu's real throttle bits live in the versioned
        // gpu_metrics blob): flag when pinned at the power cap or within a
        // few degrees of the critical temperature.
        let mut throttle_parts: Vec<&str> = Vec::new();
        if let (Some(t), Some(crit)) = (temperature_c, d.temp_crit_c)
            && t >= crit - 3.0
        {
            throttle_parts.push("thermal");
        }
        if let (Some(w), Some(cap)) = (power_w, power_limit_w)
            && w >= cap * 0.99
        {
            throttle_parts.push("power-limit");
        }
        let throttle = if throttle_parts.is_empty() {
            None
        } else {
            Some(throttle_parts.join("+"))
        };

        GpuSnapshot {
            name: d.name.clone(),
            integrated: is_apu(&d.dev),
            utilization_pct: read_u64(&d.dev.join("gpu_busy_percent")).unwrap_or(0) as f64,
            mem_util_pct: read_u64(&d.dev.join("mem_busy_percent")).map(|v| v as f64),
            video_util_pct: video_util.map(|v| v.min(100.0)),
            enc_util_pct: None,
            dec_util_pct: None,
            throttle,
            vram_used_bytes: read_u64(&d.dev.join("mem_info_vram_used")).unwrap_or(0),
            vram_total_bytes: read_u64(&d.dev.join("mem_info_vram_total")).unwrap_or(0),
            // temp1 = edge sensor (millidegrees); power in microwatts with
            // the APU fallback and cap-of-0 handling done above.
            temperature_c,
            power_w,
            power_limit_w,
            fan_pct: fan_pct(h),
            // Hz. Reads 0 when the clock domain is power-gated at idle.
            clock_mhz: hwmon_u64(h, "freq1_input")
                .map(|v| v / 1_000_000)
                .or_else(|| dpm_active_mhz(&d.dev.join("pp_dpm_sclk"))),
            mem_clock_mhz: hwmon_u64(h, "freq2_input")
                .map(|v| v / 1_000_000)
                // APUs have no freq2_input; the active DPM level has it.
                .or_else(|| dpm_active_mhz(&d.dev.join("pp_dpm_mclk"))),
            pcie_gen: read_trim(&d.dev.join("current_link_speed"))
                .as_deref()
                .and_then(gts_to_gen),
            pcie_width: read_trim(&d.dev.join("current_link_width")).and_then(|w| w.parse().ok()),
            pcie_max_gen: read_trim(&d.dev.join("max_link_speed"))
                .as_deref()
                .and_then(gts_to_gen),
            pcie_max_width: read_trim(&d.dev.join("max_link_width")).and_then(|w| w.parse().ok()),
            // amdgpu does not expose PCIe throughput counters.
            pcie_rx_kbs: None,
            pcie_tx_kbs: None,
        }
    }

    /// gpu_metrics header byte 2 is the format revision: v1_x = discrete,
    /// v2_x/v3_x = APU. Missing file -> assume discrete.
    fn is_apu(dev: &Path) -> bool {
        fs::read(dev.join("gpu_metrics"))
            .ok()
            .and_then(|b| b.get(2).copied())
            .is_some_and(|rev| rev >= 2)
    }

    /// "16.0 GT/s PCIe" -> Some(4). Gen1=2.5, doubling each gen after Gen2.
    fn gts_to_gen(speed: &str) -> Option<u8> {
        let gts: f64 = speed.split_whitespace().next()?.parse().ok()?;
        Some(match gts {
            s if s >= 128.0 => 7,
            s if s >= 64.0 => 6,
            s if s >= 32.0 => 5,
            s if s >= 16.0 => 4,
            s if s >= 8.0 => 3,
            s if s >= 5.0 => 2,
            _ => 1,
        })
    }

    /// Parse the '*'-marked active level of a pp_dpm_{s,m}clk table:
    /// "1: 3000Mhz *" -> Some(3000).
    fn dpm_active_mhz(path: &Path) -> Option<u64> {
        let table = read_trim(path)?;
        let active = table.lines().find(|l| l.trim_end().ends_with('*'))?;
        let digits: String = active
            .split(':')
            .nth(1)?
            .trim()
            .chars()
            .take_while(char::is_ascii_digit)
            .collect();
        digits.parse().ok()
    }

    fn fan_pct(h: Option<&Path>) -> Option<f64> {
        let pwm = hwmon_u64(h, "pwm1")?;
        let max = hwmon_u64(h, "pwm1_max").filter(|v| *v > 0).unwrap_or(255);
        Some(pwm as f64 / max as f64 * 100.0)
    }

    fn hwmon_u64(hwmon: Option<&Path>, file: &str) -> Option<u64> {
        read_u64(&hwmon?.join(file))
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn pcie_gen_from_gts_string() {
            assert_eq!(gts_to_gen("2.5 GT/s PCIe"), Some(1));
            assert_eq!(gts_to_gen("8.0 GT/s PCIe"), Some(3));
            assert_eq!(gts_to_gen("16.0 GT/s PCIe"), Some(4));
            assert_eq!(gts_to_gen("32.0 GT/s PCIe"), Some(5));
            assert_eq!(gts_to_gen("garbage"), None);
        }

        #[test]
        fn dpm_table_active_level_parses() {
            let dir = std::env::temp_dir().join("gpur-dpm-test");
            std::fs::create_dir_all(&dir).unwrap();
            let f = dir.join("pp_dpm_mclk");
            std::fs::write(&f, "0: 96Mhz\n1: 3000Mhz *\n2: 1249Mhz\n").unwrap();
            assert_eq!(dpm_active_mhz(&f), Some(3000));
            std::fs::write(&f, "S: 0Mhz *\n").unwrap();
            assert_eq!(dpm_active_mhz(&f), Some(0));
        }

        #[test]
        #[ignore = "requires AMD hardware; run with --ignored --nocapture"]
        fn live_poll_reports_devices() {
            let mut backend = probe().expect("no amdgpu devices visible in /sys/class/drm");
            let gpus = backend.poll().unwrap();
            assert!(!gpus.is_empty());
            for g in &gpus {
                println!(
                    "{}: util={}% vram={}/{}MiB temp={:?}C power={:?}W fan={:?}% core={:?}MHz mem={:?}MHz",
                    g.name,
                    g.utilization_pct,
                    g.vram_used_bytes / 1024 / 1024,
                    g.vram_total_bytes / 1024 / 1024,
                    g.temperature_c,
                    g.power_w,
                    g.fan_pct,
                    g.clock_mhz,
                    g.mem_clock_mhz,
                );
                assert!(g.vram_total_bytes > 0, "vram total should be nonzero");
            }
            // Two samples so engine-ns deltas can produce utilization.
            let _ = backend.processes();
            std::thread::sleep(std::time::Duration::from_millis(300));
            backend.poll().unwrap();
            let procs = backend.processes();
            for p in &procs {
                println!(
                    "pid={} gpu={} kind={:?} util={:?}% vram={}MiB",
                    p.pid,
                    p.gpu_index,
                    p.kind,
                    p.gpu_util_pct,
                    p.gpu_mem_bytes / 1024 / 1024,
                );
            }
            println!("{} gpu processes", procs.len());
        }
    }
}