1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
use super::{Adaptor, MetricMetadata};
use crate::metric::{Metric, MetricEntry};
use nvml_wrapper::Nvml;

// Passed as the first argument of every `MetricEntry` this file builds.
static NAME: &str = "CUDA";

/// Tracks basic CUDA device information (memory usage and GPU utilization) queried
/// through NVML.
pub struct CUDAMetric {
    /// NVML handle used to query the devices; `None` when NVML could not be initialized.
    nvml: Option<Nvml>,
}

impl CUDAMetric {
    /// Creates a new metric for CUDA.
    ///
    /// NVML initialization is attempted once here. On failure a warning is logged and
    /// the metric is still constructed, but without an NVML handle.
    pub fn new() -> Self {
        let nvml = match Nvml::init() {
            Ok(handle) => Some(handle),
            Err(err) => {
                log::warn!("Unable to initialize CUDA Metric: {err}");
                None
            }
        };

        Self { nvml }
    }
}

impl Default for CUDAMetric {
    fn default() -> Self {
        Self::new()
    }
}

/// Blanket implementation: any type can be adapted to the unit input `()`.
impl<T> Adaptor<()> for T {
    /// Does nothing and yields `()`; there is no data to extract for a unit input.
    fn adapt(&self) {}
}

impl Metric for CUDAMetric {
    type Input = ();

    /// Builds an entry describing memory usage and utilization of every CUDA device.
    ///
    /// The input item and the metadata are ignored; all values are queried from NVML.
    ///
    /// For each device, the formatted string gets
    /// ` GPU #<index> - Memory <used>/<total> Gb - Usage <percent>%` appended, and the
    /// raw string gets `<used>/<total> ` appended (unrounded values), so both strings
    /// cover all devices in index order.
    ///
    /// If NVML was not initialized, or any NVML query fails (a warning is logged in
    /// that case), an entry whose formatted and raw values are both `"Unavailable"`
    /// is returned instead.
    fn update(&mut self, _item: &(), _metadata: &MetricMetadata) -> MetricEntry {
        let not_available = || {
            MetricEntry::new(
                NAME.to_string(),
                "Unavailable".to_string(),
                "Unavailable".to_string(),
            )
        };

        let nvml = match &self.nvml {
            Some(nvml) => nvml,
            None => return not_available(),
        };

        let device_count = match nvml.device_count() {
            Ok(val) => val,
            Err(err) => {
                log::warn!("Unable to get the number of cuda devices: {err}");
                return not_available();
            }
        };

        let mut formatted = String::new();
        let mut raw_running = String::new();

        for index in 0..device_count {
            let device = match nvml.device_by_index(index) {
                Ok(val) => val,
                Err(err) => {
                    log::warn!("Unable to get device {index}: {err}");
                    return not_available();
                }
            };
            let memory_info = match device.memory_info() {
                Ok(info) => info,
                Err(err) => {
                    log::warn!("Unable to get memory info from device {index}: {err}");
                    return not_available();
                }
            };
            let utilization_rates = match device.utilization_rates() {
                Ok(rate) => rate,
                Err(err) => {
                    log::warn!("Unable to get utilization rates from device {index}: {err}");
                    return not_available();
                }
            };

            // Scale the reported memory values by 1e-9 for display.
            let used_gb = memory_info.used as f64 * 1e-9;
            let total_gb = memory_info.total as f64 * 1e-9;

            // Append in place instead of rebuilding the whole string on every iteration.
            formatted.push_str(&format!(
                " GPU #{index} - Memory {used_gb:.2}/{total_gb:.2} Gb - Usage {}%",
                utilization_rates.gpu
            ));

            // Accumulate the raw values of every device; assigning here would keep
            // only the last device's reading.
            raw_running.push_str(&format!("{used_gb}/{total_gb} "));
        }

        MetricEntry::new(NAME.to_string(), formatted, raw_running)
    }

    /// Nothing to reset: `update` never modifies `self`.
    fn clear(&mut self) {}
}