singe_nvml/
library.rs

1#[allow(unused_imports)]
2use crate::error::Status;
3
4use std::{ffi::CString, mem::MaybeUninit, ptr};
5
6use singe_core::string_from_c_chars;
7use singe_nvml_sys as sys;
8
9use crate::{
10    device::Device,
11    error::Result,
12    try_ffi,
13    types::{
14        ConfComputeKeyRotationThreshold, ConfComputeSystemCaps, ConfComputeSystemSettings,
15        ConfComputeSystemState, CudaDriverVersion, EventData, EventTypes, ExcludedDeviceInfo,
16        GpmMetric, GpmMetricId, HwbcEntry, InitFlags, PgpuMetadata, Pid, SystemEventData,
17        SystemEventTypes, VgpuCompatibility, VgpuDriverCapability, VgpuMetadata, VgpuVersion,
18        VgpuVersionRange,
19    },
20    unit::Unit,
21    utility::struct_version,
22};
23
/// Wrapper around a raw NVML event-set handle (`nvmlEventSet_t`).
///
/// Values of this type are produced by [`Library::event_set`].
#[derive(Debug)]
pub struct EventSet(sys::nvmlEventSet_t);
26
/// Wrapper around a raw NVML system event-set handle (`nvmlSystemEventSet_t`).
///
/// Values of this type are produced by [`Library::system_event_set`].
#[derive(Debug)]
pub struct SystemEventSet(sys::nvmlSystemEventSet_t);
29
/// Entry point for system-level NVML queries.
///
/// This is a field-less marker type: a value is only handed out by
/// [`Library::create`] or [`Library::create_with_flags`] after NVML reported a
/// successful initialization.
#[derive(Debug)]
pub struct Library;
32
/// Wrapper around a raw NVML GPM sample buffer handle (`nvmlGpmSample_t`).
///
/// Values of this type are produced by [`Library::gpm_sample`].
#[derive(Debug)]
pub struct GpmSample(sys::nvmlGpmSample_t);
35
36impl Library {
37    /// Initializes NVML, but does not initialize any GPUs yet.
38    ///
39    /// * Newer NVML initialization adds flags that let callers adjust initialization behavior.
40    /// * In NVML 5.319, [`Library::create`] replaced the older initialization path that initialized all GPU devices in the system.
41    ///
42    /// This allows NVML to communicate with a GPU when other GPUs in the system are unstable or in a bad state.
43    /// With this initialization mode, GPUs are discovered and initialized lazily when you request a device handle from this crate.
44    ///
45    /// In contrast, the older initialization path in NVML 4.304 would fail if any detected GPU was in a bad or unstable state.
46    ///
47    /// For all products.
48    ///
49    /// Call this once before invoking any other methods in the library.
50    /// A reference count of the number of initializations is maintained.
51    /// Shutdown only occurs when the reference count reaches zero.
52    ///
53    /// # Errors
54    ///
55    /// Returns an error if the NVIDIA driver is not running, if NVML does not
56    /// have permission to communicate with the driver, or if NVML reports an
57    /// unexpected failure.
58    pub fn create() -> Result<Self> {
59        unsafe {
60            try_ffi!(sys::nvmlInit_v2())?;
61        }
62        Ok(Self)
63    }
64
65    /// Initializes NVML with the provided initialization flags.
66    /// This follows the same reference-counting behavior as [`Library::create`].
67    ///
68    /// For all products.
69    ///
70    /// # Errors
71    ///
72    /// Returns an error if the NVIDIA driver is not running, if NVML does not
73    /// have permission to communicate with the driver, or if NVML reports an
74    /// unexpected failure.
75    pub fn create_with_flags(flags: InitFlags) -> Result<Self> {
76        unsafe {
77            try_ffi!(sys::nvmlInitWithFlags(flags.bits()))?;
78        }
79        Ok(Self)
80    }
81
82    /// Returns the version of the system's graphics driver.
83    ///
84    /// For all products.
85    ///
86    /// The version identifier is an alphanumeric string.
87    /// It does not exceed 80 bytes including the terminating NUL byte.
88    /// This wrapper allocates the required NVML buffer internally.
89    ///
90    /// # Errors
91    ///
92    /// Returns an error if the internal version buffer is too small, if NVML
93    /// rejects the output argument, or if NVML has not been initialized.
94    pub fn driver_version(&self) -> Result<String> {
95        let mut buffer = [0i8; sys::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE as usize];
96        unsafe {
97            try_ffi!(sys::nvmlSystemGetDriverVersion(
98                buffer.as_mut_ptr(),
99                buffer.len() as u32,
100            ))?;
101        }
102        Ok(string_from_c_chars(&buffer))
103    }
104
105    /// Returns the version of the NVML library.
106    ///
107    /// For all products.
108    ///
109    /// The version identifier is an alphanumeric string.
110    /// It does not exceed 80 bytes including the terminating NUL byte.
111    /// This wrapper allocates the required NVML buffer internally.
112    ///
113    /// # Errors
114    ///
115    /// Returns an error if the internal version buffer is too small or if NVML
116    /// rejects the output argument.
    pub fn version(&self) -> Result<String> {
        // Pure delegation: the crate-level `version` function performs the query.
        crate::version()
    }
120
121    /// Returns the version of the CUDA driver from the shared library.
122    ///
123    /// For all products.
124    ///
125    /// CUDA driver version obtained by calling the CUDA driver.
126    ///
127    /// # Errors
128    ///
129    /// Returns an error if the CUDA driver library or version function cannot
130    /// be found, or if NVML rejects the output argument.
131    pub fn cuda_driver_version(&self) -> Result<CudaDriverVersion> {
132        let mut raw = 0;
133        unsafe {
134            try_ffi!(sys::nvmlSystemGetCudaDriverVersion_v2(&raw mut raw))?;
135        }
136        Ok(CudaDriverVersion::from_raw(raw))
137    }
138
139    /// Returns the version of the CUDA driver.
140    ///
141    /// For all products.
142    ///
143    /// The CUDA driver version is retrieved from the currently installed version of CUDA.
144    /// If the CUDA library is not found, this returns a known supported version number.
145    ///
146    /// # Errors
147    ///
148    /// Returns an error if NVML rejects the query arguments.
149    pub fn cuda_driver_version_fallback(&self) -> Result<CudaDriverVersion> {
150        let mut raw = 0;
151        unsafe {
152            try_ffi!(sys::nvmlSystemGetCudaDriverVersion(&raw mut raw))?;
153        }
154        Ok(CudaDriverVersion::from_raw(raw))
155    }
156
157    pub fn process_name(&self, pid: Pid) -> Result<String> {
158        self.process_name_with_capacity(pid, 1024)
159    }
160
161    /// Returns the name of the process with the provided process ID.
162    ///
163    /// For all products.
164    ///
165    /// The returned process name is truncated to `capacity` and encoded in ANSI.
166    ///
167    /// # Errors
168    ///
169    /// Returns an error if NVML rejects the process ID or output buffer, if the
170    /// process does not exist, if the current process lacks permission, if NVML
171    /// has not been initialized, or if NVML reports an unexpected failure.
172    pub fn process_name_with_capacity(&self, pid: Pid, capacity: usize) -> Result<String> {
173        let mut buffer = vec![0i8; capacity];
174        unsafe {
175            try_ffi!(sys::nvmlSystemGetProcessName(
176                pid.0,
177                buffer.as_mut_ptr(),
178                buffer.len() as u32,
179            ))?;
180        }
181        Ok(string_from_c_chars(&buffer))
182    }
183
184    /// Returns the driver branch of the NVIDIA driver installed on the system.
185    ///
186    /// For all products.
187    ///
188    /// The branch identifier is an alphanumeric string.
189    /// It does not exceed 80 bytes including the terminating NUL byte.
190    /// This wrapper allocates the required NVML buffer internally.
191    ///
192    /// # Errors
193    ///
194    /// Returns an error if the internal driver-branch buffer is too small, if
195    /// NVML rejects the query arguments, if NVML has not been initialized, or if
196    /// NVML reports an unexpected failure.
197    pub fn driver_branch(&self) -> Result<String> {
198        let mut info = sys::nvmlSystemDriverBranchInfo_t {
199            version: struct_version::<sys::nvmlSystemDriverBranchInfo_t>(1),
200            ..Default::default()
201        };
202        unsafe {
203            try_ffi!(sys::nvmlSystemGetDriverBranch(
204                &raw mut info,
205                sys::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE,
206            ))?;
207        }
208        Ok(string_from_c_chars(&info.branch))
209    }
210
211    /// Returns the requested vGPU driver capability.
212    ///
213    /// See [`VgpuDriverCapability`] for the supported capabilities.
214    /// Returns a boolean indicating whether the capability is supported.
215    ///
216    /// For Maxwell or newer fully supported devices.
217    ///
218    /// # Errors
219    ///
220    /// Returns an error if NVML rejects the requested capability, if the current
221    /// driver state does not support vGPU capability queries, if NVML has not
222    /// been initialized, or if NVML reports an unexpected failure.
223    pub fn vgpu_driver_capability(&self, capability: VgpuDriverCapability) -> Result<bool> {
224        let mut result = 0;
225        unsafe {
226            try_ffi!(sys::nvmlGetVgpuDriverCapabilities(
227                capability.into(),
228                &raw mut result,
229            ))?;
230        }
231        Ok(result != 0)
232    }
233
234    /// Query the ranges of supported vGPU versions.
235    ///
236    /// Returns the preset linear range of supported vGPU versions for the NVIDIA vGPU Manager and the administrator-configured range.
237    /// If the preset range has not been overridden by [`Library::set_vgpu_version`], both ranges are the same.
238    ///
239    /// This wrapper returns both the preset supported range and the administrator-configured current range.
240    /// By default, the current range matches the preset range.
241    ///
242    /// # Errors
243    ///
244    /// Returns an error if NVML does not support this query, rejects the output
245    /// buffers, or fails while fetching the version ranges.
246    pub fn vgpu_version(&self) -> Result<VgpuVersionRange> {
247        let mut supported = sys::nvmlVgpuVersion_t::default();
248        let mut current = sys::nvmlVgpuVersion_t::default();
249        unsafe {
250            try_ffi!(sys::nvmlGetVgpuVersion(
251                &raw mut supported,
252                &raw mut current
253            ))?;
254        }
255        Ok(VgpuVersionRange {
256            supported: supported.into(),
257            current: current.into(),
258        })
259    }
260
261    /// Override the preset range of vGPU versions supported by the NVIDIA vGPU Manager with a range set by an administrator.
262    ///
263    /// Configures the NVIDIA vGPU Manager with an administrator-provided range of supported vGPU versions.
264    /// This range must be a subset of the preset range that the NVIDIA vGPU Manager supports.
265    /// The custom range set by an administrator takes precedence over the preset range and is advertised to the guest VM for negotiating the vGPU version.
266    /// See [`Library::vgpu_version`] for details of how to query the preset range of versions supported.
267    ///
268    /// This overrides the preset vGPU version range with the administrator-provided range.
269    ///
270    /// After host system reboot or driver reload, the range of supported versions reverts to the range that is preset for the NVIDIA vGPU Manager.
271    ///
272    /// 1. The range set by the administrator must be a subset of the preset range that the NVIDIA vGPU Manager supports.
273    ///    Otherwise, an error is returned.
274    /// 2. If the range of supported guest driver versions does not overlap the range set by the administrator,
275    ///    the guest driver fails to load.
276    /// 3. If the range of supported guest driver versions overlaps the range set by the administrator,
277    ///    the guest driver loads with a negotiated vGPU version equal to the
278    ///    maximum value in the overlapping range.
279    /// 4. No VMs must be running on the host when setting the version range.
280    ///    If a VM is running on the host, the call fails.
281    ///
282    /// # Errors
283    ///
284    /// Returns an error if `version` is invalid or outside the preset supported
285    /// range, if a VM is running on the host, or if the installed vGPU Manager
286    /// does not support overriding the version range.
287    pub fn set_vgpu_version(&self, version: VgpuVersion) -> Result<()> {
288        let mut version: sys::nvmlVgpuVersion_t = version.into();
289        unsafe {
290            try_ffi!(sys::nvmlSetVgpuVersion(&raw mut version))?;
291        }
292        Ok(())
293    }
294
295    /// Takes a vGPU instance metadata structure read from [`VgpuInstance::metadata`](crate::vgpu_instance::VgpuInstance::metadata), and a vGPU metadata structure for a physical GPU read from [`Device::vgpu_metadata`], and returns compatibility information of the vGPU instance and the physical GPU.
296    ///
297    /// This wrapper returns compatibility information describing whether the vGPU or VM may be booted on the physical GPU.
298    /// If the vGPU / VM compatibility with the physical GPU is limited, a limit code indicates the factor limiting compatibility.
299    /// See the returned compatibility structure for the reported limit code.
300    ///
301    /// vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to boot a given vGPU or associated VM.
302    ///
303    /// # Errors
304    ///
305    /// Returns an error if NVML rejects either metadata blob or reports an
306    /// unexpected compatibility-query failure.
307    pub fn vgpu_compatibility(
308        &self,
309        vgpu_metadata: &VgpuMetadata,
310        pgpu_metadata: &PgpuMetadata,
311    ) -> Result<VgpuCompatibility> {
312        let mut vgpu_raw = vgpu_metadata.encode_raw();
313        let mut pgpu_raw = pgpu_metadata.encode_raw();
314        let mut compatibility = sys::nvmlVgpuPgpuCompatibility_t::default();
315        unsafe {
316            try_ffi!(sys::nvmlGetVgpuCompatibility(
317                vgpu_raw.as_mut_ptr().cast(),
318                pgpu_raw.as_mut_ptr().cast(),
319                &raw mut compatibility,
320            ))?;
321        }
322        Ok(compatibility.into())
323    }
324
325    /// Returns the set of GPUs that have CPU affinity with the given CPU number.
326    /// Supported on Linux only.
327    ///
328    /// # Errors
329    ///
330    /// Returns an error if NVML rejects the CPU number, if topology discovery is
331    /// not supported on this platform, or if NVML fails while collecting the GPU
332    /// set.
333    pub fn topology_gpu_set(&self, cpu_number: u32) -> Result<Vec<Device>> {
334        let mut count = 0;
335        let status = unsafe {
336            sys::nvmlSystemGetTopologyGpuSet(cpu_number, &raw mut count, ptr::null_mut())
337        };
338        if status == sys::nvmlReturn_t::NVML_SUCCESS && count == 0 {
339            return Ok(Vec::new());
340        }
341        if status != sys::nvmlReturn_t::NVML_ERROR_INSUFFICIENT_SIZE {
342            return Err(status.into());
343        }
344
345        let mut devices = vec![ptr::null_mut(); count as usize];
346        unsafe {
347            try_ffi!(sys::nvmlSystemGetTopologyGpuSet(
348                cpu_number,
349                &raw mut count,
350                devices.as_mut_ptr(),
351            ))?;
352        }
353        devices.truncate(count as usize);
354        Ok(devices
355            .into_iter()
356            .map(|handle| unsafe { Device::from_raw(handle) })
357            .collect())
358    }
359
360    /// Returns the number of excluded GPU devices in the system.
361    ///
362    /// For all products.
363    ///
364    /// # Errors
365    ///
366    /// Returns an error if NVML rejects the count output.
367    pub fn excluded_device_count(&self) -> Result<u32> {
368        let mut count = 0;
369        unsafe {
370            try_ffi!(sys::nvmlGetExcludedDeviceCount(&raw mut count))?;
371        }
372        Ok(count)
373    }
374
375    /// Acquire the device information for an excluded GPU device, based on its index.
376    ///
377    /// For all products.
378    ///
379    /// Valid indices are derived from the count returned by [`Library::excluded_device_count`].
380    /// For example, if the count is 2 the valid indices are 0 and 1, corresponding to GPU 0 and GPU 1.
381    ///
382    /// # Errors
383    ///
384    /// Returns an error if `index` is out of range or NVML rejects the device
385    /// information output.
386    pub fn excluded_device(&self, index: u32) -> Result<ExcludedDeviceInfo> {
387        let mut info = sys::nvmlExcludedDeviceInfo_t::default();
388        unsafe {
389            try_ffi!(sys::nvmlGetExcludedDeviceInfoByIndex(index, &raw mut info))?;
390        }
391        Ok(info.into())
392    }
393
394    pub fn excluded_devices(&self) -> Result<Vec<ExcludedDeviceInfo>> {
395        (0..self.excluded_device_count()?)
396            .map(|index| self.excluded_device(index))
397            .collect()
398    }
399
400    /// Returns the number of compute devices in the system.
401    /// A compute device is a single GPU.
402    ///
403    /// For all products.
404    ///
405    /// [`Library::device_count`] returns the count of all devices in the system even if [`Library::device`] returns [`Status::NoPermission`] for some devices.
406    /// Update your code to handle this error, or use the NVML 4.304 or older header file.
407    /// For backward binary compatibility reasons, the `_v1` symbol is still present in the shared library.
408    /// The old `_v1` NVML entry point does not count devices that NVML has no permission to talk to.
409    ///
410    /// # Errors
411    ///
412    /// Returns an error if NVML rejects the count output, if NVML has not been
413    /// initialized, or if NVML reports an unexpected failure.
414    pub fn device_count(&self) -> Result<u32> {
415        let mut count = 0;
416        unsafe {
417            try_ffi!(sys::nvmlDeviceGetCount_v2(&raw mut count))?;
418        }
419        Ok(count)
420    }
421
422    /// Returns the number of units in the system.
423    ///
424    /// For S-class products.
425    ///
426    /// # Errors
427    ///
428    /// Returns an error if NVML rejects the count output, if NVML has not been
429    /// initialized, or if NVML reports an unexpected failure.
430    pub fn unit_count(&self) -> Result<u32> {
431        let mut count = 0;
432        unsafe {
433            try_ffi!(sys::nvmlUnitGetCount(&raw mut count))?;
434        }
435        Ok(count)
436    }
437
438    /// Acquire the handle for a particular unit, based on its index.
439    ///
440    /// For S-class products.
441    ///
442    /// Valid indices are derived from the count returned by [`Library::unit_count`].
443    /// For example, if the count is 2 the valid indices are 0 and 1, corresponding to UNIT 0 and UNIT 1.
444    ///
445    /// The order in which NVML enumerates units has no guarantees of consistency between reboots.
446    ///
447    /// # Errors
448    ///
449    /// Returns an error if `index` is out of range, if NVML rejects the handle
450    /// output, if NVML has not been initialized, or if NVML reports an
451    /// unexpected failure.
452    pub fn unit(&self, index: u32) -> Result<Unit> {
453        let mut handle = ptr::null_mut();
454        unsafe {
455            try_ffi!(sys::nvmlUnitGetHandleByIndex(index, &raw mut handle))?;
456        }
457        Ok(unsafe { Unit::from_raw(handle) })
458    }
459
460    pub fn units(&self) -> Result<Vec<Unit>> {
461        (0..self.unit_count()?)
462            .map(|index| self.unit(index))
463            .collect()
464    }
465
466    /// Acquire the handle for a particular device, based on its index.
467    ///
468    /// For all products.
469    ///
470    /// Valid indices are derived from the accessible device count returned by [`Library::device_count`].
471    /// For example, if the count is 2 the valid indices are 0 and 1, corresponding to GPU 0 and GPU 1.
472    ///
473    /// The order in which NVML enumerates devices has no guarantees of consistency between reboots.
474    /// Prefer PCI bus IDs or UUIDs for stable device lookup.
475    /// See [`Library::device_by_uuid`] and [`Library::device_by_pci_bus_id`].
476    ///
477    /// The NVML index may not correlate with other libraries, such as the CUDA device index.
478    ///
479    /// Starting from NVML 5, this call causes NVML to initialize the target GPU. NVML may initialize additional GPUs if:
480    ///
481    /// * The target GPU is an SLI slave.
482    ///
483    /// [`Library::device_count`] returns the count of all devices in the system even if [`Library::device`] returns [`Status::NoPermission`] for some devices.
484    /// Update your code to handle this error, or use the NVML 4.304 or older header file.
485    /// For backward binary compatibility reasons, the `_v1` symbol is still present in the shared library.
486    /// The old `_v1` NVML entry point does not count devices that NVML has no permission to talk to.
487    ///
488    /// This means that [`Library::device`] and _v1 can return different devices for the same index.
489    /// Code that uses the default `_v2` mappings at the top of the file is unaffected.
490    ///
491    /// # Errors
492    ///
493    /// Returns an error if `index` is out of range, if the process cannot access
494    /// the target GPU, if the GPU cannot be initialized because of power,
495    /// interrupt, or bus-access problems, if NVML has not been initialized, or
496    /// if NVML reports an unexpected failure.
497    pub fn device(&self, index: u32) -> Result<Device> {
498        let mut handle = ptr::null_mut();
499        unsafe {
500            try_ffi!(sys::nvmlDeviceGetHandleByIndex_v2(index, &raw mut handle))?;
501        }
502        Ok(unsafe { Device::from_raw(handle) })
503    }
504
505    /// Acquire the handle for a device from its globally unique immutable UUID.
506    ///
507    /// For all products.
508    ///
509    /// Starting from NVML 5, this call causes NVML to initialize the target GPU. NVML may initialize additional GPUs as it searches for the target GPU.
510    ///
511    /// # Errors
512    ///
513    /// Returns an error if `uuid` contains an interior NUL byte or does not
514    /// identify a device, if NVML cannot initialize one of the GPUs searched,
515    /// if NVML has not been initialized, or if NVML reports an unexpected
516    /// failure.
517    pub fn device_by_uuid(&self, uuid: &str) -> Result<Device> {
518        // TODO: add a typed nvmlUUID_t-based API for nvmlDeviceGetHandleByUUIDV when we design a clean ASCII/binary UUID abstraction.
519        let uuid = CString::new(uuid)?;
520        let mut handle = ptr::null_mut();
521        unsafe {
522            try_ffi!(sys::nvmlDeviceGetHandleByUUID(
523                uuid.as_ptr(),
524                &raw mut handle
525            ))?;
526        }
527        Ok(unsafe { Device::from_raw(handle) })
528    }
529
530    /// Acquire the handle for a particular device, based on its PCI bus id.
531    ///
532    /// For all products.
533    ///
534    /// This value corresponds to the PCI bus ID returned by [`Device::pci_info`].
535    ///
536    /// Starting from NVML 5, this call causes NVML to initialize the target GPU. NVML may initialize additional GPUs if:
537    ///
538    /// * The target GPU is an SLI slave.
539    ///
540    /// Older NVML releases returned [`Status::NotFound`] instead of [`Status::NoPermission`].
541    ///
542    /// # Errors
543    ///
544    /// Returns an error if `pci_bus_id` contains an interior NUL byte or does
545    /// not identify a device, if the process cannot access the target GPU, if
546    /// NVML cannot initialize it because of power, interrupt, or bus-access
547    /// problems, if NVML has not been initialized, or if NVML reports an
548    /// unexpected failure.
549    pub fn device_by_pci_bus_id(&self, pci_bus_id: &str) -> Result<Device> {
550        let pci_bus_id = CString::new(pci_bus_id)?;
551        let mut handle = ptr::null_mut();
552        unsafe {
553            try_ffi!(sys::nvmlDeviceGetHandleByPciBusId_v2(
554                pci_bus_id.as_ptr(),
555                &raw mut handle,
556            ))?;
557        }
558        Ok(unsafe { Device::from_raw(handle) })
559    }
560
561    pub fn device_by_serial(&self, serial: &str) -> Result<Device> {
562        for device in self.devices()? {
563            if device.serial()? == serial {
564                return Ok(device);
565            }
566        }
567
568        Err(sys::nvmlReturn_t::NVML_ERROR_NOT_FOUND.into())
569    }
570
571    pub fn devices(&self) -> Result<Vec<Device>> {
572        (0..self.device_count()?)
573            .map(|index| self.device(index))
574            .collect()
575    }
576
577    /// Creates an empty set of events.
578    /// The returned event set is freed automatically when dropped.
579    ///
580    /// For Fermi or newer fully supported devices.
581    ///
582    /// # Errors
583    ///
584    /// Returns an error if NVML rejects the event-set output, if NVML has not
585    /// been initialized, or if NVML reports an unexpected failure.
586    pub fn event_set(&self) -> Result<EventSet> {
587        let mut set = ptr::null_mut();
588        unsafe {
589            try_ffi!(sys::nvmlEventSetCreate(&raw mut set))?;
590        }
591        Ok(EventSet(set))
592    }
593
594    /// Creates an empty set of system events.
595    /// The returned system event set is freed automatically when dropped.
596    ///
597    /// For Fermi or newer fully supported devices.
598    ///
599    /// # Errors
600    ///
601    /// Returns an error if the installed NVML version does not support the
602    /// request layout, if NVML rejects the request, if NVML has not been
603    /// initialized, or if NVML reports an unexpected failure.
604    pub fn system_event_set(&self) -> Result<SystemEventSet> {
605        let mut request = sys::nvmlSystemEventSetCreateRequest_t {
606            version: struct_version::<sys::nvmlSystemEventSetCreateRequest_t>(1),
607            ..Default::default()
608        };
609        unsafe {
610            try_ffi!(sys::nvmlSystemEventSetCreate(&raw mut request))?;
611        }
612        Ok(SystemEventSet(request.set))
613    }
614
615    /// Returns the global NVLink bandwidth mode.
616    ///
617    /// # Errors
618    ///
619    /// Returns an error if NVML rejects the output, if the system does not
620    /// support global NVLink bandwidth mode, or if the current process lacks
621    /// the required privileges.
622    pub fn nvlink_bw_mode(&self) -> Result<u32> {
623        let mut mode = 0;
624        unsafe {
625            try_ffi!(sys::nvmlSystemGetNvlinkBwMode(&raw mut mode))?;
626        }
627        Ok(mode)
628    }
629
630    /// Returns Confidential Computing system capabilities.
631    ///
632    /// For Ampere or newer fully supported devices.
633    /// Supported on Linux, Windows TCC.
634    ///
635    /// # Errors
636    ///
637    /// Returns an error if NVML rejects the output, if the system does not
638    /// support this Confidential Computing query, or if NVML has not been
639    /// initialized.
640    pub fn conf_compute_capabilities(&self) -> Result<ConfComputeSystemCaps> {
641        let mut caps = sys::nvmlConfComputeSystemCaps_t::default();
642        unsafe {
643            try_ffi!(sys::nvmlSystemGetConfComputeCapabilities(&raw mut caps))?;
644        }
645        Ok(caps.into())
646    }
647
648    /// Returns Confidential Computing system state.
649    ///
650    /// For Ampere or newer fully supported devices.
651    /// Supported on Linux, Windows TCC.
652    ///
653    /// # Errors
654    ///
655    /// Returns an error if NVML rejects the output, if the system does not
656    /// support this Confidential Computing query, or if NVML has not been
657    /// initialized.
658    pub fn conf_compute_state(&self) -> Result<ConfComputeSystemState> {
659        let mut state = sys::nvmlConfComputeSystemState_t::default();
660        unsafe {
661            try_ffi!(sys::nvmlSystemGetConfComputeState(&raw mut state))?;
662        }
663        Ok(state.into())
664    }
665
666    /// Returns Confidential Computing GPU ready state.
667    ///
668    /// For Ampere or newer fully supported devices.
669    /// Supported on Linux, Windows TCC.
670    ///
671    /// # Errors
672    ///
673    /// Returns an error if NVML rejects the output, if the system does not
674    /// support this Confidential Computing query, or if NVML has not been
675    /// initialized.
676    pub fn conf_compute_gpus_ready_state(&self) -> Result<bool> {
677        let mut accepting_work = 0;
678        unsafe {
679            try_ffi!(sys::nvmlSystemGetConfComputeGpusReadyState(
680                &raw mut accepting_work
681            ))?;
682        }
683        Ok(accepting_work == sys::NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE)
684    }
685
686    /// Returns Confidential Computing key rotation threshold detail.
687    ///
688    /// For Hopper or newer fully supported devices.
689    /// Supported on Linux, Windows TCC.
690    ///
691    /// # Errors
692    ///
693    /// Returns an error if NVML rejects the request, if the system does not
694    /// support this Confidential Computing query, if NVML has not been
695    /// initialized, or if NVML reports an unexpected failure.
696    pub fn conf_compute_key_rotation_threshold(&self) -> Result<ConfComputeKeyRotationThreshold> {
697        let mut info = sys::nvmlConfComputeGetKeyRotationThresholdInfo_t {
698            version: struct_version::<sys::nvmlConfComputeGetKeyRotationThresholdInfo_t>(1),
699            ..Default::default()
700        };
701        unsafe {
702            try_ffi!(sys::nvmlSystemGetConfComputeKeyRotationThresholdInfo(
703                &raw mut info,
704            ))?;
705        }
706        Ok(info.into())
707    }
708
709    /// Returns Confidential Computing system settings.
710    ///
711    /// For Hopper or newer fully supported devices.
712    /// Supported on Linux, Windows TCC.
713    ///
714    /// # Errors
715    ///
716    /// Returns an error if the installed NVML version does not support the
717    /// request layout, if NVML rejects the request, if a target GPU is
718    /// inaccessible, if the system does not support Confidential Computing
719    /// settings, if NVML has not been initialized, or if NVML reports an
720    /// unexpected failure.
721    pub fn conf_compute_settings(&self) -> Result<ConfComputeSystemSettings> {
722        let mut settings = sys::nvmlSystemConfComputeSettings_t {
723            version: struct_version::<sys::nvmlSystemConfComputeSettings_t>(1),
724            ..Default::default()
725        };
726        unsafe {
727            try_ffi!(sys::nvmlSystemGetConfComputeSettings(&raw mut settings))?;
728        }
729        Ok(settings.into())
730    }
731
732    /// Returns the IDs and firmware versions for any Host Interface Cards (HICs) in the system.
733    ///
734    /// For S-class products.
735    ///
736    /// This wrapper queries the required entry count internally.
737    /// The HIC must be connected to an S-class system to be reported.
738    ///
739    /// # Errors
740    ///
741    /// Returns an error if the number of HIC entries changes while the wrapper
742    /// is fetching them, if NVML rejects the query, or if NVML has not been
743    /// initialized.
744    pub fn hic_versions(&self) -> Result<Vec<HwbcEntry>> {
745        let mut count = 0;
746        let status = unsafe { sys::nvmlSystemGetHicVersion(&raw mut count, ptr::null_mut()) };
747        if status == sys::nvmlReturn_t::NVML_SUCCESS && count == 0 {
748            return Ok(Vec::new());
749        }
750        if status != sys::nvmlReturn_t::NVML_ERROR_INSUFFICIENT_SIZE {
751            return Err(status.into());
752        }
753
754        let mut entries = vec![sys::nvmlHwbcEntry_t::default(); count as usize];
755        unsafe {
756            try_ffi!(sys::nvmlSystemGetHicVersion(
757                &raw mut count,
758                entries.as_mut_ptr(),
759            ))?;
760        }
761        entries.truncate(count as usize);
762        Ok(entries.into_iter().map(Into::into).collect())
763    }
764
765    /// Allocates a sample buffer to be used with NVML GPM.
766    /// At least two of these buffers are required to use the NVML GPM feature.
767    ///
768    /// For Hopper or newer fully supported devices.
769    ///
770    /// # Errors
771    ///
772    /// Returns an error if NVML rejects the allocation request or if system
773    /// memory is insufficient.
774    pub fn gpm_sample(&self) -> Result<GpmSample> {
775        let mut sample = ptr::null_mut();
776        unsafe {
777            try_ffi!(sys::nvmlGpmSampleAlloc(&raw mut sample))?;
778        }
779        Ok(GpmSample(sample))
780    }
781
782    /// Calculate GPM metrics from two samples.
783    ///
784    /// For Hopper or newer fully supported devices.
785    ///
786    /// To retrieve metrics, allocate two sample buffers with [`Library::gpm_sample`] and store them in `metrics.sample1` and `metrics.sample2`.
787    /// Next, fill each requested metric ID in `metrics.metrics[i].metric_id` and set `metrics.num_metrics` to the total number of metrics to retrieve.
788    /// Then call [`Device::gpm_sample`] twice to obtain two samples of counters.
789    ///
790    /// The interval between these two [`Device::gpm_sample`] calls must be greater than 100 ms due to the internal sample refresh rate.
791    /// Finally, call [`Library::gpm_metrics`] to retrieve the metrics into `metrics.metrics`.
792    ///
793    pub fn gpm_metrics(
794        &self,
795        sample1: &GpmSample,
796        sample2: &GpmSample,
797        metric_ids: &[GpmMetricId],
798    ) -> Result<Vec<GpmMetric>> {
799        let mut metrics_get = sys::nvmlGpmMetricsGet_t {
800            version: sys::NVML_GPM_METRICS_GET_VERSION,
801            numMetrics: metric_ids.len() as u32,
802            sample1: sample1.0,
803            sample2: sample2.0,
804            ..Default::default()
805        };
806
807        for (slot, metric_id) in metrics_get.metrics.iter_mut().zip(metric_ids) {
808            slot.metricId = metric_id.0;
809        }
810
811        unsafe {
812            try_ffi!(sys::nvmlGpmMetricsGet(&raw mut metrics_get))?;
813        }
814
815        Ok(metrics_get.metrics[..metric_ids.len()]
816            .iter()
817            .copied()
818            .map(Into::into)
819            .collect())
820    }
821}
822
823impl Drop for Library {
824    fn drop(&mut self) {
825        unsafe {
826            let _ = sys::nvmlShutdown();
827        }
828    }
829}
830
831impl GpmSample {
832    pub const fn as_raw(&self) -> sys::nvmlGpmSample_t {
833        self.0
834    }
835}
836
837impl Drop for GpmSample {
838    fn drop(&mut self) {
839        unsafe {
840            let _ = sys::nvmlGpmSampleFree(self.0);
841        }
842    }
843}
844
845impl EventSet {
846    /// Starts recording the requested events for the specified device.
847    ///
848    /// For Fermi or newer fully supported devices.
849    /// ECC events are available only on ECC-enabled devices; power-capping events are available only on devices with power management support.
850    ///
851    /// For Linux only.
852    ///
853    /// All events that occurred before this call are not recorded.
854    /// Use [`EventSet::wait`] to observe recorded events.
855    ///
856    /// If NVML reports [`Status::Unknown`], the event set is in an undefined
857    /// state and must be freed.
858    /// If NVML reports [`Status::NotSupported`], the event set can still be
859    /// used, but none of the requested event types are registered.
860    ///
861    /// # Errors
862    ///
863    /// Returns an error if the device is inaccessible, if NVML rejects the
864    /// event set or event mask, if the platform or requested event types are not
865    /// supported, if NVML has not been initialized, or if NVML reports an
866    /// unexpected failure.
867    pub fn register_device(&self, device: Device, event_types: EventTypes) -> Result<()> {
868        unsafe {
869            try_ffi!(sys::nvmlDeviceRegisterEvents(
870                device.as_raw(),
871                event_types.bits(),
872                self.0,
873            ))
874        }
875    }
876
877    /// Waits for and returns the next event.
878    ///
879    /// For Fermi or newer fully supported devices.
880    ///
881    /// If events are ready when this is called, it returns immediately.
882    /// If no events are ready, it sleeps until an event arrives or `timeout_ms` expires.
883    /// In some conditions, such as an interrupt, this can return before the timeout expires.
884    ///
885    /// On Windows, after an Xid error, this method returns the most recent Xid error type seen by the system.
886    /// If multiple Xid errors are generated before this wait call is made, the last seen Xid error type is returned for all Xid error events.
887    ///
888    /// On Linux, every Xid error event returns the associated event data and
889    /// other information if applicable.
890    ///
891    /// In MIG mode, if a device handle is provided, NVML reports events for all
892    /// available instances only when the caller has appropriate privileges.
893    /// Without those privileges, only events affecting all instances, namely
894    /// the whole device, are reported.
895    ///
896    /// This does not currently support per-instance event reporting using MIG device handles.
897    ///
898    /// # Errors
899    ///
900    /// Returns an error if a registered GPU is inaccessible, if NVML rejects the
901    /// event-data output, if no event arrives before `timeout_ms` or the wait is
902    /// interrupted, if NVML has not been initialized, or if NVML reports an
903    /// unexpected failure.
904    pub fn wait(&self, timeout_ms: u32) -> Result<EventData> {
905        unsafe {
906            let mut data = MaybeUninit::<sys::nvmlEventData_t>::uninit();
907            try_ffi!(sys::nvmlEventSetWait_v2(
908                self.0,
909                data.as_mut_ptr(),
910                timeout_ms
911            ))?;
912            Ok(data.assume_init().into())
913        }
914    }
915}
916
917impl Drop for EventSet {
918    fn drop(&mut self) {
919        unsafe {
920            let _ = sys::nvmlEventSetFree(self.0);
921        }
922    }
923}
924
925impl SystemEventSet {
926    /// Starts recording the requested events for the system event set.
927    ///
928    /// For Linux only.
929    ///
930    /// Starts recording events on the specified device.
931    /// All events that occurred before this call are not recorded.
932    /// Use [`SystemEventSet::wait`] to check whether an event occurred.
933    ///
934    /// If NVML reports [`Status::Unknown`], the event set is in an undefined
935    /// state and must be freed.
936    /// If NVML reports [`Status::NotSupported`], the event set can still be
937    /// used, but none of the requested event types are registered.
938    ///
939    /// # Errors
940    ///
941    /// Returns an error if the installed NVML version does not support the
942    /// request layout, if NVML rejects the event registration request, if NVML
943    /// has not been initialized, or if NVML reports an unexpected failure.
944    pub fn register_events(&self, event_types: SystemEventTypes) -> Result<()> {
945        let mut request = sys::nvmlSystemRegisterEventRequest_t {
946            version: struct_version::<sys::nvmlSystemRegisterEventRequest_t>(1),
947            eventTypes: event_types.bits(),
948            set: self.0,
949        };
950        unsafe { try_ffi!(sys::nvmlSystemRegisterEvents(&raw mut request)) }
951    }
952
953    /// Waits for system events and returns ready events.
954    ///
955    /// For Fermi or newer fully supported devices.
956    ///
957    /// If events are ready when this is called, it returns immediately.
958    /// If no events are ready, it sleeps until an event arrives or `timeout_ms` expires.
959    /// In some conditions, such as an interrupt, this can return before the timeout expires.
960    ///
961    /// If the returned event count equals the internal event-buffer capacity, there may be outstanding events.
962    /// Call [`SystemEventSet::wait`] again to query all events.
963    ///
964    /// # Errors
965    ///
966    /// Returns an error if the installed NVML version does not support the
967    /// request layout, if NVML rejects the wait request, if no event arrives
968    /// before `timeout_ms`, if NVML has not been initialized, or if NVML reports
969    /// an unexpected failure.
970    pub fn wait(&self, timeout_ms: u32, max_events: u32) -> Result<Vec<SystemEventData>> {
971        let mut data = vec![sys::nvmlSystemEventData_v1_t::default(); max_events as usize];
972        let mut request = sys::nvmlSystemEventSetWaitRequest_t {
973            version: struct_version::<sys::nvmlSystemEventSetWaitRequest_t>(1),
974            timeoutms: timeout_ms,
975            set: self.0,
976            data: data.as_mut_ptr(),
977            dataSize: data.len() as u32,
978            ..Default::default()
979        };
980        unsafe {
981            try_ffi!(sys::nvmlSystemEventSetWait(&raw mut request))?;
982        }
983        data.truncate(request.numEvent as usize);
984        Ok(data.into_iter().map(Into::into).collect())
985    }
986}
987
988impl Drop for SystemEventSet {
989    fn drop(&mut self) {
990        let mut request = sys::nvmlSystemEventSetFreeRequest_t {
991            version: struct_version::<sys::nvmlSystemEventSetFreeRequest_t>(1),
992            set: self.0,
993        };
994        unsafe {
995            let _ = sys::nvmlSystemEventSetFree(&raw mut request);
996        }
997    }
998}
singe_nvml/library.rs

singe_nvml/
library.rs