singe_nvml/library.rs
1#[allow(unused_imports)]
2use crate::error::Status;
3
4use std::{ffi::CString, mem::MaybeUninit, ptr};
5
6use singe_core::string_from_c_chars;
7use singe_nvml_sys as sys;
8
9use crate::{
10 device::Device,
11 error::Result,
12 try_ffi,
13 types::{
14 ConfComputeKeyRotationThreshold, ConfComputeSystemCaps, ConfComputeSystemSettings,
15 ConfComputeSystemState, CudaDriverVersion, EventData, EventTypes, ExcludedDeviceInfo,
16 GpmMetric, GpmMetricId, HwbcEntry, InitFlags, PgpuMetadata, Pid, SystemEventData,
17 SystemEventTypes, VgpuCompatibility, VgpuDriverCapability, VgpuMetadata, VgpuVersion,
18 VgpuVersionRange,
19 },
20 unit::Unit,
21 utility::struct_version,
22};
23
/// Owned wrapper around a raw NVML event-set handle (`nvmlEventSet_t`).
///
/// Values are produced by [`Library::event_set`]; the wrapped handle is passed
/// to `nvmlEventSetFree` when the wrapper is dropped.
#[derive(Debug)]
pub struct EventSet(sys::nvmlEventSet_t);
26
/// Wrapper around a raw NVML system event-set handle (`nvmlSystemEventSet_t`).
///
/// Values are produced by [`Library::system_event_set`].
#[derive(Debug)]
pub struct SystemEventSet(sys::nvmlSystemEventSet_t);
29
/// Entry point for system-level NVML queries.
///
/// [`Library::create`] and [`Library::create_with_flags`] initialize NVML
/// before returning a value of this type, and dropping the value calls
/// `nvmlShutdown`.
#[derive(Debug)]
pub struct Library;
32
/// Owned wrapper around a raw NVML GPM sample buffer (`nvmlGpmSample_t`).
///
/// Values are produced by [`Library::gpm_sample`]; the wrapped buffer is passed
/// to `nvmlGpmSampleFree` when the wrapper is dropped.
#[derive(Debug)]
pub struct GpmSample(sys::nvmlGpmSample_t);
35
36impl Library {
37 /// Initializes NVML, but does not initialize any GPUs yet.
38 ///
39 /// * Newer NVML initialization adds flags that let callers adjust initialization behavior.
40 /// * In NVML 5.319, [`Library::create`] replaced the older initialization path that initialized all GPU devices in the system.
41 ///
42 /// This allows NVML to communicate with a GPU when other GPUs in the system are unstable or in a bad state.
43 /// With this initialization mode, GPUs are discovered and initialized lazily when you request a device handle from this crate.
44 ///
45 /// In contrast, the older initialization path in NVML 4.304 would fail if any detected GPU was in a bad or unstable state.
46 ///
47 /// For all products.
48 ///
49 /// Call this once before invoking any other methods in the library.
50 /// A reference count of the number of initializations is maintained.
51 /// Shutdown only occurs when the reference count reaches zero.
52 ///
53 /// # Errors
54 ///
55 /// Returns an error if the NVIDIA driver is not running, if NVML does not
56 /// have permission to communicate with the driver, or if NVML reports an
57 /// unexpected failure.
58 pub fn create() -> Result<Self> {
59 unsafe {
60 try_ffi!(sys::nvmlInit_v2())?;
61 }
62 Ok(Self)
63 }
64
65 /// Initializes NVML with the provided initialization flags.
66 /// This follows the same reference-counting behavior as [`Library::create`].
67 ///
68 /// For all products.
69 ///
70 /// # Errors
71 ///
72 /// Returns an error if the NVIDIA driver is not running, if NVML does not
73 /// have permission to communicate with the driver, or if NVML reports an
74 /// unexpected failure.
75 pub fn create_with_flags(flags: InitFlags) -> Result<Self> {
76 unsafe {
77 try_ffi!(sys::nvmlInitWithFlags(flags.bits()))?;
78 }
79 Ok(Self)
80 }
81
82 /// Returns the version of the system's graphics driver.
83 ///
84 /// For all products.
85 ///
86 /// The version identifier is an alphanumeric string.
87 /// It does not exceed 80 bytes including the terminating NUL byte.
88 /// This wrapper allocates the required NVML buffer internally.
89 ///
90 /// # Errors
91 ///
92 /// Returns an error if the internal version buffer is too small, if NVML
93 /// rejects the output argument, or if NVML has not been initialized.
94 pub fn driver_version(&self) -> Result<String> {
95 let mut buffer = [0i8; sys::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE as usize];
96 unsafe {
97 try_ffi!(sys::nvmlSystemGetDriverVersion(
98 buffer.as_mut_ptr(),
99 buffer.len() as u32,
100 ))?;
101 }
102 Ok(string_from_c_chars(&buffer))
103 }
104
    /// Returns the version of the NVML library.
    ///
    /// For all products.
    ///
    /// The version identifier is an alphanumeric string.
    /// It does not exceed 80 bytes including the terminating NUL byte.
    /// This wrapper allocates the required NVML buffer internally.
    ///
    /// This method is a thin convenience wrapper that forwards to the crate-level
    /// `version` function.
    ///
    /// # Errors
    ///
    /// Returns an error if the internal version buffer is too small or if NVML
    /// rejects the output argument.
    pub fn version(&self) -> Result<String> {
        crate::version()
    }
120
121 /// Returns the version of the CUDA driver from the shared library.
122 ///
123 /// For all products.
124 ///
125 /// CUDA driver version obtained by calling the CUDA driver.
126 ///
127 /// # Errors
128 ///
129 /// Returns an error if the CUDA driver library or version function cannot
130 /// be found, or if NVML rejects the output argument.
131 pub fn cuda_driver_version(&self) -> Result<CudaDriverVersion> {
132 let mut raw = 0;
133 unsafe {
134 try_ffi!(sys::nvmlSystemGetCudaDriverVersion_v2(&raw mut raw))?;
135 }
136 Ok(CudaDriverVersion::from_raw(raw))
137 }
138
139 /// Returns the version of the CUDA driver.
140 ///
141 /// For all products.
142 ///
143 /// The CUDA driver version is retrieved from the currently installed version of CUDA.
144 /// If the CUDA library is not found, this returns a known supported version number.
145 ///
146 /// # Errors
147 ///
148 /// Returns an error if NVML rejects the query arguments.
149 pub fn cuda_driver_version_fallback(&self) -> Result<CudaDriverVersion> {
150 let mut raw = 0;
151 unsafe {
152 try_ffi!(sys::nvmlSystemGetCudaDriverVersion(&raw mut raw))?;
153 }
154 Ok(CudaDriverVersion::from_raw(raw))
155 }
156
157 pub fn process_name(&self, pid: Pid) -> Result<String> {
158 self.process_name_with_capacity(pid, 1024)
159 }
160
161 /// Returns the name of the process with the provided process ID.
162 ///
163 /// For all products.
164 ///
165 /// The returned process name is truncated to `capacity` and encoded in ANSI.
166 ///
167 /// # Errors
168 ///
169 /// Returns an error if NVML rejects the process ID or output buffer, if the
170 /// process does not exist, if the current process lacks permission, if NVML
171 /// has not been initialized, or if NVML reports an unexpected failure.
172 pub fn process_name_with_capacity(&self, pid: Pid, capacity: usize) -> Result<String> {
173 let mut buffer = vec![0i8; capacity];
174 unsafe {
175 try_ffi!(sys::nvmlSystemGetProcessName(
176 pid.0,
177 buffer.as_mut_ptr(),
178 buffer.len() as u32,
179 ))?;
180 }
181 Ok(string_from_c_chars(&buffer))
182 }
183
184 /// Returns the driver branch of the NVIDIA driver installed on the system.
185 ///
186 /// For all products.
187 ///
188 /// The branch identifier is an alphanumeric string.
189 /// It does not exceed 80 bytes including the terminating NUL byte.
190 /// This wrapper allocates the required NVML buffer internally.
191 ///
192 /// # Errors
193 ///
194 /// Returns an error if the internal driver-branch buffer is too small, if
195 /// NVML rejects the query arguments, if NVML has not been initialized, or if
196 /// NVML reports an unexpected failure.
197 pub fn driver_branch(&self) -> Result<String> {
198 let mut info = sys::nvmlSystemDriverBranchInfo_t {
199 version: struct_version::<sys::nvmlSystemDriverBranchInfo_t>(1),
200 ..Default::default()
201 };
202 unsafe {
203 try_ffi!(sys::nvmlSystemGetDriverBranch(
204 &raw mut info,
205 sys::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE,
206 ))?;
207 }
208 Ok(string_from_c_chars(&info.branch))
209 }
210
211 /// Returns the requested vGPU driver capability.
212 ///
213 /// See [`VgpuDriverCapability`] for the supported capabilities.
214 /// Returns a boolean indicating whether the capability is supported.
215 ///
216 /// For Maxwell or newer fully supported devices.
217 ///
218 /// # Errors
219 ///
220 /// Returns an error if NVML rejects the requested capability, if the current
221 /// driver state does not support vGPU capability queries, if NVML has not
222 /// been initialized, or if NVML reports an unexpected failure.
223 pub fn vgpu_driver_capability(&self, capability: VgpuDriverCapability) -> Result<bool> {
224 let mut result = 0;
225 unsafe {
226 try_ffi!(sys::nvmlGetVgpuDriverCapabilities(
227 capability.into(),
228 &raw mut result,
229 ))?;
230 }
231 Ok(result != 0)
232 }
233
234 /// Query the ranges of supported vGPU versions.
235 ///
236 /// Returns the preset linear range of supported vGPU versions for the NVIDIA vGPU Manager and the administrator-configured range.
237 /// If the preset range has not been overridden by [`Library::set_vgpu_version`], both ranges are the same.
238 ///
239 /// This wrapper returns both the preset supported range and the administrator-configured current range.
240 /// By default, the current range matches the preset range.
241 ///
242 /// # Errors
243 ///
244 /// Returns an error if NVML does not support this query, rejects the output
245 /// buffers, or fails while fetching the version ranges.
246 pub fn vgpu_version(&self) -> Result<VgpuVersionRange> {
247 let mut supported = sys::nvmlVgpuVersion_t::default();
248 let mut current = sys::nvmlVgpuVersion_t::default();
249 unsafe {
250 try_ffi!(sys::nvmlGetVgpuVersion(
251 &raw mut supported,
252 &raw mut current
253 ))?;
254 }
255 Ok(VgpuVersionRange {
256 supported: supported.into(),
257 current: current.into(),
258 })
259 }
260
261 /// Override the preset range of vGPU versions supported by the NVIDIA vGPU Manager with a range set by an administrator.
262 ///
263 /// Configures the NVIDIA vGPU Manager with an administrator-provided range of supported vGPU versions.
264 /// This range must be a subset of the preset range that the NVIDIA vGPU Manager supports.
265 /// The custom range set by an administrator takes precedence over the preset range and is advertised to the guest VM for negotiating the vGPU version.
266 /// See [`Library::vgpu_version`] for details of how to query the preset range of versions supported.
267 ///
268 /// This overrides the preset vGPU version range with the administrator-provided range.
269 ///
270 /// After host system reboot or driver reload, the range of supported versions reverts to the range that is preset for the NVIDIA vGPU Manager.
271 ///
272 /// 1. The range set by the administrator must be a subset of the preset range that the NVIDIA vGPU Manager supports.
273 /// Otherwise, an error is returned.
274 /// 2. If the range of supported guest driver versions does not overlap the range set by the administrator,
275 /// the guest driver fails to load.
276 /// 3. If the range of supported guest driver versions overlaps the range set by the administrator,
277 /// the guest driver loads with a negotiated vGPU version equal to the
278 /// maximum value in the overlapping range.
279 /// 4. No VMs must be running on the host when setting the version range.
280 /// If a VM is running on the host, the call fails.
281 ///
282 /// # Errors
283 ///
284 /// Returns an error if `version` is invalid or outside the preset supported
285 /// range, if a VM is running on the host, or if the installed vGPU Manager
286 /// does not support overriding the version range.
287 pub fn set_vgpu_version(&self, version: VgpuVersion) -> Result<()> {
288 let mut version: sys::nvmlVgpuVersion_t = version.into();
289 unsafe {
290 try_ffi!(sys::nvmlSetVgpuVersion(&raw mut version))?;
291 }
292 Ok(())
293 }
294
295 /// Takes a vGPU instance metadata structure read from [`VgpuInstance::metadata`](crate::vgpu_instance::VgpuInstance::metadata), and a vGPU metadata structure for a physical GPU read from [`Device::vgpu_metadata`], and returns compatibility information of the vGPU instance and the physical GPU.
296 ///
297 /// This wrapper returns compatibility information describing whether the vGPU or VM may be booted on the physical GPU.
298 /// If the vGPU / VM compatibility with the physical GPU is limited, a limit code indicates the factor limiting compatibility.
299 /// See the returned compatibility structure for the reported limit code.
300 ///
301 /// vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to boot a given vGPU or associated VM.
302 ///
303 /// # Errors
304 ///
305 /// Returns an error if NVML rejects either metadata blob or reports an
306 /// unexpected compatibility-query failure.
307 pub fn vgpu_compatibility(
308 &self,
309 vgpu_metadata: &VgpuMetadata,
310 pgpu_metadata: &PgpuMetadata,
311 ) -> Result<VgpuCompatibility> {
312 let mut vgpu_raw = vgpu_metadata.encode_raw();
313 let mut pgpu_raw = pgpu_metadata.encode_raw();
314 let mut compatibility = sys::nvmlVgpuPgpuCompatibility_t::default();
315 unsafe {
316 try_ffi!(sys::nvmlGetVgpuCompatibility(
317 vgpu_raw.as_mut_ptr().cast(),
318 pgpu_raw.as_mut_ptr().cast(),
319 &raw mut compatibility,
320 ))?;
321 }
322 Ok(compatibility.into())
323 }
324
325 /// Returns the set of GPUs that have CPU affinity with the given CPU number.
326 /// Supported on Linux only.
327 ///
328 /// # Errors
329 ///
330 /// Returns an error if NVML rejects the CPU number, if topology discovery is
331 /// not supported on this platform, or if NVML fails while collecting the GPU
332 /// set.
333 pub fn topology_gpu_set(&self, cpu_number: u32) -> Result<Vec<Device>> {
334 let mut count = 0;
335 let status = unsafe {
336 sys::nvmlSystemGetTopologyGpuSet(cpu_number, &raw mut count, ptr::null_mut())
337 };
338 if status == sys::nvmlReturn_t::NVML_SUCCESS && count == 0 {
339 return Ok(Vec::new());
340 }
341 if status != sys::nvmlReturn_t::NVML_ERROR_INSUFFICIENT_SIZE {
342 return Err(status.into());
343 }
344
345 let mut devices = vec![ptr::null_mut(); count as usize];
346 unsafe {
347 try_ffi!(sys::nvmlSystemGetTopologyGpuSet(
348 cpu_number,
349 &raw mut count,
350 devices.as_mut_ptr(),
351 ))?;
352 }
353 devices.truncate(count as usize);
354 Ok(devices
355 .into_iter()
356 .map(|handle| unsafe { Device::from_raw(handle) })
357 .collect())
358 }
359
360 /// Returns the number of excluded GPU devices in the system.
361 ///
362 /// For all products.
363 ///
364 /// # Errors
365 ///
366 /// Returns an error if NVML rejects the count output.
367 pub fn excluded_device_count(&self) -> Result<u32> {
368 let mut count = 0;
369 unsafe {
370 try_ffi!(sys::nvmlGetExcludedDeviceCount(&raw mut count))?;
371 }
372 Ok(count)
373 }
374
375 /// Acquire the device information for an excluded GPU device, based on its index.
376 ///
377 /// For all products.
378 ///
379 /// Valid indices are derived from the count returned by [`Library::excluded_device_count`].
380 /// For example, if the count is 2 the valid indices are 0 and 1, corresponding to GPU 0 and GPU 1.
381 ///
382 /// # Errors
383 ///
384 /// Returns an error if `index` is out of range or NVML rejects the device
385 /// information output.
386 pub fn excluded_device(&self, index: u32) -> Result<ExcludedDeviceInfo> {
387 let mut info = sys::nvmlExcludedDeviceInfo_t::default();
388 unsafe {
389 try_ffi!(sys::nvmlGetExcludedDeviceInfoByIndex(index, &raw mut info))?;
390 }
391 Ok(info.into())
392 }
393
394 pub fn excluded_devices(&self) -> Result<Vec<ExcludedDeviceInfo>> {
395 (0..self.excluded_device_count()?)
396 .map(|index| self.excluded_device(index))
397 .collect()
398 }
399
400 /// Returns the number of compute devices in the system.
401 /// A compute device is a single GPU.
402 ///
403 /// For all products.
404 ///
405 /// [`Library::device_count`] returns the count of all devices in the system even if [`Library::device`] returns [`Status::NoPermission`] for some devices.
406 /// Update your code to handle this error, or use the NVML 4.304 or older header file.
407 /// For backward binary compatibility reasons, the `_v1` symbol is still present in the shared library.
408 /// The old `_v1` NVML entry point does not count devices that NVML has no permission to talk to.
409 ///
410 /// # Errors
411 ///
412 /// Returns an error if NVML rejects the count output, if NVML has not been
413 /// initialized, or if NVML reports an unexpected failure.
414 pub fn device_count(&self) -> Result<u32> {
415 let mut count = 0;
416 unsafe {
417 try_ffi!(sys::nvmlDeviceGetCount_v2(&raw mut count))?;
418 }
419 Ok(count)
420 }
421
422 /// Returns the number of units in the system.
423 ///
424 /// For S-class products.
425 ///
426 /// # Errors
427 ///
428 /// Returns an error if NVML rejects the count output, if NVML has not been
429 /// initialized, or if NVML reports an unexpected failure.
430 pub fn unit_count(&self) -> Result<u32> {
431 let mut count = 0;
432 unsafe {
433 try_ffi!(sys::nvmlUnitGetCount(&raw mut count))?;
434 }
435 Ok(count)
436 }
437
438 /// Acquire the handle for a particular unit, based on its index.
439 ///
440 /// For S-class products.
441 ///
442 /// Valid indices are derived from the count returned by [`Library::unit_count`].
443 /// For example, if the count is 2 the valid indices are 0 and 1, corresponding to UNIT 0 and UNIT 1.
444 ///
445 /// The order in which NVML enumerates units has no guarantees of consistency between reboots.
446 ///
447 /// # Errors
448 ///
449 /// Returns an error if `index` is out of range, if NVML rejects the handle
450 /// output, if NVML has not been initialized, or if NVML reports an
451 /// unexpected failure.
452 pub fn unit(&self, index: u32) -> Result<Unit> {
453 let mut handle = ptr::null_mut();
454 unsafe {
455 try_ffi!(sys::nvmlUnitGetHandleByIndex(index, &raw mut handle))?;
456 }
457 Ok(unsafe { Unit::from_raw(handle) })
458 }
459
460 pub fn units(&self) -> Result<Vec<Unit>> {
461 (0..self.unit_count()?)
462 .map(|index| self.unit(index))
463 .collect()
464 }
465
466 /// Acquire the handle for a particular device, based on its index.
467 ///
468 /// For all products.
469 ///
470 /// Valid indices are derived from the accessible device count returned by [`Library::device_count`].
471 /// For example, if the count is 2 the valid indices are 0 and 1, corresponding to GPU 0 and GPU 1.
472 ///
473 /// The order in which NVML enumerates devices has no guarantees of consistency between reboots.
474 /// Prefer PCI bus IDs or UUIDs for stable device lookup.
475 /// See [`Library::device_by_uuid`] and [`Library::device_by_pci_bus_id`].
476 ///
477 /// The NVML index may not correlate with other libraries, such as the CUDA device index.
478 ///
479 /// Starting from NVML 5, this call causes NVML to initialize the target GPU. NVML may initialize additional GPUs if:
480 ///
481 /// * The target GPU is an SLI slave.
482 ///
483 /// [`Library::device_count`] returns the count of all devices in the system even if [`Library::device`] returns [`Status::NoPermission`] for some devices.
484 /// Update your code to handle this error, or use the NVML 4.304 or older header file.
485 /// For backward binary compatibility reasons, the `_v1` symbol is still present in the shared library.
486 /// The old `_v1` NVML entry point does not count devices that NVML has no permission to talk to.
487 ///
488 /// This means that [`Library::device`] and _v1 can return different devices for the same index.
489 /// Code that uses the default `_v2` mappings at the top of the file is unaffected.
490 ///
491 /// # Errors
492 ///
493 /// Returns an error if `index` is out of range, if the process cannot access
494 /// the target GPU, if the GPU cannot be initialized because of power,
495 /// interrupt, or bus-access problems, if NVML has not been initialized, or
496 /// if NVML reports an unexpected failure.
497 pub fn device(&self, index: u32) -> Result<Device> {
498 let mut handle = ptr::null_mut();
499 unsafe {
500 try_ffi!(sys::nvmlDeviceGetHandleByIndex_v2(index, &raw mut handle))?;
501 }
502 Ok(unsafe { Device::from_raw(handle) })
503 }
504
505 /// Acquire the handle for a device from its globally unique immutable UUID.
506 ///
507 /// For all products.
508 ///
509 /// Starting from NVML 5, this call causes NVML to initialize the target GPU. NVML may initialize additional GPUs as it searches for the target GPU.
510 ///
511 /// # Errors
512 ///
513 /// Returns an error if `uuid` contains an interior NUL byte or does not
514 /// identify a device, if NVML cannot initialize one of the GPUs searched,
515 /// if NVML has not been initialized, or if NVML reports an unexpected
516 /// failure.
517 pub fn device_by_uuid(&self, uuid: &str) -> Result<Device> {
518 // TODO: add a typed nvmlUUID_t-based API for nvmlDeviceGetHandleByUUIDV when we design a clean ASCII/binary UUID abstraction.
519 let uuid = CString::new(uuid)?;
520 let mut handle = ptr::null_mut();
521 unsafe {
522 try_ffi!(sys::nvmlDeviceGetHandleByUUID(
523 uuid.as_ptr(),
524 &raw mut handle
525 ))?;
526 }
527 Ok(unsafe { Device::from_raw(handle) })
528 }
529
530 /// Acquire the handle for a particular device, based on its PCI bus id.
531 ///
532 /// For all products.
533 ///
534 /// This value corresponds to the PCI bus ID returned by [`Device::pci_info`].
535 ///
536 /// Starting from NVML 5, this call causes NVML to initialize the target GPU. NVML may initialize additional GPUs if:
537 ///
538 /// * The target GPU is an SLI slave.
539 ///
540 /// Older NVML releases returned [`Status::NotFound`] instead of [`Status::NoPermission`].
541 ///
542 /// # Errors
543 ///
544 /// Returns an error if `pci_bus_id` contains an interior NUL byte or does
545 /// not identify a device, if the process cannot access the target GPU, if
546 /// NVML cannot initialize it because of power, interrupt, or bus-access
547 /// problems, if NVML has not been initialized, or if NVML reports an
548 /// unexpected failure.
549 pub fn device_by_pci_bus_id(&self, pci_bus_id: &str) -> Result<Device> {
550 let pci_bus_id = CString::new(pci_bus_id)?;
551 let mut handle = ptr::null_mut();
552 unsafe {
553 try_ffi!(sys::nvmlDeviceGetHandleByPciBusId_v2(
554 pci_bus_id.as_ptr(),
555 &raw mut handle,
556 ))?;
557 }
558 Ok(unsafe { Device::from_raw(handle) })
559 }
560
561 pub fn device_by_serial(&self, serial: &str) -> Result<Device> {
562 for device in self.devices()? {
563 if device.serial()? == serial {
564 return Ok(device);
565 }
566 }
567
568 Err(sys::nvmlReturn_t::NVML_ERROR_NOT_FOUND.into())
569 }
570
571 pub fn devices(&self) -> Result<Vec<Device>> {
572 (0..self.device_count()?)
573 .map(|index| self.device(index))
574 .collect()
575 }
576
577 /// Creates an empty set of events.
578 /// The returned event set is freed automatically when dropped.
579 ///
580 /// For Fermi or newer fully supported devices.
581 ///
582 /// # Errors
583 ///
584 /// Returns an error if NVML rejects the event-set output, if NVML has not
585 /// been initialized, or if NVML reports an unexpected failure.
586 pub fn event_set(&self) -> Result<EventSet> {
587 let mut set = ptr::null_mut();
588 unsafe {
589 try_ffi!(sys::nvmlEventSetCreate(&raw mut set))?;
590 }
591 Ok(EventSet(set))
592 }
593
594 /// Creates an empty set of system events.
595 /// The returned system event set is freed automatically when dropped.
596 ///
597 /// For Fermi or newer fully supported devices.
598 ///
599 /// # Errors
600 ///
601 /// Returns an error if the installed NVML version does not support the
602 /// request layout, if NVML rejects the request, if NVML has not been
603 /// initialized, or if NVML reports an unexpected failure.
604 pub fn system_event_set(&self) -> Result<SystemEventSet> {
605 let mut request = sys::nvmlSystemEventSetCreateRequest_t {
606 version: struct_version::<sys::nvmlSystemEventSetCreateRequest_t>(1),
607 ..Default::default()
608 };
609 unsafe {
610 try_ffi!(sys::nvmlSystemEventSetCreate(&raw mut request))?;
611 }
612 Ok(SystemEventSet(request.set))
613 }
614
615 /// Returns the global NVLink bandwidth mode.
616 ///
617 /// # Errors
618 ///
619 /// Returns an error if NVML rejects the output, if the system does not
620 /// support global NVLink bandwidth mode, or if the current process lacks
621 /// the required privileges.
622 pub fn nvlink_bw_mode(&self) -> Result<u32> {
623 let mut mode = 0;
624 unsafe {
625 try_ffi!(sys::nvmlSystemGetNvlinkBwMode(&raw mut mode))?;
626 }
627 Ok(mode)
628 }
629
630 /// Returns Confidential Computing system capabilities.
631 ///
632 /// For Ampere or newer fully supported devices.
633 /// Supported on Linux, Windows TCC.
634 ///
635 /// # Errors
636 ///
637 /// Returns an error if NVML rejects the output, if the system does not
638 /// support this Confidential Computing query, or if NVML has not been
639 /// initialized.
640 pub fn conf_compute_capabilities(&self) -> Result<ConfComputeSystemCaps> {
641 let mut caps = sys::nvmlConfComputeSystemCaps_t::default();
642 unsafe {
643 try_ffi!(sys::nvmlSystemGetConfComputeCapabilities(&raw mut caps))?;
644 }
645 Ok(caps.into())
646 }
647
648 /// Returns Confidential Computing system state.
649 ///
650 /// For Ampere or newer fully supported devices.
651 /// Supported on Linux, Windows TCC.
652 ///
653 /// # Errors
654 ///
655 /// Returns an error if NVML rejects the output, if the system does not
656 /// support this Confidential Computing query, or if NVML has not been
657 /// initialized.
658 pub fn conf_compute_state(&self) -> Result<ConfComputeSystemState> {
659 let mut state = sys::nvmlConfComputeSystemState_t::default();
660 unsafe {
661 try_ffi!(sys::nvmlSystemGetConfComputeState(&raw mut state))?;
662 }
663 Ok(state.into())
664 }
665
666 /// Returns Confidential Computing GPU ready state.
667 ///
668 /// For Ampere or newer fully supported devices.
669 /// Supported on Linux, Windows TCC.
670 ///
671 /// # Errors
672 ///
673 /// Returns an error if NVML rejects the output, if the system does not
674 /// support this Confidential Computing query, or if NVML has not been
675 /// initialized.
676 pub fn conf_compute_gpus_ready_state(&self) -> Result<bool> {
677 let mut accepting_work = 0;
678 unsafe {
679 try_ffi!(sys::nvmlSystemGetConfComputeGpusReadyState(
680 &raw mut accepting_work
681 ))?;
682 }
683 Ok(accepting_work == sys::NVML_CC_ACCEPTING_CLIENT_REQUESTS_TRUE)
684 }
685
686 /// Returns Confidential Computing key rotation threshold detail.
687 ///
688 /// For Hopper or newer fully supported devices.
689 /// Supported on Linux, Windows TCC.
690 ///
691 /// # Errors
692 ///
693 /// Returns an error if NVML rejects the request, if the system does not
694 /// support this Confidential Computing query, if NVML has not been
695 /// initialized, or if NVML reports an unexpected failure.
696 pub fn conf_compute_key_rotation_threshold(&self) -> Result<ConfComputeKeyRotationThreshold> {
697 let mut info = sys::nvmlConfComputeGetKeyRotationThresholdInfo_t {
698 version: struct_version::<sys::nvmlConfComputeGetKeyRotationThresholdInfo_t>(1),
699 ..Default::default()
700 };
701 unsafe {
702 try_ffi!(sys::nvmlSystemGetConfComputeKeyRotationThresholdInfo(
703 &raw mut info,
704 ))?;
705 }
706 Ok(info.into())
707 }
708
709 /// Returns Confidential Computing system settings.
710 ///
711 /// For Hopper or newer fully supported devices.
712 /// Supported on Linux, Windows TCC.
713 ///
714 /// # Errors
715 ///
716 /// Returns an error if the installed NVML version does not support the
717 /// request layout, if NVML rejects the request, if a target GPU is
718 /// inaccessible, if the system does not support Confidential Computing
719 /// settings, if NVML has not been initialized, or if NVML reports an
720 /// unexpected failure.
721 pub fn conf_compute_settings(&self) -> Result<ConfComputeSystemSettings> {
722 let mut settings = sys::nvmlSystemConfComputeSettings_t {
723 version: struct_version::<sys::nvmlSystemConfComputeSettings_t>(1),
724 ..Default::default()
725 };
726 unsafe {
727 try_ffi!(sys::nvmlSystemGetConfComputeSettings(&raw mut settings))?;
728 }
729 Ok(settings.into())
730 }
731
732 /// Returns the IDs and firmware versions for any Host Interface Cards (HICs) in the system.
733 ///
734 /// For S-class products.
735 ///
736 /// This wrapper queries the required entry count internally.
737 /// The HIC must be connected to an S-class system to be reported.
738 ///
739 /// # Errors
740 ///
741 /// Returns an error if the number of HIC entries changes while the wrapper
742 /// is fetching them, if NVML rejects the query, or if NVML has not been
743 /// initialized.
744 pub fn hic_versions(&self) -> Result<Vec<HwbcEntry>> {
745 let mut count = 0;
746 let status = unsafe { sys::nvmlSystemGetHicVersion(&raw mut count, ptr::null_mut()) };
747 if status == sys::nvmlReturn_t::NVML_SUCCESS && count == 0 {
748 return Ok(Vec::new());
749 }
750 if status != sys::nvmlReturn_t::NVML_ERROR_INSUFFICIENT_SIZE {
751 return Err(status.into());
752 }
753
754 let mut entries = vec![sys::nvmlHwbcEntry_t::default(); count as usize];
755 unsafe {
756 try_ffi!(sys::nvmlSystemGetHicVersion(
757 &raw mut count,
758 entries.as_mut_ptr(),
759 ))?;
760 }
761 entries.truncate(count as usize);
762 Ok(entries.into_iter().map(Into::into).collect())
763 }
764
765 /// Allocates a sample buffer to be used with NVML GPM.
766 /// At least two of these buffers are required to use the NVML GPM feature.
767 ///
768 /// For Hopper or newer fully supported devices.
769 ///
770 /// # Errors
771 ///
772 /// Returns an error if NVML rejects the allocation request or if system
773 /// memory is insufficient.
774 pub fn gpm_sample(&self) -> Result<GpmSample> {
775 let mut sample = ptr::null_mut();
776 unsafe {
777 try_ffi!(sys::nvmlGpmSampleAlloc(&raw mut sample))?;
778 }
779 Ok(GpmSample(sample))
780 }
781
782 /// Calculate GPM metrics from two samples.
783 ///
784 /// For Hopper or newer fully supported devices.
785 ///
786 /// To retrieve metrics, allocate two sample buffers with [`Library::gpm_sample`] and store them in `metrics.sample1` and `metrics.sample2`.
787 /// Next, fill each requested metric ID in `metrics.metrics[i].metric_id` and set `metrics.num_metrics` to the total number of metrics to retrieve.
788 /// Then call [`Device::gpm_sample`] twice to obtain two samples of counters.
789 ///
790 /// The interval between these two [`Device::gpm_sample`] calls must be greater than 100 ms due to the internal sample refresh rate.
791 /// Finally, call [`Library::gpm_metrics`] to retrieve the metrics into `metrics.metrics`.
792 ///
793 pub fn gpm_metrics(
794 &self,
795 sample1: &GpmSample,
796 sample2: &GpmSample,
797 metric_ids: &[GpmMetricId],
798 ) -> Result<Vec<GpmMetric>> {
799 let mut metrics_get = sys::nvmlGpmMetricsGet_t {
800 version: sys::NVML_GPM_METRICS_GET_VERSION,
801 numMetrics: metric_ids.len() as u32,
802 sample1: sample1.0,
803 sample2: sample2.0,
804 ..Default::default()
805 };
806
807 for (slot, metric_id) in metrics_get.metrics.iter_mut().zip(metric_ids) {
808 slot.metricId = metric_id.0;
809 }
810
811 unsafe {
812 try_ffi!(sys::nvmlGpmMetricsGet(&raw mut metrics_get))?;
813 }
814
815 Ok(metrics_get.metrics[..metric_ids.len()]
816 .iter()
817 .copied()
818 .map(Into::into)
819 .collect())
820 }
821}
822
823impl Drop for Library {
824 fn drop(&mut self) {
825 unsafe {
826 let _ = sys::nvmlShutdown();
827 }
828 }
829}
830
831impl GpmSample {
832 pub const fn as_raw(&self) -> sys::nvmlGpmSample_t {
833 self.0
834 }
835}
836
837impl Drop for GpmSample {
838 fn drop(&mut self) {
839 unsafe {
840 let _ = sys::nvmlGpmSampleFree(self.0);
841 }
842 }
843}
844
845impl EventSet {
846 /// Starts recording the requested events for the specified device.
847 ///
848 /// For Fermi or newer fully supported devices.
849 /// ECC events are available only on ECC-enabled devices; power-capping events are available only on devices with power management support.
850 ///
851 /// For Linux only.
852 ///
853 /// All events that occurred before this call are not recorded.
854 /// Use [`EventSet::wait`] to observe recorded events.
855 ///
856 /// If NVML reports [`Status::Unknown`], the event set is in an undefined
857 /// state and must be freed.
858 /// If NVML reports [`Status::NotSupported`], the event set can still be
859 /// used, but none of the requested event types are registered.
860 ///
861 /// # Errors
862 ///
863 /// Returns an error if the device is inaccessible, if NVML rejects the
864 /// event set or event mask, if the platform or requested event types are not
865 /// supported, if NVML has not been initialized, or if NVML reports an
866 /// unexpected failure.
867 pub fn register_device(&self, device: Device, event_types: EventTypes) -> Result<()> {
868 unsafe {
869 try_ffi!(sys::nvmlDeviceRegisterEvents(
870 device.as_raw(),
871 event_types.bits(),
872 self.0,
873 ))
874 }
875 }
876
877 /// Waits for and returns the next event.
878 ///
879 /// For Fermi or newer fully supported devices.
880 ///
881 /// If events are ready when this is called, it returns immediately.
882 /// If no events are ready, it sleeps until an event arrives or `timeout_ms` expires.
883 /// In some conditions, such as an interrupt, this can return before the timeout expires.
884 ///
885 /// On Windows, after an Xid error, this method returns the most recent Xid error type seen by the system.
886 /// If multiple Xid errors are generated before this wait call is made, the last seen Xid error type is returned for all Xid error events.
887 ///
888 /// On Linux, every Xid error event returns the associated event data and
889 /// other information if applicable.
890 ///
891 /// In MIG mode, if a device handle is provided, NVML reports events for all
892 /// available instances only when the caller has appropriate privileges.
893 /// Without those privileges, only events affecting all instances, namely
894 /// the whole device, are reported.
895 ///
896 /// This does not currently support per-instance event reporting using MIG device handles.
897 ///
898 /// # Errors
899 ///
900 /// Returns an error if a registered GPU is inaccessible, if NVML rejects the
901 /// event-data output, if no event arrives before `timeout_ms` or the wait is
902 /// interrupted, if NVML has not been initialized, or if NVML reports an
903 /// unexpected failure.
904 pub fn wait(&self, timeout_ms: u32) -> Result<EventData> {
905 unsafe {
906 let mut data = MaybeUninit::<sys::nvmlEventData_t>::uninit();
907 try_ffi!(sys::nvmlEventSetWait_v2(
908 self.0,
909 data.as_mut_ptr(),
910 timeout_ms
911 ))?;
912 Ok(data.assume_init().into())
913 }
914 }
915}
916
917impl Drop for EventSet {
918 fn drop(&mut self) {
919 unsafe {
920 let _ = sys::nvmlEventSetFree(self.0);
921 }
922 }
923}
924
925impl SystemEventSet {
926 /// Starts recording the requested events for the system event set.
927 ///
928 /// For Linux only.
929 ///
930 /// Starts recording events on the specified device.
931 /// All events that occurred before this call are not recorded.
932 /// Use [`SystemEventSet::wait`] to check whether an event occurred.
933 ///
934 /// If NVML reports [`Status::Unknown`], the event set is in an undefined
935 /// state and must be freed.
936 /// If NVML reports [`Status::NotSupported`], the event set can still be
937 /// used, but none of the requested event types are registered.
938 ///
939 /// # Errors
940 ///
941 /// Returns an error if the installed NVML version does not support the
942 /// request layout, if NVML rejects the event registration request, if NVML
943 /// has not been initialized, or if NVML reports an unexpected failure.
944 pub fn register_events(&self, event_types: SystemEventTypes) -> Result<()> {
945 let mut request = sys::nvmlSystemRegisterEventRequest_t {
946 version: struct_version::<sys::nvmlSystemRegisterEventRequest_t>(1),
947 eventTypes: event_types.bits(),
948 set: self.0,
949 };
950 unsafe { try_ffi!(sys::nvmlSystemRegisterEvents(&raw mut request)) }
951 }
952
953 /// Waits for system events and returns ready events.
954 ///
955 /// For Fermi or newer fully supported devices.
956 ///
957 /// If events are ready when this is called, it returns immediately.
958 /// If no events are ready, it sleeps until an event arrives or `timeout_ms` expires.
959 /// In some conditions, such as an interrupt, this can return before the timeout expires.
960 ///
961 /// If the returned event count equals the internal event-buffer capacity, there may be outstanding events.
962 /// Call [`SystemEventSet::wait`] again to query all events.
963 ///
964 /// # Errors
965 ///
966 /// Returns an error if the installed NVML version does not support the
967 /// request layout, if NVML rejects the wait request, if no event arrives
968 /// before `timeout_ms`, if NVML has not been initialized, or if NVML reports
969 /// an unexpected failure.
970 pub fn wait(&self, timeout_ms: u32, max_events: u32) -> Result<Vec<SystemEventData>> {
971 let mut data = vec![sys::nvmlSystemEventData_v1_t::default(); max_events as usize];
972 let mut request = sys::nvmlSystemEventSetWaitRequest_t {
973 version: struct_version::<sys::nvmlSystemEventSetWaitRequest_t>(1),
974 timeoutms: timeout_ms,
975 set: self.0,
976 data: data.as_mut_ptr(),
977 dataSize: data.len() as u32,
978 ..Default::default()
979 };
980 unsafe {
981 try_ffi!(sys::nvmlSystemEventSetWait(&raw mut request))?;
982 }
983 data.truncate(request.numEvent as usize);
984 Ok(data.into_iter().map(Into::into).collect())
985 }
986}
987
988impl Drop for SystemEventSet {
989 fn drop(&mut self) {
990 let mut request = sys::nvmlSystemEventSetFreeRequest_t {
991 version: struct_version::<sys::nvmlSystemEventSetFreeRequest_t>(1),
992 set: self.0,
993 };
994 unsafe {
995 let _ = sys::nvmlSystemEventSetFree(&raw mut request);
996 }
997 }
998}