nvml_wrapper/lib.rs
1/*!
2A safe and ergonomic Rust wrapper for the [NVIDIA Management Library][nvml] (NVML),
3a C-based programmatic interface for monitoring and managing various states within
4NVIDIA GPUs.
5
6```
7use nvml_wrapper::Nvml;
8# use nvml_wrapper::error::*;
9# fn test() -> Result<(), NvmlError> {
10
11let nvml = Nvml::init()?;
12// Get the first `Device` (GPU) in the system
13let device = nvml.device_by_index(0)?;
14
15let brand = device.brand()?; // GeForce on my system
16let fan_speed = device.fan_speed(0)?; // Currently 17% on my system
17let power_limit = device.enforced_power_limit()?; // 275k milliwatts on my system
18let encoder_util = device.encoder_utilization()?; // Currently 0 on my system; Not encoding anything
19let memory_info = device.memory_info()?; // Currently 1.63/6.37 GB used on my system
20
21// ... and there's a whole lot more you can do. Most everything in NVML is wrapped and ready to go
22# Ok(())
23# }
24```
25
26NVML is intended to be a platform for building 3rd-party applications, and is
27also the underlying library for NVIDIA's nvidia-smi tool.
28
29## Usage
30
31`nvml-wrapper` builds on top of generated bindings for NVML that make use of the
32[`libloading`][libloading] crate. This means the NVML library gets loaded upon
33calling `Nvml::init` and can return an error if NVML isn't present, making it
34possible to drop NVIDIA-related features in your code at runtime on systems that
35don't have relevant hardware.
36
37Successful execution of `Nvml::init` means:
38
39* The NVML library was present on the system and able to be opened
40* The function symbol to initialize NVML was loaded and called successfully
41* An attempt has been made to load all other NVML function symbols
42
43Every function you call thereafter will individually return an error if it couldn't
44be loaded from the NVML library during the `Nvml::init` call.
45
46Note that it's not advised to repeatedly call `Nvml::init` as the constructor
47has to perform all the work of loading the function symbols from the library
48each time it gets called. Instead, call `Nvml::init` once and store the resulting
49`Nvml` instance somewhere to be accessed throughout the lifetime of your program
50(perhaps in a [`once_cell`][once_cell]).
51
52## NVML Support
53
54This wrapper is being developed against and currently supports NVML version
5511. Each new version of NVML is guaranteed to be backwards-compatible according
56to NVIDIA, so this wrapper should continue to work without issue regardless of
57NVML version bumps.
58
59### Legacy Functions
60
61Sometimes there will be function-level API version bumps in new NVML releases.
62For example:
63
64```text
65nvmlDeviceGetComputeRunningProcesses
66nvmlDeviceGetComputeRunningProcesses_v2
67nvmlDeviceGetComputeRunningProcesses_v3
68```
69
70The older versions of the functions will generally continue to work with the
71newer NVML releases; however, the newer function versions will not work with
72older NVML installs.
73
74By default this wrapper only provides access to the newest function versions.
75Enable the `legacy-functions` feature if you require the ability to call older
76functions.
77
78## MSRV
79
80The Minimum Supported Rust Version is currently 1.51.0. I will not go out of my
81way to avoid bumping this.
82
83## Cargo Features
84
85The `serde` feature can be toggled on in order to `#[derive(Serialize, Deserialize)]`
86for every NVML data structure.
87
88[nvml]: https://developer.nvidia.com/nvidia-management-library-nvml
89[libloading]: https://github.com/nagisa/rust_libloading
90[once_cell]: https://docs.rs/once_cell/latest/once_cell/sync/struct.Lazy.html
91*/
92
93#![recursion_limit = "1024"]
94#![allow(non_upper_case_globals)]
95
96extern crate libloading;
97extern crate nvml_wrapper_sys as ffi;
98
99pub mod bitmasks;
100pub mod device;
101pub mod enum_wrappers;
102pub mod enums;
103pub mod error;
104pub mod event;
105pub mod gpm;
106pub mod high_level;
107pub mod nv_link;
108pub mod struct_wrappers;
109pub mod structs;
110#[cfg(test)]
111mod test_utils;
112pub mod unit;
113pub mod vgpu;
114
115// Re-exports for convenience
116pub use crate::device::Device;
117pub use crate::event::EventSet;
118pub use crate::gpm::GpmSample;
119pub use crate::nv_link::NvLink;
120pub use crate::unit::Unit;
121
/// Re-exports from `nvml-wrapper-sys` that are necessary for use of this wrapper.
///
/// The items are surfaced here so that they can be named through this crate's
/// own module tree.
pub mod sys_exports {
    /// Use these constants to populate the `structs::device::FieldId` newtype.
    ///
    /// This is a glob re-export of everything in the `field_id` module of the
    /// generated bindings.
    pub mod field_id {
        pub use crate::ffi::bindings::field_id::*;
    }
}
129
130#[cfg(target_os = "linux")]
131use std::convert::TryInto;
132#[cfg(target_os = "linux")]
133use std::ptr;
134use std::{
135 convert::TryFrom,
136 ffi::{CStr, CString, OsStr},
137 mem::{self, ManuallyDrop},
138 os::raw::{c_int, c_uint},
139};
140
141use static_assertions::assert_impl_all;
142
143#[cfg(target_os = "linux")]
144use crate::enum_wrappers::device::TopologyLevel;
145
146use crate::error::{nvml_sym, nvml_try, NvmlError};
147use crate::ffi::bindings::*;
148
149use crate::struct_wrappers::ExcludedDeviceInfo;
150
151#[cfg(target_os = "linux")]
152use crate::struct_wrappers::device::PciInfo;
153use crate::struct_wrappers::device::VgpuVersion;
154use crate::struct_wrappers::unit::HwbcEntry;
155
156use crate::bitmasks::InitFlags;
157
/// Library name handed to the loader by `Nvml::init` and
/// `Nvml::init_with_flags` on every target other than Linux.
#[cfg(not(target_os = "linux"))]
const LIB_PATH: &str = "nvml.dll";

/// Library name handed to the loader by `Nvml::init` and
/// `Nvml::init_with_flags` on Linux.
#[cfg(target_os = "linux")]
const LIB_PATH: &str = "libnvidia-ml.so.1";
163
/// Extracts the major component from a full CUDA driver version number.
///
/// The full version is the integer returned by `Nvml::sys_cuda_driver_version()`.
/// The major component is that value divided by 1000 using integer division,
/// so `11020` yields `11`.
pub fn cuda_driver_version_major(version: i32) -> i32 {
    const MAJOR_UNIT: i32 = 1000;

    version / MAJOR_UNIT
}
170
/// Extracts the minor component from a full CUDA driver version number.
///
/// The full version is the integer returned by `Nvml::sys_cuda_driver_version()`.
/// The minor component is what remains after removing the thousands (the major
/// part), divided by 10 using integer division, so `11020` yields `2`.
pub fn cuda_driver_version_minor(version: i32) -> i32 {
    let below_major = version % 1000;

    below_major / 10
}
177
/**
The main struct that this library revolves around.

According to NVIDIA's documentation, "It is the user's responsibility to call `nvmlInit()`
before calling any other methods, and `nvmlShutdown()` once NVML is no longer being used."
This struct is used to enforce those rules.

Also according to NVIDIA's documentation, "NVML is thread-safe so it is safe to make
simultaneous NVML calls from multiple threads." In the Rust world, this translates to `Nvml`
being `Send` + `Sync`. You can `.clone()` an `Arc`-wrapped `Nvml` and enjoy using it on any
thread.

NOTE: If you care about possible errors returned from `nvmlShutdown()`, use the `.shutdown()`
method on this struct. **The `Drop` implementation ignores errors.**

When reading documentation on this struct and its members, remember that a lot of it,
especially in regards to errors returned, is copied from NVIDIA's docs. While they can be found
online [here](http://docs.nvidia.com/deploy/nvml-api/index.html), the hosted docs are sometimes
outdated and may not accurately reflect the version of NVML that this library is written for;
beware. You should ideally read the doc comments on an up-to-date NVML API header. Such a
header can be downloaded as part of the [CUDA toolkit](https://developer.nvidia.com/cuda-downloads).
*/
pub struct Nvml {
    // Held in `ManuallyDrop` so that `shutdown()` can move the library out of
    // `self` and close it explicitly.
    lib: ManuallyDrop<NvmlLib>,
}

// Compile-time check that `Nvml` stays `Send` and `Sync`, as promised in the
// struct documentation.
assert_impl_all!(Nvml: Send, Sync);
204
205impl std::fmt::Debug for Nvml {
206 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
207 f.write_str("NVML")
208 }
209}
210
211impl Nvml {
212 /**
213 Handles NVML initialization and must be called before doing anything else.
214
215 While it is possible to initialize `NVML` multiple times (NVIDIA's docs state
216 that reference counting is used internally), you should strive to initialize
217 `NVML` once at the start of your program's execution; the constructors handle
218 dynamically loading function symbols from the `NVML` lib and are therefore
219 somewhat expensive.
220
221 Note that this will initialize NVML but not any GPUs. This means that NVML can
222 communicate with a GPU even when other GPUs in a system are bad or unstable.
223
224 By default, initialization looks for "libnvidia-ml.so" on linux and "nvml.dll"
225 on Windows. These default names should work for default installs on those
226 platforms; if further specification is required, use `Nvml::builder`.
227
228 # Errors
229
230 * `DriverNotLoaded`, if the NVIDIA driver is not running
231 * `NoPermission`, if NVML does not have permission to talk to the driver
232 * `Unknown`, on any unexpected error
233 */
234 // Checked against local
235 #[doc(alias = "nvmlInit_v2")]
236 pub fn init() -> Result<Self, NvmlError> {
237 Self::init_internal(LIB_PATH)
238 }
239
240 fn init_internal(path: impl AsRef<std::ffi::OsStr>) -> Result<Self, NvmlError> {
241 let lib = unsafe {
242 let lib = NvmlLib::new(path)?;
243 let sym = nvml_sym(lib.nvmlInit_v2.as_ref())?;
244
245 nvml_try(sym())?;
246 ManuallyDrop::new(lib)
247 };
248
249 Ok(Self { lib })
250 }
251
252 /**
253 An initialization function that allows you to pass flags to control certain behaviors.
254
255 This is the same as `init()` except for the addition of flags.
256
257 # Errors
258
259 * `DriverNotLoaded`, if the NVIDIA driver is not running
260 * `NoPermission`, if NVML does not have permission to talk to the driver
261 * `Unknown`, on any unexpected error
262
263 # Examples
264
265 ```
266 # use nvml_wrapper::Nvml;
267 # use nvml_wrapper::error::*;
268 use nvml_wrapper::bitmasks::InitFlags;
269
270 # fn main() -> Result<(), NvmlError> {
271 // Don't fail if the system doesn't have any NVIDIA GPUs
272 //
273 // Also, don't attach any GPUs during initialization
274 Nvml::init_with_flags(InitFlags::NO_GPUS | InitFlags::NO_ATTACH)?;
275 # Ok(())
276 # }
277 ```
278 */
279 #[doc(alias = "nvmlInitWithFlags")]
280 pub fn init_with_flags(flags: InitFlags) -> Result<Self, NvmlError> {
281 Self::init_with_flags_internal(LIB_PATH, flags)
282 }
283
284 fn init_with_flags_internal(
285 path: impl AsRef<std::ffi::OsStr>,
286 flags: InitFlags,
287 ) -> Result<Self, NvmlError> {
288 let lib = unsafe {
289 let lib = NvmlLib::new(path)?;
290 let sym = nvml_sym(lib.nvmlInitWithFlags.as_ref())?;
291
292 nvml_try(sym(flags.bits()))?;
293 ManuallyDrop::new(lib)
294 };
295
296 Ok(Self { lib })
297 }
298
299 /// Create an `NvmlBuilder` for further flexibility in how NVML is initialized.
300 pub fn builder<'a>() -> NvmlBuilder<'a> {
301 NvmlBuilder::default()
302 }
303
304 /// Get the underlying `NvmlLib` instance.
305 pub fn lib(&self) -> &NvmlLib {
306 &self.lib
307 }
308
309 /**
310 Use this to shutdown NVML and release allocated resources if you care about handling
311 potential errors (*the `Drop` implementation ignores errors!*).
312
313 # Errors
314
315 * `Uninitialized`, if the library has not been successfully initialized
316 * `Unknown`, on any unexpected error
317 */
318 // Thanks to `sorear` on IRC for suggesting this approach
319 // Checked against local
320 // Tested
321 #[doc(alias = "nvmlShutdown")]
322 pub fn shutdown(mut self) -> Result<(), NvmlError> {
323 let sym = nvml_sym(self.lib.nvmlShutdown.as_ref())?;
324
325 unsafe {
326 nvml_try(sym())?;
327 }
328
329 // SAFETY: we `mem::forget(self)` after this, so `self.lib` won't get
330 // touched by our `Drop` impl
331 let lib = unsafe { ManuallyDrop::take(&mut self.lib) };
332 mem::forget(self);
333
334 Ok(lib.__library.close()?)
335 }
336
337 /**
338 Get the number of compute devices in the system (compute device == one GPU).
339
340 Note that this count can include devices you do not have permission to access.
341
342 # Errors
343
344 * `Uninitialized`, if the library has not been successfully initialized
345 * `Unknown`, on any unexpected error
346 */
347 // Checked against local
348 // Tested
349 #[doc(alias = "nvmlDeviceGetCount_v2")]
350 pub fn device_count(&self) -> Result<u32, NvmlError> {
351 let sym = nvml_sym(self.lib.nvmlDeviceGetCount_v2.as_ref())?;
352
353 unsafe {
354 let mut count: c_uint = mem::zeroed();
355 nvml_try(sym(&mut count))?;
356
357 Ok(count)
358 }
359 }
360
361 /**
362 Gets the version of the system's graphics driver and returns it as an alphanumeric
363 string.
364
365 # Errors
366
367 * `Uninitialized`, if the library has not been successfully initialized
368 * `Utf8Error`, if the string obtained from the C function is not valid Utf8
369 */
370 // Checked against local
371 // Tested
372 #[doc(alias = "nvmlSystemGetDriverVersion")]
373 pub fn sys_driver_version(&self) -> Result<String, NvmlError> {
374 let sym = nvml_sym(self.lib.nvmlSystemGetDriverVersion.as_ref())?;
375
376 unsafe {
377 let mut version_vec = vec![0; NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE as usize];
378
379 nvml_try(sym(
380 version_vec.as_mut_ptr(),
381 NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE,
382 ))?;
383
384 let version_raw = CStr::from_ptr(version_vec.as_ptr());
385 Ok(version_raw.to_str()?.into())
386 }
387 }
388
389 /**
390 Gets the version of the system's NVML library and returns it as an alphanumeric
391 string.
392
393 # Errors
394
395 * `Utf8Error`, if the string obtained from the C function is not valid Utf8
396 */
397 // Checked against local
398 // Tested
399 #[doc(alias = "nvmlSystemGetNVMLVersion")]
400 pub fn sys_nvml_version(&self) -> Result<String, NvmlError> {
401 let sym = nvml_sym(self.lib.nvmlSystemGetNVMLVersion.as_ref())?;
402
403 unsafe {
404 let mut version_vec = vec![0; NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE as usize];
405
406 nvml_try(sym(
407 version_vec.as_mut_ptr(),
408 NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE,
409 ))?;
410
411 // Thanks to `Amaranth` on IRC for help with this
412 let version_raw = CStr::from_ptr(version_vec.as_ptr());
413 Ok(version_raw.to_str()?.into())
414 }
415 }
416
417 /**
418 Gets the version of the system's CUDA driver.
419
420 Calls into the CUDA library (cuDriverGetVersion()).
421
422 You can use `cuda_driver_version_major` and `cuda_driver_version_minor`
423 to get the major and minor driver versions from this number.
424
425 # Errors
426
427 * `FunctionNotFound`, if cuDriverGetVersion() is not found in the shared library
428 * `LibraryNotFound`, if libcuda.so.1 or libcuda.dll cannot be found
429 */
430 #[doc(alias = "nvmlSystemGetCudaDriverVersion_v2")]
431 pub fn sys_cuda_driver_version(&self) -> Result<i32, NvmlError> {
432 let sym = nvml_sym(self.lib.nvmlSystemGetCudaDriverVersion_v2.as_ref())?;
433
434 unsafe {
435 let mut version: c_int = mem::zeroed();
436 nvml_try(sym(&mut version))?;
437
438 Ok(version)
439 }
440 }
441
442 /**
443 Gets the name of the process for the given process ID, cropped to the provided length.
444
445 # Errors
446
447 * `Uninitialized`, if the library has not been successfully initialized
448 * `InvalidArg`, if the length is 0 (if this is returned without length being 0, file an issue)
449 * `NotFound`, if the process does not exist
450 * `NoPermission`, if the user doesn't have permission to perform the operation
451 * `Utf8Error`, if the string obtained from the C function is not valid UTF-8. NVIDIA's docs say
452 that the string encoding is ANSI, so this may very well happen.
453 * `Unknown`, on any unexpected error
454 */
455 // TODO: The docs say the string is ANSI-encoded. Not sure if I should try
456 // to do anything about that
457 // Checked against local
458 // Tested
459 #[doc(alias = "nvmlSystemGetProcessName")]
460 pub fn sys_process_name(&self, pid: u32, length: usize) -> Result<String, NvmlError> {
461 let sym = nvml_sym(self.lib.nvmlSystemGetProcessName.as_ref())?;
462
463 unsafe {
464 let mut name_vec = vec![0; length];
465
466 nvml_try(sym(pid, name_vec.as_mut_ptr(), length as c_uint))?;
467
468 let name_raw = CStr::from_ptr(name_vec.as_ptr());
469 Ok(name_raw.to_str()?.into())
470 }
471 }
472
473 /**
474 Acquire the handle for a particular device based on its index (starts at 0).
475
476 Usage of this function causes NVML to initialize the target GPU. Additional
477 GPUs may be initialized if the target GPU is an SLI slave.
478
479 You can determine valid indices by using `.device_count()`. This
480 function doesn't call that for you, but the actual C function to get
481 the device handle will return an error in the case of an invalid index.
482 This means that the `InvalidArg` error will be returned if you pass in
483 an invalid index.
484
485 NVIDIA's docs state that "The order in which NVML enumerates devices has
486 no guarantees of consistency between reboots. For that reason it is recommended
487 that devices be looked up by their PCI ids or UUID." In this library, that translates
488 into usage of `.device_by_uuid()` and `.device_by_pci_bus_id()`.
489
490 The NVML index may not correlate with other APIs such as the CUDA device index.
491
492 # Errors
493
494 * `Uninitialized`, if the library has not been successfully initialized
495 * `InvalidArg`, if index is invalid
496 * `InsufficientPower`, if any attached devices have improperly attached external power cables
497 * `NoPermission`, if the user doesn't have permission to talk to this device
498 * `IrqIssue`, if the NVIDIA kernel detected an interrupt issue with the attached GPUs
499 * `GpuLost`, if the target GPU has fallen off the bus or is otherwise inaccessible
500 * `Unknown`, on any unexpected error
501 */
502 // Checked against local
503 // Tested
504 #[doc(alias = "nvmlDeviceGetHandleByIndex_v2")]
505 pub fn device_by_index(&self, index: u32) -> Result<Device<'_>, NvmlError> {
506 let sym = nvml_sym(self.lib.nvmlDeviceGetHandleByIndex_v2.as_ref())?;
507
508 unsafe {
509 let mut device: nvmlDevice_t = mem::zeroed();
510 nvml_try(sym(index, &mut device))?;
511
512 Ok(Device::new(device, self))
513 }
514 }
515
516 /**
517 Acquire the handle for a particular device based on its PCI bus ID.
518
519 Usage of this function causes NVML to initialize the target GPU. Additional
520 GPUs may be initialized if the target GPU is an SLI slave.
521
522 The bus ID corresponds to the `bus_id` returned by `Device.pci_info()`.
523
524 # Errors
525
526 * `Uninitialized`, if the library has not been successfully initialized
527 * `InvalidArg`, if `pci_bus_id` is invalid
528 * `NotFound`, if `pci_bus_id` does not match a valid device on the system
529 * `InsufficientPower`, if any attached devices have improperly attached external power cables
530 * `NoPermission`, if the user doesn't have permission to talk to this device
531 * `IrqIssue`, if the NVIDIA kernel detected an interrupt issue with the attached GPUs
532 * `GpuLost`, if the target GPU has fallen off the bus or is otherwise inaccessible
533 * `NulError`, for which you can read the docs on `std::ffi::NulError`
534 * `Unknown`, on any unexpected error
535 */
536 // Checked against local
537 // Tested
538 #[doc(alias = "nvmlDeviceGetHandleByPciBusId_v2")]
539 pub fn device_by_pci_bus_id<S: AsRef<str>>(
540 &self,
541 pci_bus_id: S,
542 ) -> Result<Device<'_>, NvmlError>
543 where
544 Vec<u8>: From<S>,
545 {
546 let sym = nvml_sym(self.lib.nvmlDeviceGetHandleByPciBusId_v2.as_ref())?;
547
548 unsafe {
549 let c_string = CString::new(pci_bus_id)?;
550 let mut device: nvmlDevice_t = mem::zeroed();
551
552 nvml_try(sym(c_string.as_ptr(), &mut device))?;
553
554 Ok(Device::new(device, self))
555 }
556 }
557
558 /// Not documenting this because it's deprecated and does not seem to work
559 /// anymore.
560 // Tested (for an error)
561 #[deprecated(note = "use `.device_by_uuid()`, this errors on dual GPU boards")]
562 #[doc(alias = "nvmlDeviceGetHandleBySerial")]
563 pub fn device_by_serial<S: AsRef<str>>(&self, board_serial: S) -> Result<Device<'_>, NvmlError>
564 where
565 Vec<u8>: From<S>,
566 {
567 let sym = nvml_sym(self.lib.nvmlDeviceGetHandleBySerial.as_ref())?;
568
569 unsafe {
570 let c_string = CString::new(board_serial)?;
571 let mut device: nvmlDevice_t = mem::zeroed();
572
573 nvml_try(sym(c_string.as_ptr(), &mut device))?;
574
575 Ok(Device::new(device, self))
576 }
577 }
578
579 /**
580 Acquire the handle for a particular device based on its globally unique immutable
581 UUID.
582
583 Usage of this function causes NVML to initialize the target GPU. Additional
584 GPUs may be initialized as the function called within searches for the target GPU.
585
586 # Errors
587
588 * `Uninitialized`, if the library has not been successfully initialized
589 * `InvalidArg`, if `uuid` is invalid
590 * `NotFound`, if `uuid` does not match a valid device on the system
591 * `InsufficientPower`, if any attached devices have improperly attached external power cables
592 * `IrqIssue`, if the NVIDIA kernel detected an interrupt issue with the attached GPUs
593 * `GpuLost`, if the target GPU has fallen off the bus or is otherwise inaccessible
594 * `NulError`, for which you can read the docs on `std::ffi::NulError`
595 * `Unknown`, on any unexpected error
596
597 NVIDIA doesn't mention `NoPermission` for this one. Strange!
598 */
599 // Checked against local
600 // Tested
601 #[doc(alias = "nvmlDeviceGetHandleByUUID")]
602 pub fn device_by_uuid<S: AsRef<str>>(&self, uuid: S) -> Result<Device<'_>, NvmlError>
603 where
604 Vec<u8>: From<S>,
605 {
606 let sym = nvml_sym(self.lib.nvmlDeviceGetHandleByUUID.as_ref())?;
607
608 unsafe {
609 let c_string = CString::new(uuid)?;
610 let mut device: nvmlDevice_t = mem::zeroed();
611
612 nvml_try(sym(c_string.as_ptr(), &mut device))?;
613
614 Ok(Device::new(device, self))
615 }
616 }
617
618 /**
619 Gets the common ancestor for two devices.
620
621 Note: this is the same as `Device.topology_common_ancestor()`.
622
623 # Errors
624
625 * `InvalidArg`, if the device is invalid
626 * `NotSupported`, if this `Device` or the OS does not support this feature
627 * `UnexpectedVariant`, for which you can read the docs for
628 * `Unknown`, on any unexpected error
629
630 # Platform Support
631
632 Only supports Linux.
633 */
634 // Checked against local
635 // Tested
636 #[cfg(target_os = "linux")]
637 #[doc(alias = "nvmlDeviceGetTopologyCommonAncestor")]
638 pub fn topology_common_ancestor(
639 &self,
640 device1: &Device,
641 device2: &Device,
642 ) -> Result<TopologyLevel, NvmlError> {
643 let sym = nvml_sym(self.lib.nvmlDeviceGetTopologyCommonAncestor.as_ref())?;
644
645 unsafe {
646 let mut level: nvmlGpuTopologyLevel_t = mem::zeroed();
647
648 nvml_try(sym(device1.handle(), device2.handle(), &mut level))?;
649
650 TopologyLevel::try_from(level)
651 }
652 }
653
654 /**
655 Acquire the handle for a particular `Unit` based on its index.
656
657 Valid indices are derived from the count returned by `.unit_count()`.
658 For example, if `unit_count` is 2 the valid indices are 0 and 1, corresponding
659 to UNIT 0 and UNIT 1.
660
661 Note that the order in which NVML enumerates units has no guarantees of
662 consistency between reboots.
663
664 # Errors
665
666 * `Uninitialized`, if the library has not been successfully initialized
667 * `InvalidArg`, if `index` is invalid
668 * `Unknown`, on any unexpected error
669
670 # Device Support
671
672 For S-class products.
673 */
674 // Checked against local
675 // Tested (for an error)
676 #[doc(alias = "nvmlUnitGetHandleByIndex")]
677 pub fn unit_by_index(&self, index: u32) -> Result<Unit<'_>, NvmlError> {
678 let sym = nvml_sym(self.lib.nvmlUnitGetHandleByIndex.as_ref())?;
679
680 unsafe {
681 let mut unit: nvmlUnit_t = mem::zeroed();
682 nvml_try(sym(index as c_uint, &mut unit))?;
683
684 Ok(Unit::new(unit, self))
685 }
686 }
687
688 /**
689 Checks if the passed-in `Device`s are on the same physical board.
690
691 Note: this is the same as `Device.is_on_same_board_as()`.
692
693 # Errors
694
695 * `Uninitialized`, if the library has not been successfully initialized
696 * `InvalidArg`, if either `Device` is invalid
697 * `NotSupported`, if this check is not supported by this `Device`
698 * `GpuLost`, if this `Device` has fallen off the bus or is otherwise inaccessible
699 * `Unknown`, on any unexpected error
700 */
701 // Checked against local
702 // Tested
703 #[doc(alias = "nvmlDeviceOnSameBoard")]
704 pub fn are_devices_on_same_board(
705 &self,
706 device1: &Device,
707 device2: &Device,
708 ) -> Result<bool, NvmlError> {
709 let sym = nvml_sym(self.lib.nvmlDeviceOnSameBoard.as_ref())?;
710
711 unsafe {
712 let mut bool_int: c_int = mem::zeroed();
713
714 nvml_try(sym(device1.handle(), device2.handle(), &mut bool_int))?;
715
716 match bool_int {
717 0 => Ok(false),
718 _ => Ok(true),
719 }
720 }
721 }
722
723 /**
724 Gets the set of GPUs that have a CPU affinity with the given CPU number.
725
726 # Errors
727
728 * `InvalidArg`, if `cpu_number` is invalid
729 * `NotSupported`, if this `Device` or the OS does not support this feature
730 * `Unknown`, an error has occurred in the underlying topology discovery
731
732 # Platform Support
733
734 Only supports Linux.
735 */
736 // Tested
737 #[cfg(target_os = "linux")]
738 #[doc(alias = "nvmlSystemGetTopologyGpuSet")]
739 pub fn topology_gpu_set(&self, cpu_number: u32) -> Result<Vec<Device<'_>>, NvmlError> {
740 let sym = nvml_sym(self.lib.nvmlSystemGetTopologyGpuSet.as_ref())?;
741
742 unsafe {
743 let mut count = match self.topology_gpu_set_count(cpu_number)? {
744 0 => return Ok(vec![]),
745 value => value,
746 };
747 let mut devices: Vec<nvmlDevice_t> = vec![mem::zeroed(); count as usize];
748
749 nvml_try(sym(cpu_number, &mut count, devices.as_mut_ptr()))?;
750
751 Ok(devices.into_iter().map(|d| Device::new(d, self)).collect())
752 }
753 }
754
755 // Helper function for the above.
756 #[cfg(target_os = "linux")]
757 fn topology_gpu_set_count(&self, cpu_number: u32) -> Result<c_uint, NvmlError> {
758 let sym = nvml_sym(self.lib.nvmlSystemGetTopologyGpuSet.as_ref())?;
759
760 unsafe {
761 // Indicates that we want the count
762 let mut count: c_uint = 0;
763
764 // Passing null doesn't indicate that we want the count, just allowed
765 nvml_try(sym(cpu_number, &mut count, ptr::null_mut()))?;
766
767 Ok(count)
768 }
769 }
770
771 /**
772 Gets the IDs and firmware versions for any Host Interface Cards in the system.
773
774 # Errors
775
776 * `Uninitialized`, if the library has not been successfully initialized
777
778 # Device Support
779
780 Supports S-class products.
781 */
782 // Checked against local
783 // Tested
784 #[doc(alias = "nvmlSystemGetHicVersion")]
785 pub fn hic_versions(&self) -> Result<Vec<HwbcEntry>, NvmlError> {
786 let sym = nvml_sym(self.lib.nvmlSystemGetHicVersion.as_ref())?;
787
788 unsafe {
789 let mut count: c_uint = match self.hic_count()? {
790 0 => return Ok(vec![]),
791 value => value,
792 };
793 let mut hics: Vec<nvmlHwbcEntry_t> = vec![mem::zeroed(); count as usize];
794
795 nvml_try(sym(&mut count, hics.as_mut_ptr()))?;
796
797 hics.into_iter().map(HwbcEntry::try_from).collect()
798 }
799 }
800
801 /**
802 Gets the count of Host Interface Cards in the system.
803
804 # Errors
805
806 * `Uninitialized`, if the library has not been successfully initialized
807
808 # Device Support
809
810 Supports S-class products.
811 */
812 // Tested as part of the above method
813 #[doc(alias = "nvmlSystemGetHicVersion")]
814 pub fn hic_count(&self) -> Result<u32, NvmlError> {
815 let sym = nvml_sym(self.lib.nvmlSystemGetHicVersion.as_ref())?;
816
817 unsafe {
818 /*
819 NVIDIA doesn't even say that `count` will be set to the count if
820 `InsufficientSize` is returned. But we can assume sanity, right?
821
822 The idea here is:
823 If there are 0 HICs, NVML_SUCCESS is returned, `count` is set
824 to 0. We return count, all good.
825 If there is 1 HIC, NVML_SUCCESS is returned, `count` is set to
826 1. We return count, all good.
827 If there are >= 2 HICs, NVML_INSUFFICIENT_SIZE is returned.
828 `count` is theoretically set to the actual count, and we
829 return it.
830 */
831 let mut count: c_uint = 1;
832 let mut hics: [nvmlHwbcEntry_t; 1] = [mem::zeroed()];
833
834 match sym(&mut count, hics.as_mut_ptr()) {
835 nvmlReturn_enum_NVML_SUCCESS | nvmlReturn_enum_NVML_ERROR_INSUFFICIENT_SIZE => {
836 Ok(count)
837 }
838 // We know that this will be an error
839 other => nvml_try(other).map(|_| 0),
840 }
841 }
842 }
843
844 /**
845 Gets the number of units in the system.
846
847 # Errors
848
849 * `Uninitialized`, if the library has not been successfully initialized
850 * `Unknown`, on any unexpected error
851
852 # Device Support
853
854 Supports S-class products.
855 */
856 // Checked against local
857 // Tested
858 #[doc(alias = "nvmlUnitGetCount")]
859 pub fn unit_count(&self) -> Result<u32, NvmlError> {
860 let sym = nvml_sym(self.lib.nvmlUnitGetCount.as_ref())?;
861
862 unsafe {
863 let mut count: c_uint = mem::zeroed();
864 nvml_try(sym(&mut count))?;
865
866 Ok(count)
867 }
868 }
869
870 /**
871 Create an empty set of events.
872
873 # Errors
874
875 * `Uninitialized`, if the library has not been successfully initialized
876 * `Unknown`, on any unexpected error
877
878 # Device Support
879
880 Supports Fermi and newer fully supported devices.
881 */
882 // Checked against local
883 // Tested
884 #[doc(alias = "nvmlEventSetCreate")]
885 pub fn create_event_set(&self) -> Result<EventSet<'_>, NvmlError> {
886 let sym = nvml_sym(self.lib.nvmlEventSetCreate.as_ref())?;
887
888 unsafe {
889 let mut set: nvmlEventSet_t = mem::zeroed();
890 nvml_try(sym(&mut set))?;
891
892 Ok(EventSet::new(set, self))
893 }
894 }
895
896 /**
897 Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI
898 subsystem in search of GPUs that were previously removed.
899
900 The portion of the PCI tree can be narrowed by specifying a domain, bus, and
901 device in the passed-in `pci_info`. **If all of these fields are zeroes, the
902 entire PCI tree will be searched.** Note that for long-running NVML processes,
903 the enumeration of devices will change based on how many GPUs are discovered
904 and where they are inserted in bus order.
905
906 All newly discovered GPUs will be initialized and have their ECC scrubbed which
907 may take several seconds per GPU. **All device handles are no longer guaranteed
908 to be valid post discovery**. I am not sure if this means **all** device
909 handles, literally, or if NVIDIA is referring to handles that had previously
910 been obtained to devices that were then removed and have now been
911 re-discovered.
912
913 Must be run as administrator.
914
915 # Errors
916
917 * `Uninitialized`, if the library has not been successfully initialized
918 * `OperatingSystem`, if the operating system is denying this feature
919 * `NoPermission`, if the calling process has insufficient permissions to
920 perform this operation
921 * `NulError`, if an issue is encountered when trying to convert a Rust
922 `String` into a `CString`.
923 * `Unknown`, on any unexpected error
924
925 # Device Support
926
927 Supports Pascal and newer fully supported devices.
928
929 Some Kepler devices are also supported (that's all NVIDIA says, no specifics).
930
931 # Platform Support
932
933 Only supports Linux.
934 */
935 // TODO: constructor for default pci_infos ^
936 // Checked against local
937 // Tested
938 #[cfg(target_os = "linux")]
939 #[doc(alias = "nvmlDeviceDiscoverGpus")]
940 pub fn discover_gpus(&self, pci_info: PciInfo) -> Result<(), NvmlError> {
941 let sym = nvml_sym(self.lib.nvmlDeviceDiscoverGpus.as_ref())?;
942
943 unsafe { nvml_try(sym(&mut pci_info.try_into()?)) }
944 }
945
946 /**
947 Gets the number of excluded GPU devices in the system.
948
949 # Device Support
950
951 Supports all devices.
952 */
953 #[doc(alias = "nvmlGetExcludedDeviceCount")]
954 pub fn excluded_device_count(&self) -> Result<u32, NvmlError> {
955 let sym = nvml_sym(self.lib.nvmlGetExcludedDeviceCount.as_ref())?;
956
957 unsafe {
958 let mut count: c_uint = mem::zeroed();
959
960 nvml_try(sym(&mut count))?;
961 Ok(count)
962 }
963 }
964
965 /**
966 Gets information for the specified excluded device.
967
968 # Errors
969
970 * `InvalidArg`, if the given index is invalid
971 * `Utf8Error`, if strings obtained from the C function are not valid Utf8
972
973 # Device Support
974
975 Supports all devices.
976 */
977 #[doc(alias = "nvmlGetExcludedDeviceInfoByIndex")]
978 pub fn excluded_device_info(&self, index: u32) -> Result<ExcludedDeviceInfo, NvmlError> {
979 let sym = nvml_sym(self.lib.nvmlGetExcludedDeviceInfoByIndex.as_ref())?;
980
981 unsafe {
982 let mut info: nvmlExcludedDeviceInfo_t = mem::zeroed();
983
984 nvml_try(sym(index, &mut info))?;
985 ExcludedDeviceInfo::try_from(info)
986 }
987 }
988
989 /**
990 Gets the loaded vGPU list of capabilities
991
992 # Errors
993
994 * `Uninitialized`, if the library has not been successfully initialized
995 * `Unknown`, on any unexpected error
996
997 # Device Support
998
999 Supports all devices.
1000 */
1001 #[doc(alias = "nvmlGetVgpuDriverCapabilities")]
1002 pub fn vgpu_driver_capabilities(
1003 &self,
1004 capability: nvmlVgpuDriverCapability_t,
1005 ) -> Result<u32, NvmlError> {
1006 let sym = nvml_sym(self.lib.nvmlGetVgpuDriverCapabilities.as_ref())?;
1007
1008 unsafe {
1009 let mut mask: u32 = mem::zeroed();
1010
1011 nvml_try(sym(capability, &mut mask))?;
1012 Ok(mask)
1013 }
1014 }
1015
1016 /**
1017 Get the supported and actual vGPU versions range.
1018
1019 # Errors
1020
1021 * `Uninitialized`, if the library has not been successfully initialized
1022 * `Unknown`, on any unexpected error
1023
1024 # Device Support
1025 */
1026 #[doc(alias = "nvmlGetVgpuVersion")]
1027 pub fn vgpu_version(&self) -> Result<(VgpuVersion, VgpuVersion), NvmlError> {
1028 let sym = nvml_sym(self.lib.nvmlGetVgpuVersion.as_ref())?;
1029
1030 unsafe {
1031 let mut supported: nvmlVgpuVersion_t = mem::zeroed();
1032 let mut current: nvmlVgpuVersion_t = mem::zeroed();
1033
1034 nvml_try(sym(&mut supported, &mut current))?;
1035 Ok((VgpuVersion::from(supported), VgpuVersion::from(current)))
1036 }
1037 }
1038
1039 #[doc(alias = "nvmlSetVgpuVersion")]
1040 pub fn set_vgpu_version(&self, version: VgpuVersion) -> Result<(), NvmlError> {
1041 let sym = nvml_sym(self.lib.nvmlSetVgpuVersion.as_ref())?;
1042
1043 unsafe { nvml_try(sym(&mut version.as_c())) }
1044 }
1045}
1046
/// Calls `nvmlShutdown` and then drops the wrapped library handle when an
/// `Nvml` value goes out of scope.
///
/// This `Drop` implementation ignores errors! Use the `.shutdown()` method on
/// the `Nvml` struct if you care about handling them.
impl Drop for Nvml {
    #[doc(alias = "nvmlShutdown")]
    fn drop(&mut self) {
        unsafe {
            // The value returned by `nvmlShutdown` is deliberately discarded;
            // `drop` has no way to hand a failure back to the caller.
            self.lib.nvmlShutdown();

            // SAFETY: called after the last usage of `self.lib`
            ManuallyDrop::drop(&mut self.lib);
        }
    }
}
1061
/**
A builder struct that provides further flexibility in how NVML is initialized.

# Examples

Initialize NVML with a non-default name for the shared object file:

```
use nvml_wrapper::Nvml;
use std::ffi::OsStr;

let init_result = Nvml::builder().lib_path(OsStr::new("libnvidia-ml-other-name.so")).init();
```

Initialize NVML with a non-default path to the shared object file:

```
use nvml_wrapper::Nvml;
use std::ffi::OsStr;

let init_result = Nvml::builder().lib_path(OsStr::new("/some/path/to/libnvidia-ml.so")).init();
```
*/
#[derive(Debug, Clone, Eq, PartialEq, Default)]
pub struct NvmlBuilder<'a> {
    /// Path (or file name) of the NVML library to load. When `None`, `init`
    /// falls back to `LIB_PATH`.
    lib_path: Option<&'a OsStr>,
    /// Flags to initialize NVML with. When empty, `init` performs a plain
    /// initialization without flags.
    flags: InitFlags,
}
1090
1091impl<'a> NvmlBuilder<'a> {
1092 /**
1093 Set the path to the NVML lib file.
1094
1095 See [`libloading`'s docs][libloading] for details about how this lib path is
1096 handled.
1097
1098 [libloading]: https://docs.rs/libloading/0.6.6/libloading/struct.Library.html#method.new
1099 */
1100 pub fn lib_path(&mut self, path: &'a OsStr) -> &mut Self {
1101 self.lib_path = Some(path);
1102 self
1103 }
1104
1105 /// Set the `InitFlags` to initialize NVML with.
1106 pub fn flags(&mut self, flags: InitFlags) -> &mut Self {
1107 self.flags = flags;
1108 self
1109 }
1110
1111 /// Perform initialization.
1112 pub fn init(&self) -> Result<Nvml, NvmlError> {
1113 let lib_path = self.lib_path.unwrap_or_else(|| LIB_PATH.as_ref());
1114
1115 if self.flags.is_empty() {
1116 Nvml::init_internal(lib_path)
1117 } else {
1118 Nvml::init_with_flags_internal(lib_path, self.flags)
1119 }
1120 }
1121}
1122
/// Tests for the top-level `Nvml` API.
///
/// Most tests go through the `test` / `test_with_device` helpers from
/// `crate::test_utils`, passing a count and a closure that performs the call
/// under test.
#[cfg(test)]
mod test {
    use super::*;
    use crate::bitmasks::InitFlags;
    use crate::error::NvmlError;
    use crate::test_utils::*;

    #[test]
    fn init_with_flags() {
        Nvml::init_with_flags(InitFlags::NO_GPUS).unwrap();
    }

    #[test]
    fn shutdown() {
        test(3, || nvml().shutdown())
    }

    #[test]
    fn device_count() {
        test(3, || nvml().device_count())
    }

    #[test]
    fn sys_driver_version() {
        test(3, || nvml().sys_driver_version())
    }

    #[test]
    fn sys_nvml_version() {
        test(3, || nvml().sys_nvml_version())
    }

    #[test]
    fn sys_cuda_driver_version() {
        test(3, || nvml().sys_cuda_driver_version())
    }

    #[test]
    fn sys_cuda_driver_version_major() {
        test(3, || {
            Ok(cuda_driver_version_major(nvml().sys_cuda_driver_version()?))
        })
    }

    #[test]
    fn sys_cuda_driver_version_minor() {
        test(3, || {
            Ok(cuda_driver_version_minor(nvml().sys_cuda_driver_version()?))
        })
    }

    #[test]
    fn sys_process_name() {
        let nvml = nvml();
        test_with_device(3, &nvml, |device| {
            let processes = device.running_graphics_processes()?;

            // A machine without any running graphics process has nothing to
            // look up; report that instead of panicking on an out-of-bounds
            // index.
            let process = match processes.first() {
                Some(process) => process,
                None => return Ok("No running graphics processes".into()),
            };

            match nvml.sys_process_name(process.pid, 64) {
                Err(NvmlError::NoPermission) => Ok("No permission error".into()),
                v => v,
            }
        })
    }

    #[test]
    fn device_by_index() {
        let nvml = nvml();
        test(3, || nvml.device_by_index(0))
    }

    #[test]
    fn device_by_pci_bus_id() {
        let nvml = nvml();
        test_with_device(3, &nvml, |device| {
            let id = device.pci_info()?.bus_id;
            nvml.device_by_pci_bus_id(id)
        })
    }

    // Can't get serial on my machine
    #[ignore = "my machine does not support this call"]
    #[test]
    fn device_by_serial() {
        let nvml = nvml();

        #[allow(deprecated)]
        test_with_device(3, &nvml, |device| {
            let serial = device.serial()?;
            nvml.device_by_serial(serial)
        })
    }

    #[test]
    fn device_by_uuid() {
        let nvml = nvml();
        test_with_device(3, &nvml, |device| {
            let uuid = device.uuid()?;
            nvml.device_by_uuid(uuid)
        })
    }

    // I don't have 2 devices
    #[ignore = "my machine does not support this call"]
    #[cfg(target_os = "linux")]
    #[test]
    fn topology_common_ancestor() {
        let nvml = nvml();
        let device1 = device(&nvml);
        let device2 = nvml.device_by_index(1).expect("device");

        nvml.topology_common_ancestor(&device1, &device2)
            .expect("TopologyLevel");
    }

    // Errors on my machine
    #[test]
    #[ignore = "my machine does not support this call"]
    fn unit_by_index() {
        let nvml = nvml();
        test(3, || nvml.unit_by_index(0))
    }

    // I don't have 2 devices
    #[ignore = "my machine does not support this call"]
    #[test]
    fn are_devices_on_same_board() {
        let nvml = nvml();
        let device1 = device(&nvml);
        let device2 = nvml.device_by_index(1).expect("device");

        nvml.are_devices_on_same_board(&device1, &device2)
            .expect("bool");
    }

    #[cfg(target_os = "linux")]
    #[test]
    fn topology_gpu_set() {
        let nvml = nvml();
        test(3, || nvml.topology_gpu_set(0))
    }

    #[test]
    fn hic_version() {
        let nvml = nvml();
        test(3, || nvml.hic_versions())
    }

    #[test]
    fn unit_count() {
        test(3, || nvml().unit_count())
    }

    #[test]
    fn create_event_set() {
        let nvml = nvml();
        test(3, || nvml.create_event_set())
    }

    // NOTE(review): a `NoPermission` result panics with a message that does
    // not contain the `expected` string, so that outcome fails this test. Only
    // a panic mentioning `OperatingSystem` (presumably raised by the
    // `test_with_device` helper on an `Err`) satisfies it. Confirm that this
    // is the intended behaviour.
    #[cfg(target_os = "linux")]
    #[should_panic(expected = "OperatingSystem")]
    #[test]
    fn discover_gpus() {
        let nvml = nvml();
        test_with_device(3, &nvml, |device| {
            let pci_info = device.pci_info()?;

            // We don't test with admin perms and therefore expect an error
            match nvml.discover_gpus(pci_info) {
                Err(NvmlError::NoPermission) => panic!("NoPermission"),
                other => other,
            }
        })
    }

    #[test]
    fn excluded_device_count() {
        let nvml = nvml();
        test(3, || nvml.excluded_device_count())
    }

    #[test]
    fn excluded_device_info() {
        let nvml = nvml();

        // Index 0 is only valid when at least one device is excluded.
        if nvml.excluded_device_count().unwrap() > 0 {
            test(3, || nvml.excluded_device_info(0))
        }
    }

    #[test]
    fn vgpu_driver_capabilities() {
        let nvml = nvml();
        test(3, || nvml.vgpu_driver_capabilities(1))
    }

    #[test]
    fn vgpu_version() {
        let nvml = nvml();
        test(3, || nvml.vgpu_version())
    }

    #[test]
    fn set_vgpu_version() {
        let nvml = nvml();
        test(3, || nvml.set_vgpu_version(VgpuVersion { min: 0, max: 0 }))
    }
}