nvml_wrapper/
lib.rs

1/*!
2A safe and ergonomic Rust wrapper for the [NVIDIA Management Library][nvml] (NVML),
3a C-based programmatic interface for monitoring and managing various states within
4NVIDIA GPUs.
5
6```
7use nvml_wrapper::Nvml;
8# use nvml_wrapper::error::*;
9# fn test() -> Result<(), NvmlError> {
10
11let nvml = Nvml::init()?;
12// Get the first `Device` (GPU) in the system
13let device = nvml.device_by_index(0)?;
14
15let brand = device.brand()?; // GeForce on my system
16let fan_speed = device.fan_speed(0)?; // Currently 17% on my system
17let power_limit = device.enforced_power_limit()?; // 275k milliwatts on my system
18let encoder_util = device.encoder_utilization()?; // Currently 0 on my system; Not encoding anything
19let memory_info = device.memory_info()?; // Currently 1.63/6.37 GB used on my system
20
21// ... and there's a whole lot more you can do. Most everything in NVML is wrapped and ready to go
22# Ok(())
23# }
24```
25
26NVML is intended to be a platform for building 3rd-party applications, and is
27also the underlying library for NVIDIA's nvidia-smi tool.
28
29## Usage
30
31`nvml-wrapper` builds on top of generated bindings for NVML that make use of the
32[`libloading`][libloading] crate. This means the NVML library gets loaded upon
33calling `Nvml::init` and can return an error if NVML isn't present, making it
34possible to drop NVIDIA-related features in your code at runtime on systems that
35don't have relevant hardware.
36
37Successful execution of `Nvml::init` means:
38
39* The NVML library was present on the system and able to be opened
40* The function symbol to initialize NVML was loaded and called successfully
41* An attempt has been made to load all other NVML function symbols
42
43Every function you call thereafter will individually return an error if it couldn't
44be loaded from the NVML library during the `Nvml::init` call.
45
46Note that it's not advised to repeatedly call `Nvml::init` as the constructor
47has to perform all the work of loading the function symbols from the library
48each time it gets called. Instead, call `Nvml::init` once and store the resulting
49`Nvml` instance somewhere to be accessed throughout the lifetime of your program
50(perhaps in a [`once_cell`][once_cell]).
51
52## NVML Support
53
54This wrapper is being developed against and currently supports NVML version
5511. Each new version of NVML is guaranteed to be backwards-compatible according
56to NVIDIA, so this wrapper should continue to work without issue regardless of
57NVML version bumps.
58
59### Legacy Functions
60
61Sometimes there will be function-level API version bumps in new NVML releases.
62For example:
63
64```text
65nvmlDeviceGetComputeRunningProcesses
66nvmlDeviceGetComputeRunningProcesses_v2
67nvmlDeviceGetComputeRunningProcesses_v3
68```
69
70The older versions of the functions will generally continue to work with the
71newer NVML releases; however, the newer function versions will not work with
72older NVML installs.
73
74By default this wrapper only provides access to the newest function versions.
75Enable the `legacy-functions` feature if you require the ability to call older
76functions.
77
78## MSRV
79
80The Minimum Supported Rust Version is currently 1.51.0. I will not go out of my
81way to avoid bumping this.
82
83## Cargo Features
84
85The `serde` feature can be toggled on in order to `#[derive(Serialize, Deserialize)]`
86for every NVML data structure.
87
88[nvml]: https://developer.nvidia.com/nvidia-management-library-nvml
89[libloading]: https://github.com/nagisa/rust_libloading
90[once_cell]: https://docs.rs/once_cell/latest/once_cell/sync/struct.Lazy.html
91*/
92
93#![recursion_limit = "1024"]
94#![allow(non_upper_case_globals)]
95
96extern crate libloading;
97extern crate nvml_wrapper_sys as ffi;
98
99pub mod bitmasks;
100pub mod device;
101pub mod enum_wrappers;
102pub mod enums;
103pub mod error;
104pub mod event;
105pub mod gpm;
106pub mod high_level;
107pub mod nv_link;
108pub mod struct_wrappers;
109pub mod structs;
110#[cfg(test)]
111mod test_utils;
112pub mod unit;
113pub mod vgpu;
114
115// Re-exports for convenience
116pub use crate::device::Device;
117pub use crate::event::EventSet;
118pub use crate::gpm::GpmSample;
119pub use crate::nv_link::NvLink;
120pub use crate::unit::Unit;
121
122/// Re-exports from `nvml-wrapper-sys` that are necessary for use of this wrapper.
123pub mod sys_exports {
124    /// Use these constants to populate the `structs::device::FieldId` newtype.
125    pub mod field_id {
126        pub use crate::ffi::bindings::field_id::*;
127    }
128}
129
130#[cfg(target_os = "linux")]
131use std::convert::TryInto;
132#[cfg(target_os = "linux")]
133use std::ptr;
134use std::{
135    convert::TryFrom,
136    ffi::{CStr, CString, OsStr},
137    mem::{self, ManuallyDrop},
138    os::raw::{c_int, c_uint},
139};
140
141use static_assertions::assert_impl_all;
142
143#[cfg(target_os = "linux")]
144use crate::enum_wrappers::device::TopologyLevel;
145
146use crate::error::{nvml_sym, nvml_try, NvmlError};
147use crate::ffi::bindings::*;
148
149use crate::struct_wrappers::ExcludedDeviceInfo;
150
151#[cfg(target_os = "linux")]
152use crate::struct_wrappers::device::PciInfo;
153use crate::struct_wrappers::device::VgpuVersion;
154use crate::struct_wrappers::unit::HwbcEntry;
155
156use crate::bitmasks::InitFlags;
157
/// File name of the NVML shared library that is loaded by default on Linux.
#[cfg(target_os = "linux")]
const LIB_PATH: &str = "libnvidia-ml.so.1";

/// File name of the NVML shared library that is loaded by default on every
/// target other than Linux.
#[cfg(not(target_os = "linux"))]
const LIB_PATH: &str = "nvml.dll";
163
/// Extracts the major component from a full CUDA driver version number.
///
/// The full version is the value returned by `Nvml::sys_cuda_driver_version()`;
/// the major component is that value divided by 1000 (`11040` yields `11`).
pub fn cuda_driver_version_major(version: i32) -> i32 {
    const MAJOR_DIVISOR: i32 = 1000;

    version / MAJOR_DIVISOR
}
170
/// Extracts the minor component from a full CUDA driver version number.
///
/// The full version is the value returned by `Nvml::sys_cuda_driver_version()`;
/// the minor component is what remains below the thousands, divided by 10
/// (`11040` yields `4`).
pub fn cuda_driver_version_minor(version: i32) -> i32 {
    const MAJOR_DIVISOR: i32 = 1000;
    const MINOR_DIVISOR: i32 = 10;

    let below_major = version % MAJOR_DIVISOR;
    below_major / MINOR_DIVISOR
}
177
178/**
179The main struct that this library revolves around.
180
181According to NVIDIA's documentation, "It is the user's responsibility to call `nvmlInit()`
182before calling any other methods, and `nvmlShutdown()` once NVML is no longer being used."
183This struct is used to enforce those rules.
184
185Also according to NVIDIA's documentation, "NVML is thread-safe so it is safe to make
186simultaneous NVML calls from multiple threads." In the Rust world, this translates to `NVML`
187being `Send` + `Sync`. You can `.clone()` an `Arc` wrapped `NVML` and enjoy using it on any thread.
188
189NOTE: If you care about possible errors returned from `nvmlShutdown()`, use the `.shutdown()`
190method on this struct. **The `Drop` implementation ignores errors.**
191
192When reading documentation on this struct and its members, remember that a lot of it,
193especially in regards to errors returned, is copied from NVIDIA's docs. While they can be found
194online [here](http://docs.nvidia.com/deploy/nvml-api/index.html), the hosted docs sometimes outdated
195and may not accurately reflect the version of NVML that this library is written for; beware. You
196should ideally read the doc comments on an up-to-date NVML API header. Such a header can be
197downloaded as part of the [CUDA toolkit](https://developer.nvidia.com/cuda-downloads).
198*/
199pub struct Nvml {
200    lib: ManuallyDrop<NvmlLib>,
201}
202
203assert_impl_all!(Nvml: Send, Sync);
204
205impl std::fmt::Debug for Nvml {
206    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
207        f.write_str("NVML")
208    }
209}
210
211impl Nvml {
212    /**
213    Handles NVML initialization and must be called before doing anything else.
214
215    While it is possible to initialize `NVML` multiple times (NVIDIA's docs state
216    that reference counting is used internally), you should strive to initialize
217    `NVML` once at the start of your program's execution; the constructors handle
218    dynamically loading function symbols from the `NVML` lib and are therefore
219    somewhat expensive.
220
221    Note that this will initialize NVML but not any GPUs. This means that NVML can
222    communicate with a GPU even when other GPUs in a system are bad or unstable.
223
224    By default, initialization looks for "libnvidia-ml.so" on linux and "nvml.dll"
225    on Windows. These default names should work for default installs on those
226    platforms; if further specification is required, use `Nvml::builder`.
227
228    # Errors
229
230    * `DriverNotLoaded`, if the NVIDIA driver is not running
231    * `NoPermission`, if NVML does not have permission to talk to the driver
232    * `Unknown`, on any unexpected error
233    */
234    // Checked against local
235    #[doc(alias = "nvmlInit_v2")]
236    pub fn init() -> Result<Self, NvmlError> {
237        Self::init_internal(LIB_PATH)
238    }
239
240    fn init_internal(path: impl AsRef<std::ffi::OsStr>) -> Result<Self, NvmlError> {
241        let lib = unsafe {
242            let lib = NvmlLib::new(path)?;
243            let sym = nvml_sym(lib.nvmlInit_v2.as_ref())?;
244
245            nvml_try(sym())?;
246            ManuallyDrop::new(lib)
247        };
248
249        Ok(Self { lib })
250    }
251
252    /**
253    An initialization function that allows you to pass flags to control certain behaviors.
254
255    This is the same as `init()` except for the addition of flags.
256
257    # Errors
258
259    * `DriverNotLoaded`, if the NVIDIA driver is not running
260    * `NoPermission`, if NVML does not have permission to talk to the driver
261    * `Unknown`, on any unexpected error
262
263    # Examples
264
265    ```
266    # use nvml_wrapper::Nvml;
267    # use nvml_wrapper::error::*;
268    use nvml_wrapper::bitmasks::InitFlags;
269
270    # fn main() -> Result<(), NvmlError> {
271    // Don't fail if the system doesn't have any NVIDIA GPUs
272    //
273    // Also, don't attach any GPUs during initialization
274    Nvml::init_with_flags(InitFlags::NO_GPUS | InitFlags::NO_ATTACH)?;
275    # Ok(())
276    # }
277    ```
278    */
279    #[doc(alias = "nvmlInitWithFlags")]
280    pub fn init_with_flags(flags: InitFlags) -> Result<Self, NvmlError> {
281        Self::init_with_flags_internal(LIB_PATH, flags)
282    }
283
284    fn init_with_flags_internal(
285        path: impl AsRef<std::ffi::OsStr>,
286        flags: InitFlags,
287    ) -> Result<Self, NvmlError> {
288        let lib = unsafe {
289            let lib = NvmlLib::new(path)?;
290            let sym = nvml_sym(lib.nvmlInitWithFlags.as_ref())?;
291
292            nvml_try(sym(flags.bits()))?;
293            ManuallyDrop::new(lib)
294        };
295
296        Ok(Self { lib })
297    }
298
299    /// Create an `NvmlBuilder` for further flexibility in how NVML is initialized.
300    pub fn builder<'a>() -> NvmlBuilder<'a> {
301        NvmlBuilder::default()
302    }
303
304    /// Get the underlying `NvmlLib` instance.
305    pub fn lib(&self) -> &NvmlLib {
306        &self.lib
307    }
308
309    /**
310    Use this to shutdown NVML and release allocated resources if you care about handling
311    potential errors (*the `Drop` implementation ignores errors!*).
312
313    # Errors
314
315    * `Uninitialized`, if the library has not been successfully initialized
316    * `Unknown`, on any unexpected error
317    */
318    // Thanks to `sorear` on IRC for suggesting this approach
319    // Checked against local
320    // Tested
321    #[doc(alias = "nvmlShutdown")]
322    pub fn shutdown(mut self) -> Result<(), NvmlError> {
323        let sym = nvml_sym(self.lib.nvmlShutdown.as_ref())?;
324
325        unsafe {
326            nvml_try(sym())?;
327        }
328
329        // SAFETY: we `mem::forget(self)` after this, so `self.lib` won't get
330        // touched by our `Drop` impl
331        let lib = unsafe { ManuallyDrop::take(&mut self.lib) };
332        mem::forget(self);
333
334        Ok(lib.__library.close()?)
335    }
336
337    /**
338    Get the number of compute devices in the system (compute device == one GPU).
339
340    Note that this count can include devices you do not have permission to access.
341
342    # Errors
343
344    * `Uninitialized`, if the library has not been successfully initialized
345    * `Unknown`, on any unexpected error
346    */
347    // Checked against local
348    // Tested
349    #[doc(alias = "nvmlDeviceGetCount_v2")]
350    pub fn device_count(&self) -> Result<u32, NvmlError> {
351        let sym = nvml_sym(self.lib.nvmlDeviceGetCount_v2.as_ref())?;
352
353        unsafe {
354            let mut count: c_uint = mem::zeroed();
355            nvml_try(sym(&mut count))?;
356
357            Ok(count)
358        }
359    }
360
361    /**
362    Gets the version of the system's graphics driver and returns it as an alphanumeric
363    string.
364
365    # Errors
366
367    * `Uninitialized`, if the library has not been successfully initialized
368    * `Utf8Error`, if the string obtained from the C function is not valid Utf8
369    */
370    // Checked against local
371    // Tested
372    #[doc(alias = "nvmlSystemGetDriverVersion")]
373    pub fn sys_driver_version(&self) -> Result<String, NvmlError> {
374        let sym = nvml_sym(self.lib.nvmlSystemGetDriverVersion.as_ref())?;
375
376        unsafe {
377            let mut version_vec = vec![0; NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE as usize];
378
379            nvml_try(sym(
380                version_vec.as_mut_ptr(),
381                NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE,
382            ))?;
383
384            let version_raw = CStr::from_ptr(version_vec.as_ptr());
385            Ok(version_raw.to_str()?.into())
386        }
387    }
388
389    /**
390    Gets the version of the system's NVML library and returns it as an alphanumeric
391    string.
392
393    # Errors
394
395    * `Utf8Error`, if the string obtained from the C function is not valid Utf8
396    */
397    // Checked against local
398    // Tested
399    #[doc(alias = "nvmlSystemGetNVMLVersion")]
400    pub fn sys_nvml_version(&self) -> Result<String, NvmlError> {
401        let sym = nvml_sym(self.lib.nvmlSystemGetNVMLVersion.as_ref())?;
402
403        unsafe {
404            let mut version_vec = vec![0; NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE as usize];
405
406            nvml_try(sym(
407                version_vec.as_mut_ptr(),
408                NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE,
409            ))?;
410
411            // Thanks to `Amaranth` on IRC for help with this
412            let version_raw = CStr::from_ptr(version_vec.as_ptr());
413            Ok(version_raw.to_str()?.into())
414        }
415    }
416
417    /**
418    Gets the version of the system's CUDA driver.
419
420    Calls into the CUDA library (cuDriverGetVersion()).
421
422    You can use `cuda_driver_version_major` and `cuda_driver_version_minor`
423    to get the major and minor driver versions from this number.
424
425    # Errors
426
427    * `FunctionNotFound`, if cuDriverGetVersion() is not found in the shared library
428    * `LibraryNotFound`, if libcuda.so.1 or libcuda.dll cannot be found
429    */
430    #[doc(alias = "nvmlSystemGetCudaDriverVersion_v2")]
431    pub fn sys_cuda_driver_version(&self) -> Result<i32, NvmlError> {
432        let sym = nvml_sym(self.lib.nvmlSystemGetCudaDriverVersion_v2.as_ref())?;
433
434        unsafe {
435            let mut version: c_int = mem::zeroed();
436            nvml_try(sym(&mut version))?;
437
438            Ok(version)
439        }
440    }
441
442    /**
443    Gets the name of the process for the given process ID, cropped to the provided length.
444
445    # Errors
446
447    * `Uninitialized`, if the library has not been successfully initialized
448    * `InvalidArg`, if the length is 0 (if this is returned without length being 0, file an issue)
449    * `NotFound`, if the process does not exist
450    * `NoPermission`, if the user doesn't have permission to perform the operation
451    * `Utf8Error`, if the string obtained from the C function is not valid UTF-8. NVIDIA's docs say
452      that the string encoding is ANSI, so this may very well happen.
453    * `Unknown`, on any unexpected error
454    */
455    // TODO: The docs say the string is ANSI-encoded. Not sure if I should try
456    // to do anything about that
457    // Checked against local
458    // Tested
459    #[doc(alias = "nvmlSystemGetProcessName")]
460    pub fn sys_process_name(&self, pid: u32, length: usize) -> Result<String, NvmlError> {
461        let sym = nvml_sym(self.lib.nvmlSystemGetProcessName.as_ref())?;
462
463        unsafe {
464            let mut name_vec = vec![0; length];
465
466            nvml_try(sym(pid, name_vec.as_mut_ptr(), length as c_uint))?;
467
468            let name_raw = CStr::from_ptr(name_vec.as_ptr());
469            Ok(name_raw.to_str()?.into())
470        }
471    }
472
473    /**
474    Acquire the handle for a particular device based on its index (starts at 0).
475
476    Usage of this function causes NVML to initialize the target GPU. Additional
477    GPUs may be initialized if the target GPU is an SLI slave.
478
479    You can determine valid indices by using `.device_count()`. This
480    function doesn't call that for you, but the actual C function to get
481    the device handle will return an error in the case of an invalid index.
482    This means that the `InvalidArg` error will be returned if you pass in
483    an invalid index.
484
485    NVIDIA's docs state that "The order in which NVML enumerates devices has
486    no guarantees of consistency between reboots. For that reason it is recommended
487    that devices be looked up by their PCI ids or UUID." In this library, that translates
488    into usage of `.device_by_uuid()` and `.device_by_pci_bus_id()`.
489
490    The NVML index may not correlate with other APIs such as the CUDA device index.
491
492    # Errors
493
494    * `Uninitialized`, if the library has not been successfully initialized
495    * `InvalidArg`, if index is invalid
496    * `InsufficientPower`, if any attached devices have improperly attached external power cables
497    * `NoPermission`, if the user doesn't have permission to talk to this device
498    * `IrqIssue`, if the NVIDIA kernel detected an interrupt issue with the attached GPUs
499    * `GpuLost`, if the target GPU has fallen off the bus or is otherwise inaccessible
500    * `Unknown`, on any unexpected error
501    */
502    // Checked against local
503    // Tested
504    #[doc(alias = "nvmlDeviceGetHandleByIndex_v2")]
505    pub fn device_by_index(&self, index: u32) -> Result<Device<'_>, NvmlError> {
506        let sym = nvml_sym(self.lib.nvmlDeviceGetHandleByIndex_v2.as_ref())?;
507
508        unsafe {
509            let mut device: nvmlDevice_t = mem::zeroed();
510            nvml_try(sym(index, &mut device))?;
511
512            Ok(Device::new(device, self))
513        }
514    }
515
516    /**
517    Acquire the handle for a particular device based on its PCI bus ID.
518
519    Usage of this function causes NVML to initialize the target GPU. Additional
520    GPUs may be initialized if the target GPU is an SLI slave.
521
522    The bus ID corresponds to the `bus_id` returned by `Device.pci_info()`.
523
524    # Errors
525
526    * `Uninitialized`, if the library has not been successfully initialized
527    * `InvalidArg`, if `pci_bus_id` is invalid
528    * `NotFound`, if `pci_bus_id` does not match a valid device on the system
529    * `InsufficientPower`, if any attached devices have improperly attached external power cables
530    * `NoPermission`, if the user doesn't have permission to talk to this device
531    * `IrqIssue`, if the NVIDIA kernel detected an interrupt issue with the attached GPUs
532    * `GpuLost`, if the target GPU has fallen off the bus or is otherwise inaccessible
533    * `NulError`, for which you can read the docs on `std::ffi::NulError`
534    * `Unknown`, on any unexpected error
535    */
536    // Checked against local
537    // Tested
538    #[doc(alias = "nvmlDeviceGetHandleByPciBusId_v2")]
539    pub fn device_by_pci_bus_id<S: AsRef<str>>(
540        &self,
541        pci_bus_id: S,
542    ) -> Result<Device<'_>, NvmlError>
543    where
544        Vec<u8>: From<S>,
545    {
546        let sym = nvml_sym(self.lib.nvmlDeviceGetHandleByPciBusId_v2.as_ref())?;
547
548        unsafe {
549            let c_string = CString::new(pci_bus_id)?;
550            let mut device: nvmlDevice_t = mem::zeroed();
551
552            nvml_try(sym(c_string.as_ptr(), &mut device))?;
553
554            Ok(Device::new(device, self))
555        }
556    }
557
558    /// Not documenting this because it's deprecated and does not seem to work
559    /// anymore.
560    // Tested (for an error)
561    #[deprecated(note = "use `.device_by_uuid()`, this errors on dual GPU boards")]
562    #[doc(alias = "nvmlDeviceGetHandleBySerial")]
563    pub fn device_by_serial<S: AsRef<str>>(&self, board_serial: S) -> Result<Device<'_>, NvmlError>
564    where
565        Vec<u8>: From<S>,
566    {
567        let sym = nvml_sym(self.lib.nvmlDeviceGetHandleBySerial.as_ref())?;
568
569        unsafe {
570            let c_string = CString::new(board_serial)?;
571            let mut device: nvmlDevice_t = mem::zeroed();
572
573            nvml_try(sym(c_string.as_ptr(), &mut device))?;
574
575            Ok(Device::new(device, self))
576        }
577    }
578
579    /**
580    Acquire the handle for a particular device based on its globally unique immutable
581    UUID.
582
583    Usage of this function causes NVML to initialize the target GPU. Additional
584    GPUs may be initialized as the function called within searches for the target GPU.
585
586    # Errors
587
588    * `Uninitialized`, if the library has not been successfully initialized
589    * `InvalidArg`, if `uuid` is invalid
590    * `NotFound`, if `uuid` does not match a valid device on the system
591    * `InsufficientPower`, if any attached devices have improperly attached external power cables
592    * `IrqIssue`, if the NVIDIA kernel detected an interrupt issue with the attached GPUs
593    * `GpuLost`, if the target GPU has fallen off the bus or is otherwise inaccessible
594    * `NulError`, for which you can read the docs on `std::ffi::NulError`
595    * `Unknown`, on any unexpected error
596
597    NVIDIA doesn't mention `NoPermission` for this one. Strange!
598    */
599    // Checked against local
600    // Tested
601    #[doc(alias = "nvmlDeviceGetHandleByUUID")]
602    pub fn device_by_uuid<S: AsRef<str>>(&self, uuid: S) -> Result<Device<'_>, NvmlError>
603    where
604        Vec<u8>: From<S>,
605    {
606        let sym = nvml_sym(self.lib.nvmlDeviceGetHandleByUUID.as_ref())?;
607
608        unsafe {
609            let c_string = CString::new(uuid)?;
610            let mut device: nvmlDevice_t = mem::zeroed();
611
612            nvml_try(sym(c_string.as_ptr(), &mut device))?;
613
614            Ok(Device::new(device, self))
615        }
616    }
617
618    /**
619    Gets the common ancestor for two devices.
620
621    Note: this is the same as `Device.topology_common_ancestor()`.
622
623    # Errors
624
625    * `InvalidArg`, if the device is invalid
626    * `NotSupported`, if this `Device` or the OS does not support this feature
627    * `UnexpectedVariant`, for which you can read the docs for
628    * `Unknown`, on any unexpected error
629
630    # Platform Support
631
632    Only supports Linux.
633    */
634    // Checked against local
635    // Tested
636    #[cfg(target_os = "linux")]
637    #[doc(alias = "nvmlDeviceGetTopologyCommonAncestor")]
638    pub fn topology_common_ancestor(
639        &self,
640        device1: &Device,
641        device2: &Device,
642    ) -> Result<TopologyLevel, NvmlError> {
643        let sym = nvml_sym(self.lib.nvmlDeviceGetTopologyCommonAncestor.as_ref())?;
644
645        unsafe {
646            let mut level: nvmlGpuTopologyLevel_t = mem::zeroed();
647
648            nvml_try(sym(device1.handle(), device2.handle(), &mut level))?;
649
650            TopologyLevel::try_from(level)
651        }
652    }
653
654    /**
655    Acquire the handle for a particular `Unit` based on its index.
656
657    Valid indices are derived from the count returned by `.unit_count()`.
658    For example, if `unit_count` is 2 the valid indices are 0 and 1, corresponding
659    to UNIT 0 and UNIT 1.
660
661    Note that the order in which NVML enumerates units has no guarantees of
662    consistency between reboots.
663
664    # Errors
665
666    * `Uninitialized`, if the library has not been successfully initialized
667    * `InvalidArg`, if `index` is invalid
668    * `Unknown`, on any unexpected error
669
670    # Device Support
671
672    For S-class products.
673    */
674    // Checked against local
675    // Tested (for an error)
676    #[doc(alias = "nvmlUnitGetHandleByIndex")]
677    pub fn unit_by_index(&self, index: u32) -> Result<Unit<'_>, NvmlError> {
678        let sym = nvml_sym(self.lib.nvmlUnitGetHandleByIndex.as_ref())?;
679
680        unsafe {
681            let mut unit: nvmlUnit_t = mem::zeroed();
682            nvml_try(sym(index as c_uint, &mut unit))?;
683
684            Ok(Unit::new(unit, self))
685        }
686    }
687
688    /**
689    Checks if the passed-in `Device`s are on the same physical board.
690
691    Note: this is the same as `Device.is_on_same_board_as()`.
692
693    # Errors
694
695    * `Uninitialized`, if the library has not been successfully initialized
696    * `InvalidArg`, if either `Device` is invalid
697    * `NotSupported`, if this check is not supported by this `Device`
698    * `GpuLost`, if this `Device` has fallen off the bus or is otherwise inaccessible
699    * `Unknown`, on any unexpected error
700    */
701    // Checked against local
702    // Tested
703    #[doc(alias = "nvmlDeviceOnSameBoard")]
704    pub fn are_devices_on_same_board(
705        &self,
706        device1: &Device,
707        device2: &Device,
708    ) -> Result<bool, NvmlError> {
709        let sym = nvml_sym(self.lib.nvmlDeviceOnSameBoard.as_ref())?;
710
711        unsafe {
712            let mut bool_int: c_int = mem::zeroed();
713
714            nvml_try(sym(device1.handle(), device2.handle(), &mut bool_int))?;
715
716            match bool_int {
717                0 => Ok(false),
718                _ => Ok(true),
719            }
720        }
721    }
722
723    /**
724    Gets the set of GPUs that have a CPU affinity with the given CPU number.
725
726    # Errors
727
728    * `InvalidArg`, if `cpu_number` is invalid
729    * `NotSupported`, if this `Device` or the OS does not support this feature
730    * `Unknown`, an error has occurred in the underlying topology discovery
731
732    # Platform Support
733
734    Only supports Linux.
735    */
736    // Tested
737    #[cfg(target_os = "linux")]
738    #[doc(alias = "nvmlSystemGetTopologyGpuSet")]
739    pub fn topology_gpu_set(&self, cpu_number: u32) -> Result<Vec<Device<'_>>, NvmlError> {
740        let sym = nvml_sym(self.lib.nvmlSystemGetTopologyGpuSet.as_ref())?;
741
742        unsafe {
743            let mut count = match self.topology_gpu_set_count(cpu_number)? {
744                0 => return Ok(vec![]),
745                value => value,
746            };
747            let mut devices: Vec<nvmlDevice_t> = vec![mem::zeroed(); count as usize];
748
749            nvml_try(sym(cpu_number, &mut count, devices.as_mut_ptr()))?;
750
751            Ok(devices.into_iter().map(|d| Device::new(d, self)).collect())
752        }
753    }
754
755    // Helper function for the above.
756    #[cfg(target_os = "linux")]
757    fn topology_gpu_set_count(&self, cpu_number: u32) -> Result<c_uint, NvmlError> {
758        let sym = nvml_sym(self.lib.nvmlSystemGetTopologyGpuSet.as_ref())?;
759
760        unsafe {
761            // Indicates that we want the count
762            let mut count: c_uint = 0;
763
764            // Passing null doesn't indicate that we want the count, just allowed
765            nvml_try(sym(cpu_number, &mut count, ptr::null_mut()))?;
766
767            Ok(count)
768        }
769    }
770
771    /**
772    Gets the IDs and firmware versions for any Host Interface Cards in the system.
773
774    # Errors
775
776    * `Uninitialized`, if the library has not been successfully initialized
777
778    # Device Support
779
780    Supports S-class products.
781    */
782    // Checked against local
783    // Tested
784    #[doc(alias = "nvmlSystemGetHicVersion")]
785    pub fn hic_versions(&self) -> Result<Vec<HwbcEntry>, NvmlError> {
786        let sym = nvml_sym(self.lib.nvmlSystemGetHicVersion.as_ref())?;
787
788        unsafe {
789            let mut count: c_uint = match self.hic_count()? {
790                0 => return Ok(vec![]),
791                value => value,
792            };
793            let mut hics: Vec<nvmlHwbcEntry_t> = vec![mem::zeroed(); count as usize];
794
795            nvml_try(sym(&mut count, hics.as_mut_ptr()))?;
796
797            hics.into_iter().map(HwbcEntry::try_from).collect()
798        }
799    }
800
801    /**
802    Gets the count of Host Interface Cards in the system.
803
804    # Errors
805
806    * `Uninitialized`, if the library has not been successfully initialized
807
808    # Device Support
809
810    Supports S-class products.
811    */
812    // Tested as part of the above method
813    #[doc(alias = "nvmlSystemGetHicVersion")]
814    pub fn hic_count(&self) -> Result<u32, NvmlError> {
815        let sym = nvml_sym(self.lib.nvmlSystemGetHicVersion.as_ref())?;
816
817        unsafe {
818            /*
819            NVIDIA doesn't even say that `count` will be set to the count if
820            `InsufficientSize` is returned. But we can assume sanity, right?
821
822            The idea here is:
823            If there are 0 HICs, NVML_SUCCESS is returned, `count` is set
824              to 0. We return count, all good.
825            If there is 1 HIC, NVML_SUCCESS is returned, `count` is set to
826              1. We return count, all good.
827            If there are >= 2 HICs, NVML_INSUFFICIENT_SIZE is returned.
828             `count` is theoretically set to the actual count, and we
829              return it.
830            */
831            let mut count: c_uint = 1;
832            let mut hics: [nvmlHwbcEntry_t; 1] = [mem::zeroed()];
833
834            match sym(&mut count, hics.as_mut_ptr()) {
835                nvmlReturn_enum_NVML_SUCCESS | nvmlReturn_enum_NVML_ERROR_INSUFFICIENT_SIZE => {
836                    Ok(count)
837                }
838                // We know that this will be an error
839                other => nvml_try(other).map(|_| 0),
840            }
841        }
842    }
843
844    /**
845    Gets the number of units in the system.
846
847    # Errors
848
849    * `Uninitialized`, if the library has not been successfully initialized
850    * `Unknown`, on any unexpected error
851
852    # Device Support
853
854    Supports S-class products.
855    */
856    // Checked against local
857    // Tested
858    #[doc(alias = "nvmlUnitGetCount")]
859    pub fn unit_count(&self) -> Result<u32, NvmlError> {
860        let sym = nvml_sym(self.lib.nvmlUnitGetCount.as_ref())?;
861
862        unsafe {
863            let mut count: c_uint = mem::zeroed();
864            nvml_try(sym(&mut count))?;
865
866            Ok(count)
867        }
868    }
869
870    /**
871    Create an empty set of events.
872
873    # Errors
874
875    * `Uninitialized`, if the library has not been successfully initialized
876    * `Unknown`, on any unexpected error
877
878    # Device Support
879
880    Supports Fermi and newer fully supported devices.
881    */
882    // Checked against local
883    // Tested
884    #[doc(alias = "nvmlEventSetCreate")]
885    pub fn create_event_set(&self) -> Result<EventSet<'_>, NvmlError> {
886        let sym = nvml_sym(self.lib.nvmlEventSetCreate.as_ref())?;
887
888        unsafe {
889            let mut set: nvmlEventSet_t = mem::zeroed();
890            nvml_try(sym(&mut set))?;
891
892            Ok(EventSet::new(set, self))
893        }
894    }
895
896    /**
897    Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI
898    subsystem in search of GPUs that were previously removed.
899
900    The portion of the PCI tree can be narrowed by specifying a domain, bus, and
901    device in the passed-in `pci_info`. **If all of these fields are zeroes, the
902    entire PCI tree will be searched.** Note that for long-running NVML processes,
903    the enumeration of devices will change based on how many GPUs are discovered
904    and where they are inserted in bus order.
905
906    All newly discovered GPUs will be initialized and have their ECC scrubbed which
907    may take several seconds per GPU. **All device handles are no longer guaranteed
908    to be valid post discovery**. I am not sure if this means **all** device
909    handles, literally, or if NVIDIA is referring to handles that had previously
910    been obtained to devices that were then removed and have now been
911    re-discovered.
912
913    Must be run as administrator.
914
915    # Errors
916
917    * `Uninitialized`, if the library has not been successfully initialized
918    * `OperatingSystem`, if the operating system is denying this feature
919    * `NoPermission`, if the calling process has insufficient permissions to
920      perform this operation
921    * `NulError`, if an issue is encountered when trying to convert a Rust
922      `String` into a `CString`.
923    * `Unknown`, on any unexpected error
924
925    # Device Support
926
927    Supports Pascal and newer fully supported devices.
928
929    Some Kepler devices are also supported (that's all NVIDIA says, no specifics).
930
931    # Platform Support
932
933    Only supports Linux.
934    */
935    // TODO: constructor for default pci_infos ^
936    // Checked against local
937    // Tested
938    #[cfg(target_os = "linux")]
939    #[doc(alias = "nvmlDeviceDiscoverGpus")]
940    pub fn discover_gpus(&self, pci_info: PciInfo) -> Result<(), NvmlError> {
941        let sym = nvml_sym(self.lib.nvmlDeviceDiscoverGpus.as_ref())?;
942
943        unsafe { nvml_try(sym(&mut pci_info.try_into()?)) }
944    }
945
946    /**
947    Gets the number of excluded GPU devices in the system.
948
949    # Device Support
950
951    Supports all devices.
952    */
953    #[doc(alias = "nvmlGetExcludedDeviceCount")]
954    pub fn excluded_device_count(&self) -> Result<u32, NvmlError> {
955        let sym = nvml_sym(self.lib.nvmlGetExcludedDeviceCount.as_ref())?;
956
957        unsafe {
958            let mut count: c_uint = mem::zeroed();
959
960            nvml_try(sym(&mut count))?;
961            Ok(count)
962        }
963    }
964
965    /**
966    Gets information for the specified excluded device.
967
968    # Errors
969
970    * `InvalidArg`, if the given index is invalid
971    * `Utf8Error`, if strings obtained from the C function are not valid Utf8
972
973    # Device Support
974
975    Supports all devices.
976    */
977    #[doc(alias = "nvmlGetExcludedDeviceInfoByIndex")]
978    pub fn excluded_device_info(&self, index: u32) -> Result<ExcludedDeviceInfo, NvmlError> {
979        let sym = nvml_sym(self.lib.nvmlGetExcludedDeviceInfoByIndex.as_ref())?;
980
981        unsafe {
982            let mut info: nvmlExcludedDeviceInfo_t = mem::zeroed();
983
984            nvml_try(sym(index, &mut info))?;
985            ExcludedDeviceInfo::try_from(info)
986        }
987    }
988
989    /**
990    Gets the loaded vGPU list of capabilities
991
992    # Errors
993
994    * `Uninitialized`, if the library has not been successfully initialized
995    * `Unknown`, on any unexpected error
996
997    # Device Support
998
999    Supports all devices.
1000    */
1001    #[doc(alias = "nvmlGetVgpuDriverCapabilities")]
1002    pub fn vgpu_driver_capabilities(
1003        &self,
1004        capability: nvmlVgpuDriverCapability_t,
1005    ) -> Result<u32, NvmlError> {
1006        let sym = nvml_sym(self.lib.nvmlGetVgpuDriverCapabilities.as_ref())?;
1007
1008        unsafe {
1009            let mut mask: u32 = mem::zeroed();
1010
1011            nvml_try(sym(capability, &mut mask))?;
1012            Ok(mask)
1013        }
1014    }
1015
1016    /**
1017    Get the supported and actual vGPU versions range.
1018
1019    # Errors
1020
1021    * `Uninitialized`, if the library has not been successfully initialized
1022    * `Unknown`, on any unexpected error
1023
1024    # Device Support
1025    */
1026    #[doc(alias = "nvmlGetVgpuVersion")]
1027    pub fn vgpu_version(&self) -> Result<(VgpuVersion, VgpuVersion), NvmlError> {
1028        let sym = nvml_sym(self.lib.nvmlGetVgpuVersion.as_ref())?;
1029
1030        unsafe {
1031            let mut supported: nvmlVgpuVersion_t = mem::zeroed();
1032            let mut current: nvmlVgpuVersion_t = mem::zeroed();
1033
1034            nvml_try(sym(&mut supported, &mut current))?;
1035            Ok((VgpuVersion::from(supported), VgpuVersion::from(current)))
1036        }
1037    }
1038
1039    #[doc(alias = "nvmlSetVgpuVersion")]
1040    pub fn set_vgpu_version(&self, version: VgpuVersion) -> Result<(), NvmlError> {
1041        let sym = nvml_sym(self.lib.nvmlSetVgpuVersion.as_ref())?;
1042
1043        unsafe { nvml_try(sym(&mut version.as_c())) }
1044    }
1045}
1046
1047/// This `Drop` implementation ignores errors! Use the `.shutdown()` method on
1048/// the `Nvml` struct
1049/// if you care about handling them.
1050impl Drop for Nvml {
1051    #[doc(alias = "nvmlShutdown")]
1052    fn drop(&mut self) {
1053        unsafe {
1054            self.lib.nvmlShutdown();
1055
1056            // SAFETY: called after the last usage of `self.lib`
1057            ManuallyDrop::drop(&mut self.lib);
1058        }
1059    }
1060}
1061
1062/**
1063A builder struct that provides further flexibility in how NVML is initialized.
1064
1065# Examples
1066
1067Initialize NVML with a non-default name for the shared object file:
1068
1069```
1070use nvml_wrapper::Nvml;
1071use std::ffi::OsStr;
1072
1073let init_result = Nvml::builder().lib_path(OsStr::new("libnvidia-ml-other-name.so")).init();
1074```
1075
1076Initialize NVML with a non-default path to the shared object file:
1077
1078```
1079use nvml_wrapper::Nvml;
1080use std::ffi::OsStr;
1081
1082let init_result = Nvml::builder().lib_path(OsStr::new("/some/path/to/libnvidia-ml.so")).init();
1083```
1084*/
1085#[derive(Debug, Clone, Eq, PartialEq, Default)]
1086pub struct NvmlBuilder<'a> {
1087    lib_path: Option<&'a OsStr>,
1088    flags: InitFlags,
1089}
1090
1091impl<'a> NvmlBuilder<'a> {
1092    /**
1093    Set the path to the NVML lib file.
1094
1095    See [`libloading`'s docs][libloading] for details about how this lib path is
1096    handled.
1097
1098    [libloading]: https://docs.rs/libloading/0.6.6/libloading/struct.Library.html#method.new
1099    */
1100    pub fn lib_path(&mut self, path: &'a OsStr) -> &mut Self {
1101        self.lib_path = Some(path);
1102        self
1103    }
1104
1105    /// Set the `InitFlags` to initialize NVML with.
1106    pub fn flags(&mut self, flags: InitFlags) -> &mut Self {
1107        self.flags = flags;
1108        self
1109    }
1110
1111    /// Perform initialization.
1112    pub fn init(&self) -> Result<Nvml, NvmlError> {
1113        let lib_path = self.lib_path.unwrap_or_else(|| LIB_PATH.as_ref());
1114
1115        if self.flags.is_empty() {
1116            Nvml::init_internal(lib_path)
1117        } else {
1118            Nvml::init_with_flags_internal(lib_path, self.flags)
1119        }
1120    }
1121}
1122
// Smoke tests for the `Nvml` methods. They talk to a real NVML installation
// through the helpers in `crate::test_utils`, so several are `#[ignore]`d or
// guarded for machines that lack the required hardware.
#[cfg(test)]
mod test {
    use super::*;
    use crate::bitmasks::InitFlags;
    use crate::error::NvmlError;
    use crate::test_utils::*;

    #[test]
    fn init_with_flags() {
        Nvml::init_with_flags(InitFlags::NO_GPUS).unwrap();
    }

    #[test]
    fn shutdown() {
        test(3, || nvml().shutdown())
    }

    #[test]
    fn device_count() {
        test(3, || nvml().device_count())
    }

    #[test]
    fn sys_driver_version() {
        test(3, || nvml().sys_driver_version())
    }

    #[test]
    fn sys_nvml_version() {
        test(3, || nvml().sys_nvml_version())
    }

    #[test]
    fn sys_cuda_driver_version() {
        test(3, || nvml().sys_cuda_driver_version())
    }

    #[test]
    fn sys_cuda_driver_version_major() {
        test(3, || {
            Ok(cuda_driver_version_major(nvml().sys_cuda_driver_version()?))
        })
    }

    #[test]
    fn sys_cuda_driver_version_minor() {
        test(3, || {
            Ok(cuda_driver_version_minor(nvml().sys_cuda_driver_version()?))
        })
    }

    #[test]
    fn sys_process_name() {
        let nvml = nvml();
        test_with_device(3, &nvml, |device| {
            let processes = device.running_graphics_processes()?;

            // A GPU with no running graphics processes yields an empty list;
            // indexing it directly would panic with an out-of-bounds error
            // instead of exercising `sys_process_name`.
            let pid = match processes.first() {
                Some(process) => process.pid,
                None => return Ok("No running graphics processes".into()),
            };

            match nvml.sys_process_name(pid, 64) {
                // Looking up another process' name may require elevated
                // permissions; that outcome is tolerated.
                Err(NvmlError::NoPermission) => Ok("No permission error".into()),
                v => v,
            }
        })
    }

    #[test]
    fn device_by_index() {
        let nvml = nvml();
        test(3, || nvml.device_by_index(0))
    }

    #[test]
    fn device_by_pci_bus_id() {
        let nvml = nvml();
        test_with_device(3, &nvml, |device| {
            let id = device.pci_info()?.bus_id;
            nvml.device_by_pci_bus_id(id)
        })
    }

    // Can't get serial on my machine
    #[ignore = "my machine does not support this call"]
    #[test]
    fn device_by_serial() {
        let nvml = nvml();

        #[allow(deprecated)]
        test_with_device(3, &nvml, |device| {
            let serial = device.serial()?;
            nvml.device_by_serial(serial)
        })
    }

    #[test]
    fn device_by_uuid() {
        let nvml = nvml();
        test_with_device(3, &nvml, |device| {
            let uuid = device.uuid()?;
            nvml.device_by_uuid(uuid)
        })
    }

    // I don't have 2 devices
    #[ignore = "my machine does not support this call"]
    #[cfg(target_os = "linux")]
    #[test]
    fn topology_common_ancestor() {
        let nvml = nvml();
        let device1 = device(&nvml);
        let device2 = nvml.device_by_index(1).expect("device");

        nvml.topology_common_ancestor(&device1, &device2)
            .expect("TopologyLevel");
    }

    // Errors on my machine

    #[test]
    #[ignore = "my machine does not support this call"]
    fn unit_by_index() {
        let nvml = nvml();
        test(3, || nvml.unit_by_index(0))
    }

    // I don't have 2 devices
    #[ignore = "my machine does not support this call"]
    #[test]
    fn are_devices_on_same_board() {
        let nvml = nvml();
        let device1 = device(&nvml);
        let device2 = nvml.device_by_index(1).expect("device");

        nvml.are_devices_on_same_board(&device1, &device2)
            .expect("bool");
    }

    #[cfg(target_os = "linux")]
    #[test]
    fn topology_gpu_set() {
        let nvml = nvml();
        test(3, || nvml.topology_gpu_set(0))
    }

    #[test]
    fn hic_version() {
        let nvml = nvml();
        test(3, || nvml.hic_versions())
    }

    #[test]
    fn unit_count() {
        test(3, || nvml().unit_count())
    }

    #[test]
    fn create_event_set() {
        let nvml = nvml();
        test(3, || nvml.create_event_set())
    }

    // NOTE(review): the expected panic message is "OperatingSystem", while a
    // `NoPermission` result panics with a different message and would therefore
    // fail this test. Confirm that this asymmetry is intended.
    #[cfg(target_os = "linux")]
    #[should_panic(expected = "OperatingSystem")]
    #[test]
    fn discover_gpus() {
        let nvml = nvml();
        test_with_device(3, &nvml, |device| {
            let pci_info = device.pci_info()?;

            // We don't test with admin perms and therefore expect an error
            match nvml.discover_gpus(pci_info) {
                Err(NvmlError::NoPermission) => panic!("NoPermission"),
                other => other,
            }
        })
    }

    #[test]
    fn excluded_device_count() {
        let nvml = nvml();
        test(3, || nvml.excluded_device_count())
    }

    #[test]
    fn excluded_device_info() {
        let nvml = nvml();

        // Index 0 is only queried when at least one excluded device exists.
        if nvml.excluded_device_count().unwrap() > 0 {
            test(3, || nvml.excluded_device_info(0))
        }
    }

    #[test]
    fn vgpu_driver_capabilities() {
        let nvml = nvml();
        test(3, || nvml.vgpu_driver_capabilities(1))
    }

    #[test]
    fn vgpu_version() {
        let nvml = nvml();
        test(3, || nvml.vgpu_version())
    }

    #[test]
    fn set_vgpu_version() {
        let nvml = nvml();
        test(3, || nvml.set_vgpu_version(VgpuVersion { min: 0, max: 0 }))
    }
}
nvml_wrapper/lib.rs

nvml_wrapper/
lib.rs