// nvml_wrapper/error.rs

1use crate::ffi::bindings::*;
2#[cfg(feature = "serde")]
3use serde_derive::{Deserialize, Serialize};
4use thiserror::Error;
5
/// Raw bit pattern that could not be interpreted as a set of bitflags.
///
/// Carried by `NvmlError::IncorrectBits` so the offending value can be reported;
/// the value is either 32 or 64 bits wide.
///
/// Both payloads are plain integers, so the type is `Copy` as well as `Clone`.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Bits {
    /// A 32-bit value.
    U32(u32),
    /// A 64-bit value.
    U64(u64),
}
12
13/// An `NvmlError` with an optionally present source error for chaining errors
14#[derive(Error, Debug)]
15#[error("{error}")]
16pub struct NvmlErrorWithSource {
17    pub error: NvmlError,
18    pub source: Option<NvmlError>,
19}
20
21impl From<NvmlError> for NvmlErrorWithSource {
22    fn from(error: NvmlError) -> Self {
23        Self {
24            error,
25            source: None,
26        }
27    }
28}
29
30#[derive(Error, Debug)]
31pub enum NvmlError {
32    #[error("could not interpret string as utf-8")]
33    Utf8Error(#[from] std::str::Utf8Error),
34    #[error("nul byte inside string")]
35    NulError(#[from] std::ffi::NulError),
36    #[error("a libloading error occurred: {0}")]
37    LibloadingError(#[from] libloading::Error),
38
39    /**
40    A function symbol failed to load.
41
42    This variant is constructed with a textual description of a
43    `libloading::Error`. The error variant itself can't be provided because we're
44    unable to take ownership of the error when attempting to use a symbol, and
45    `libloading::Error` doesn't impl `Clone`.
46    */
47    #[error("function symbol failed to load: {0}")]
48    FailedToLoadSymbol(String),
49
50    #[error("max string length was {max_len} but string length is {actual_len}")]
51    StringTooLong { max_len: usize, actual_len: usize },
52
53    #[error("invalid combination of bits ({0:?}) when trying to interpret as bitflags")]
54    IncorrectBits(Bits),
55
56    /**
57    An unexpected enum variant was encountered.
58
59    This error is specific to this Rust wrapper. It is used to represent the
60    possibility that an enum variant that is not defined within the Rust bindings
61    can be returned from a C call.
62
63    The single field contains the value that could not be mapped to a
64    defined enum variant.
65
66    See [this issue](https://github.com/rust-lang/rust/issues/36927).
67    */
68    #[error("unexpected enum variant value: {0}")]
69    UnexpectedVariant(u32),
70
71    #[error("a call to `EventSet.release_events()` failed")]
72    SetReleaseFailed,
73
74    #[error("a call to `Device.pci_info()` failed")]
75    GetPciInfoFailed,
76
77    #[error("a call to `PciInfo.try_into_c()` failed")]
78    PciInfoToCFailed,
79
80    #[error("NVML was not first initialized with `Nvml::init()`")]
81    Uninitialized,
82
83    #[error("a supplied argument was invalid")]
84    InvalidArg,
85
86    #[error("the requested operation is not available on the target device")]
87    NotSupported,
88
89    #[error("the current user does not have permission to perform this operation")]
90    NoPermission,
91
92    #[error("NVML was already initialized")]
93    #[deprecated = "deprecated in NVML (multiple initializations now allowed via refcounting)"]
94    AlreadyInitialized,
95
96    #[error("a query to find an object was unsuccessful")]
97    NotFound,
98
99    /**
100    An input argument is not large enough.
101
102    The single field is the size required for a successful call (if `Some`)
103    and `None` if unknown.
104    */
105    // TODO: verify that ^
106    #[error(
107        "an input argument is not large enough{}",
108        if let Some(size) = .0 {
109            format!(", needs to be at least {}", size)
110        } else {
111            "".into()
112        }
113    )]
114    InsufficientSize(Option<usize>),
115
116    #[error("device's external power cables are not properly attached")]
117    InsufficientPower,
118
119    #[error("NVIDIA driver is not loaded")]
120    DriverNotLoaded,
121
122    #[error("the provided timeout was reached")]
123    Timeout,
124
125    #[error("NVIDIA kernel detected an interrupt issue with a device")]
126    IrqIssue,
127
128    #[error("a shared library couldn't be found or loaded")]
129    LibraryNotFound,
130
131    #[error("a function couldn't be found in a shared library")]
132    FunctionNotFound,
133
134    #[error("the infoROM is corrupted")]
135    CorruptedInfoROM,
136
137    #[error("device fell off the bus or has otherwise become inacessible")]
138    GpuLost,
139
140    #[error("device requires a reset before it can be used again")]
141    ResetRequired,
142
143    #[error("device control has been blocked by the operating system/cgroups")]
144    OperatingSystem,
145
146    #[error("RM detects a driver/library version mismatch")]
147    LibRmVersionMismatch,
148
149    #[error("operation cannot be performed because the GPU is currently in use")]
150    InUse,
151
152    #[error("insufficient memory")]
153    InsufficientMemory,
154
155    #[error("no data")]
156    NoData,
157
158    #[error(
159        "the requested vgpu operation is not available on the target device because \
160        ECC is enabled"
161    )]
162    VgpuEccNotSupported,
163
164    #[error("an internal driver error occurred")]
165    Unknown,
166}
167
168/// Converts an `nvmlReturn_t` type into a `Result<(), NvmlError>`.
169pub fn nvml_try(code: nvmlReturn_t) -> Result<(), NvmlError> {
170    if code == nvmlReturn_enum_NVML_SUCCESS {
171        return Ok(());
172    }
173    Err(code.into())
174}
175
176/// Converts an `nvmlReturn_t` type into a `Result<(), NvmlError>`, allowing for the call to return the
177/// value `nvmlReturn_enum_NVML_ERROR_INSUFFICIENT_SIZE` which is a common return value when using an
178/// in/out parameter that provides the size of a buffer needed to complete that call
179pub fn nvml_try_count(code: nvmlReturn_t) -> Result<(), NvmlError> {
180    if code == nvmlReturn_enum_NVML_SUCCESS || code == nvmlReturn_enum_NVML_ERROR_INSUFFICIENT_SIZE
181    {
182        return Ok(());
183    }
184    Err(code.into())
185}
186
187#[allow(deprecated)]
188impl From<nvmlReturn_t> for NvmlError {
189    fn from(value: nvmlReturn_t) -> Self {
190        use NvmlError::*;
191        match value {
192            nvmlReturn_enum_NVML_ERROR_UNINITIALIZED => Uninitialized,
193            nvmlReturn_enum_NVML_ERROR_INVALID_ARGUMENT => InvalidArg,
194            nvmlReturn_enum_NVML_ERROR_NOT_SUPPORTED => NotSupported,
195            nvmlReturn_enum_NVML_ERROR_NO_PERMISSION => NoPermission,
196            nvmlReturn_enum_NVML_ERROR_ALREADY_INITIALIZED => AlreadyInitialized,
197            nvmlReturn_enum_NVML_ERROR_NOT_FOUND => NotFound,
198            nvmlReturn_enum_NVML_ERROR_INSUFFICIENT_SIZE => InsufficientSize(None),
199            nvmlReturn_enum_NVML_ERROR_INSUFFICIENT_POWER => InsufficientPower,
200            nvmlReturn_enum_NVML_ERROR_DRIVER_NOT_LOADED => DriverNotLoaded,
201            nvmlReturn_enum_NVML_ERROR_TIMEOUT => Timeout,
202            nvmlReturn_enum_NVML_ERROR_IRQ_ISSUE => IrqIssue,
203            nvmlReturn_enum_NVML_ERROR_LIBRARY_NOT_FOUND => LibraryNotFound,
204            nvmlReturn_enum_NVML_ERROR_FUNCTION_NOT_FOUND => FunctionNotFound,
205            nvmlReturn_enum_NVML_ERROR_CORRUPTED_INFOROM => CorruptedInfoROM,
206            nvmlReturn_enum_NVML_ERROR_GPU_IS_LOST => GpuLost,
207            nvmlReturn_enum_NVML_ERROR_RESET_REQUIRED => ResetRequired,
208            nvmlReturn_enum_NVML_ERROR_OPERATING_SYSTEM => OperatingSystem,
209            nvmlReturn_enum_NVML_ERROR_LIB_RM_VERSION_MISMATCH => LibRmVersionMismatch,
210            nvmlReturn_enum_NVML_ERROR_IN_USE => InUse,
211            nvmlReturn_enum_NVML_ERROR_MEMORY => InsufficientMemory,
212            nvmlReturn_enum_NVML_ERROR_NO_DATA => NoData,
213            nvmlReturn_enum_NVML_ERROR_VGPU_ECC_NOT_SUPPORTED => VgpuEccNotSupported,
214            nvmlReturn_enum_NVML_ERROR_UNKNOWN => Unknown,
215            _ => UnexpectedVariant(value),
216        }
217    }
218}
219
220#[allow(deprecated)]
221impl From<NvmlError> for nvmlReturn_t {
222    fn from(error: NvmlError) -> Self {
223        use NvmlError::*;
224
225        match error {
226            Uninitialized => nvmlReturn_enum_NVML_ERROR_UNINITIALIZED,
227            InvalidArg => nvmlReturn_enum_NVML_ERROR_INVALID_ARGUMENT,
228            NotSupported => nvmlReturn_enum_NVML_ERROR_NOT_SUPPORTED,
229            NoPermission => nvmlReturn_enum_NVML_ERROR_NO_PERMISSION,
230            AlreadyInitialized => nvmlReturn_enum_NVML_ERROR_ALREADY_INITIALIZED,
231            NotFound => nvmlReturn_enum_NVML_ERROR_NOT_FOUND,
232            InsufficientSize(_) => nvmlReturn_enum_NVML_ERROR_INSUFFICIENT_SIZE,
233            InsufficientPower => nvmlReturn_enum_NVML_ERROR_INSUFFICIENT_POWER,
234            DriverNotLoaded => nvmlReturn_enum_NVML_ERROR_DRIVER_NOT_LOADED,
235            Timeout => nvmlReturn_enum_NVML_ERROR_TIMEOUT,
236            IrqIssue => nvmlReturn_enum_NVML_ERROR_IRQ_ISSUE,
237            LibraryNotFound => nvmlReturn_enum_NVML_ERROR_LIBRARY_NOT_FOUND,
238            FunctionNotFound => nvmlReturn_enum_NVML_ERROR_FUNCTION_NOT_FOUND,
239            CorruptedInfoROM => nvmlReturn_enum_NVML_ERROR_CORRUPTED_INFOROM,
240            GpuLost => nvmlReturn_enum_NVML_ERROR_GPU_IS_LOST,
241            ResetRequired => nvmlReturn_enum_NVML_ERROR_RESET_REQUIRED,
242            OperatingSystem => nvmlReturn_enum_NVML_ERROR_OPERATING_SYSTEM,
243            LibRmVersionMismatch => nvmlReturn_enum_NVML_ERROR_LIB_RM_VERSION_MISMATCH,
244            InUse => nvmlReturn_enum_NVML_ERROR_IN_USE,
245            InsufficientMemory => nvmlReturn_enum_NVML_ERROR_MEMORY,
246            NoData => nvmlReturn_enum_NVML_ERROR_NO_DATA,
247            VgpuEccNotSupported => nvmlReturn_enum_NVML_ERROR_VGPU_ECC_NOT_SUPPORTED,
248            Unknown => nvmlReturn_enum_NVML_ERROR_UNKNOWN,
249            UnexpectedVariant(code) => code,
250            // For non-NVML errors, return UNKNOWN
251            Utf8Error(_)
252            | NulError(_)
253            | LibloadingError(_)
254            | FailedToLoadSymbol(_)
255            | StringTooLong { .. }
256            | IncorrectBits(_)
257            | SetReleaseFailed
258            | GetPciInfoFailed
259            | PciInfoToCFailed => nvmlReturn_enum_NVML_ERROR_UNKNOWN,
260        }
261    }
262}
263
264/// Helper to map a `&libloading::Error` into an `NvmlError`
265pub fn nvml_sym<'a, T>(sym: Result<&'a T, &libloading::Error>) -> Result<&'a T, NvmlError> {
266    sym.map_err(|e| NvmlError::FailedToLoadSymbol(e.to_string()))
267}