// trueno_gpu/error.rs
//! Error types for trueno-gpu operations
//!
//! Provides comprehensive error handling for PTX generation, CUDA driver operations,
//! and memory management.
//!
//! Design: Toyota Principle #7 (Visual Control) - Clear error messages with GPU state context
use thiserror::Error;
/// Result type alias for trueno-gpu operations.
///
/// All fallible APIs in this crate return this alias so callers only need
/// to handle the single [`GpuError`] enum.
pub type Result<T> = std::result::Result<T, GpuError>;
12
/// Errors that can occur during GPU operations.
///
/// Covers the full pipeline visible in this crate: PTX generation and
/// validation, CUDA driver/runtime interaction, memory management, kernel
/// launch, and CUDA graph execution. Display text comes from the
/// `thiserror` `#[error(...)]` attributes on each variant.
#[derive(Error, Debug)]
pub enum GpuError {
    /// PTX generation error (message describes the failed construct)
    #[error("PTX generation error: {0}")]
    PtxGeneration(String),

    /// I/O error (file operations); converted automatically via `From`
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Invalid parameter supplied by the caller
    #[error("Invalid parameter: {0}")]
    InvalidParameter(String),

    /// Invalid PTX version (this crate requires PTX ISA >= 7.0)
    #[error("Invalid PTX version: {major}.{minor} (requires >= 7.0)")]
    InvalidPtxVersion {
        /// Major version
        major: u32,
        /// Minor version
        minor: u32,
    },

    /// Invalid compute capability target (e.g. "sm_50"; requires sm_70+)
    #[error("Invalid compute capability: {0} (requires sm_70+)")]
    InvalidTarget(String),

    /// CUDA driver error: human-readable message plus raw driver error code
    #[error("CUDA driver error: {0} (code: {1})")]
    CudaDriver(String, i32),

    /// Memory allocation error on the device
    #[error("GPU memory allocation failed: {0}")]
    MemoryAllocation(String),

    /// Kernel launch error
    #[error("Kernel launch failed: {0}")]
    KernelLaunch(String),

    /// Invalid kernel launch configuration (grid/block dimensions, etc.)
    #[error("Invalid launch config: {0}")]
    InvalidLaunchConfig(String),

    /// Register allocation error during PTX generation
    #[error("Register allocation failed: {0}")]
    RegisterAllocation(String),

    /// Bank conflict detected in shared memory access (debugging aid)
    #[error("Bank conflict detected in shared memory access")]
    BankConflict,

    // =========================================================================
    // CUDA Runtime Errors (CRT-001 to CRT-006)
    // =========================================================================
    /// CUDA device initialization failed
    #[error("CUDA device initialization failed: {0}")]
    DeviceInit(String),

    /// CUDA device not found: requested ordinal and number of devices present
    #[error("CUDA device {0} not found (available: {1})")]
    DeviceNotFound(i32, usize),

    /// CUDA module/PTX loading failed
    #[error("CUDA module loading failed: {0}")]
    ModuleLoad(String),

    /// CUDA function (kernel entry point) not found in the loaded module
    #[error("CUDA function '{0}' not found in module")]
    FunctionNotFound(String),

    /// CUDA stream creation failed
    #[error("CUDA stream creation failed: {0}")]
    StreamCreate(String),

    /// CUDA stream synchronization failed
    #[error("CUDA stream synchronization failed: {0}")]
    StreamSync(String),

    /// CUDA memory transfer (H2D/D2H) failed
    #[error("CUDA memory transfer failed: {0}")]
    Transfer(String),

    /// Out of GPU memory; carries the requested and available byte counts
    #[error("Out of GPU memory: requested {requested} bytes, available {available} bytes")]
    OutOfMemory {
        /// Bytes requested
        requested: usize,
        /// Bytes available
        available: usize,
    },

    /// CUDA not available (no driver installed or no GPU present)
    #[error("CUDA not available: {0}")]
    CudaNotAvailable(String),

    /// Feature not supported on this device/platform
    #[error("Not supported: {0}")]
    NotSupported(String),

    // =========================================================================
    // CUDA Graph Errors (PAR-037)
    // =========================================================================
    /// CUDA graph creation failed
    #[error("CUDA graph creation failed: {0}")]
    GraphCreate(String),

    /// CUDA graph capture failed
    #[error("CUDA graph capture failed: {0}")]
    GraphCapture(String),

    /// CUDA graph instantiation failed
    #[error("CUDA graph instantiation failed: {0}")]
    GraphInstantiate(String),

    /// CUDA graph launch failed
    #[error("CUDA graph launch failed: {0}")]
    GraphLaunch(String),
}
132
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_ptx_generation_error() {
        let err = GpuError::PtxGeneration("invalid instruction".to_string());
        assert!(err.to_string().contains("PTX generation error"));
        assert!(err.to_string().contains("invalid instruction"));
    }

    #[test]
    fn test_invalid_ptx_version() {
        let err = GpuError::InvalidPtxVersion { major: 6, minor: 5 };
        assert!(err.to_string().contains("6.5"));
        assert!(err.to_string().contains("requires >= 7.0"));
    }

    #[test]
    fn test_invalid_target() {
        let err = GpuError::InvalidTarget("sm_50".to_string());
        assert!(err.to_string().contains("sm_50"));
        assert!(err.to_string().contains("requires sm_70+"));
    }

    #[test]
    fn test_cuda_driver_error() {
        let err = GpuError::CudaDriver("out of memory".to_string(), 2);
        assert!(err.to_string().contains("out of memory"));
        assert!(err.to_string().contains("code: 2"));
    }

    #[test]
    fn test_memory_allocation_error() {
        let err = GpuError::MemoryAllocation("insufficient device memory".to_string());
        assert!(err.to_string().contains("allocation failed"));
    }

    #[test]
    fn test_kernel_launch_error() {
        let err = GpuError::KernelLaunch("invalid grid dimensions".to_string());
        assert!(err.to_string().contains("launch failed"));
    }

    #[test]
    fn test_invalid_launch_config_error() {
        // Previously untested variant.
        let err = GpuError::InvalidLaunchConfig("block size exceeds 1024".to_string());
        assert!(err.to_string().contains("Invalid launch config"));
        assert!(err.to_string().contains("block size exceeds 1024"));
    }

    #[test]
    fn test_register_allocation_error() {
        // Previously untested variant.
        let err = GpuError::RegisterAllocation("out of .b32 registers".to_string());
        assert!(err.to_string().contains("Register allocation failed"));
    }

    #[test]
    fn test_bank_conflict_display() {
        // BankConflict is a unit variant with a fixed message.
        let err = GpuError::BankConflict;
        assert!(err.to_string().contains("Bank conflict"));
    }

    #[test]
    fn test_error_debug() {
        let err = GpuError::BankConflict;
        // Just verify Debug is implemented
        let _ = format!("{:?}", err);
    }

    #[test]
    fn test_error_display() {
        let err = GpuError::PtxGeneration("test".to_string());
        assert!(err.to_string().contains("test"));
    }

    #[test]
    fn test_io_error() {
        let io_err = std::io::Error::new(std::io::ErrorKind::NotFound, "file not found");
        let err: GpuError = io_err.into();
        assert!(err.to_string().contains("I/O error"));
    }

    #[test]
    fn test_invalid_parameter() {
        let err = GpuError::InvalidParameter("bad value".to_string());
        assert!(err.to_string().contains("Invalid parameter"));
        assert!(err.to_string().contains("bad value"));
    }

    // =========================================================================
    // CUDA Runtime Error Tests (CRT-001 to CRT-006)
    // =========================================================================

    #[test]
    fn test_device_init_error() {
        let err = GpuError::DeviceInit("no CUDA driver".to_string());
        assert!(err.to_string().contains("initialization failed"));
        assert!(err.to_string().contains("no CUDA driver"));
    }

    #[test]
    fn test_device_not_found_error() {
        let err = GpuError::DeviceNotFound(5, 2);
        assert!(err.to_string().contains("device 5"));
        assert!(err.to_string().contains("available: 2"));
    }

    #[test]
    fn test_module_load_error() {
        let err = GpuError::ModuleLoad("invalid PTX".to_string());
        assert!(err.to_string().contains("module loading failed"));
    }

    #[test]
    fn test_function_not_found_error() {
        let err = GpuError::FunctionNotFound("my_kernel".to_string());
        assert!(err.to_string().contains("my_kernel"));
        assert!(err.to_string().contains("not found"));
    }

    #[test]
    fn test_stream_create_error() {
        let err = GpuError::StreamCreate("resource exhausted".to_string());
        assert!(err.to_string().contains("stream creation"));
    }

    #[test]
    fn test_stream_sync_error() {
        let err = GpuError::StreamSync("timeout".to_string());
        assert!(err.to_string().contains("synchronization"));
    }

    #[test]
    fn test_transfer_error() {
        let err = GpuError::Transfer("DMA error".to_string());
        assert!(err.to_string().contains("transfer failed"));
    }

    #[test]
    fn test_out_of_memory_error() {
        let err = GpuError::OutOfMemory {
            requested: 1_000_000_000,
            available: 500_000_000,
        };
        assert!(err.to_string().contains("1000000000"));
        assert!(err.to_string().contains("500000000"));
    }

    #[test]
    fn test_cuda_not_available_error() {
        let err = GpuError::CudaNotAvailable("no GPU detected".to_string());
        assert!(err.to_string().contains("not available"));
    }

    #[test]
    fn test_not_supported_error() {
        let err = GpuError::NotSupported("CPU temperature monitoring".to_string());
        assert!(err.to_string().contains("Not supported"));
        assert!(err.to_string().contains("CPU temperature"));
    }

    // =========================================================================
    // CUDA Graph Error Tests (PAR-037) — previously untested variants
    // =========================================================================

    #[test]
    fn test_graph_create_error() {
        let err = GpuError::GraphCreate("invalid flags".to_string());
        assert!(err.to_string().contains("graph creation failed"));
        assert!(err.to_string().contains("invalid flags"));
    }

    #[test]
    fn test_graph_capture_error() {
        let err = GpuError::GraphCapture("stream already capturing".to_string());
        assert!(err.to_string().contains("graph capture failed"));
    }

    #[test]
    fn test_graph_instantiate_error() {
        let err = GpuError::GraphInstantiate("unsupported node type".to_string());
        assert!(err.to_string().contains("graph instantiation failed"));
    }

    #[test]
    fn test_graph_launch_error() {
        let err = GpuError::GraphLaunch("graph not instantiated".to_string());
        assert!(err.to_string().contains("graph launch failed"));
    }
}
275}