//! GPU Memory Management with Unified Memory support
//!
//! Provides both traditional device-memory handling (explicit transfers) and
//! a CUDA Unified Memory mode for automatic data migration between CPU and
//! GPU. The unified mode is currently emulated with host memory and explicit
//! transfers (see [`UnifiedBuffer::new`]).
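//!
//! A minimal usage sketch (illustrative only, hence `ignore`: it assumes the
//! `gpu` feature is enabled and a CUDA device at ordinal 0):
//!
//! ```ignore
//! use cudarc::driver::CudaContext;
//!
//! let device = CudaContext::new(0)?; // cudarc returns an Arc<CudaContext>
//! let manager = GpuMemoryManager::new(device, MemoryStrategy::Unified);
//! let mut buffer = manager.alloc_from(vec![1.0f32, 2.0, 3.0])?;
//! buffer.prefetch_to_gpu()?;          // stage data ahead of kernel launches
//! let gpu_data = buffer.to_device()?; // CudaSlice<f32> for kernel arguments
//! ```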
use std::sync::Arc;
#[cfg(feature = "gpu")]
use cudarc::driver::{CudaContext, CudaSlice, CudaStream};
use crate::error::{DbxError, DbxResult};
/// Unified memory buffer, intended to migrate automatically between CPU and
/// GPU. Currently emulated with host-resident data and explicit transfers
/// (see [`UnifiedBuffer::new`]).
#[cfg(feature = "gpu")]
pub struct UnifiedBuffer<T: Clone> {
/// Device context
device: Arc<CudaContext>,
    /// Host-side data (stands in for CUDA managed memory)
data: Vec<T>,
/// Size in elements
size: usize,
/// Whether data has been prefetched to GPU
prefetched: bool,
}
#[cfg(feature = "gpu")]
impl<T: Clone + cudarc::driver::DeviceRepr> UnifiedBuffer<T> {
/// Create a new unified buffer
pub fn new(device: Arc<CudaContext>, size: usize) -> DbxResult<Self> {
// cudarc 0.19.2 does not expose cudaMallocManaged.
// Host memory is used as a portable fallback with explicit htod transfers.
// When cudarc adds Unified Memory support, replace with managed allocation.
        // SAFETY: assumes `T` is plain-old-data (as is typical for
        // `DeviceRepr` types), so the all-zeros bit pattern is a valid value.
        let data = vec![unsafe { std::mem::zeroed() }; size];
Ok(Self {
device,
data,
size,
prefetched: false,
})
}
/// Create from existing data
pub fn from_vec(device: Arc<CudaContext>, data: Vec<T>) -> DbxResult<Self> {
let size = data.len();
Ok(Self {
device,
data,
size,
prefetched: false,
})
}
/// Prefetch data to GPU asynchronously
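    ///
    /// Subsequent calls are no-ops until the buffer is mutated through
    /// `as_mut_slice`. A sketch of the intended call pattern (illustrative
    /// only; assumes a live `buffer`):
    ///
    /// ```ignore
    /// buffer.prefetch_to_gpu()?;    // first call uploads
    /// buffer.prefetch_to_gpu()?;    // flag is set: nothing happens
    /// buffer.as_mut_slice()[0] = 7; // mutation clears the flag
    /// buffer.prefetch_to_gpu()?;    // uploads again
    /// ```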
pub fn prefetch_to_gpu(&mut self) -> DbxResult<()> {
        // cudarc 0.19.2 does not expose cudaMemPrefetchAsync.
        // Simulated via an explicit htod upload on the first prefetch call;
        // the resulting device slice is dropped immediately, so this
        // exercises the transfer path rather than keeping data resident
        // (use `to_device` for a persistent device copy).
        // Replace with a native prefetch when cudarc adds UVM support.
if !self.prefetched {
// Upload data to GPU (simulating prefetch)
let stream = self.device.default_stream();
let _gpu_slice = stream
.clone_htod(&self.data)
.map_err(|e| DbxError::Gpu(format!("Prefetch failed: {:?}", e)))?;
self.prefetched = true;
}
Ok(())
}
/// Prefetch data to CPU asynchronously
pub fn prefetch_to_cpu(&mut self) -> DbxResult<()> {
        // With true Unified Memory this would hint the driver to migrate
        // pages back to the CPU. Since the data already lives in host memory,
        // only the prefetch flag is reset so the next `prefetch_to_gpu`
        // re-uploads.
self.prefetched = false;
Ok(())
}
/// Get data as slice (CPU-accessible)
pub fn as_slice(&self) -> &[T] {
&self.data
}
/// Get mutable data as slice (CPU-accessible)
pub fn as_mut_slice(&mut self) -> &mut [T] {
self.prefetched = false; // Mark as potentially modified
&mut self.data
}
/// Upload to GPU and get CudaSlice
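    ///
    /// Each call allocates and fills a fresh device copy; later host edits do
    /// not affect slices returned earlier. A sketch (illustrative only):
    ///
    /// ```ignore
    /// let gpu_data = buffer.to_device()?;  // independent device copy
    /// buffer.as_mut_slice().fill(0);       // does not touch `gpu_data`
    /// let gpu_data2 = buffer.to_device()?; // re-upload to pick up edits
    /// ```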
pub fn to_device(&self) -> DbxResult<CudaSlice<T>> {
let stream = self.device.default_stream();
stream
.clone_htod(&self.data)
.map_err(|e| DbxError::Gpu(format!("Upload failed: {:?}", e)))
}
/// Get size in elements
pub fn len(&self) -> usize {
self.size
}
/// Check if empty
pub fn is_empty(&self) -> bool {
self.size == 0
}
}
/// Memory allocation strategy
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemoryStrategy {
/// Traditional GPU memory (explicit transfers)
DeviceOnly,
/// Unified Memory (automatic migration)
Unified,
/// Pinned host memory (faster transfers)
Pinned,
}
/// Memory manager for GPU operations
#[cfg(feature = "gpu")]
pub struct GpuMemoryManager {
/// Device context
device: Arc<CudaContext>,
/// Default memory strategy
strategy: MemoryStrategy,
}
#[cfg(feature = "gpu")]
impl GpuMemoryManager {
/// Create a new memory manager
pub fn new(device: Arc<CudaContext>, strategy: MemoryStrategy) -> Self {
Self { device, strategy }
}
/// Allocate buffer with default strategy
pub fn alloc<T: Clone + cudarc::driver::DeviceRepr>(
&self,
size: usize,
) -> DbxResult<UnifiedBuffer<T>> {
match self.strategy {
MemoryStrategy::Unified => UnifiedBuffer::new(self.device.clone(), size),
MemoryStrategy::DeviceOnly => {
                // For device-only, the same host-backed buffer is used;
                // callers move data explicitly via `to_device`.
UnifiedBuffer::new(self.device.clone(), size)
}
MemoryStrategy::Pinned => {
                // Pinned memory would use cudaMallocHost (page-locked host
                // memory) for faster transfers; for now, fall back to a
                // pageable host allocation.
UnifiedBuffer::new(self.device.clone(), size)
}
}
}
/// Allocate buffer from existing data
pub fn alloc_from<T: Clone + cudarc::driver::DeviceRepr>(
&self,
data: Vec<T>,
) -> DbxResult<UnifiedBuffer<T>> {
UnifiedBuffer::from_vec(self.device.clone(), data)
}
/// Get current strategy
pub fn strategy(&self) -> MemoryStrategy {
self.strategy
}
/// Set memory strategy
pub fn set_strategy(&mut self, strategy: MemoryStrategy) {
self.strategy = strategy;
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
#[cfg(feature = "gpu")]
fn test_unified_buffer_creation() {
        // This test requires a CUDA runtime; skip when no device is available.
        // Note: `CudaContext::new` already returns an `Arc<CudaContext>`, so
        // no extra `Arc::new` wrapping is needed.
        if let Ok(device) = CudaContext::new(0) {
let buffer: UnifiedBuffer<i32> = UnifiedBuffer::new(device, 1000).unwrap();
assert_eq!(buffer.len(), 1000);
assert!(!buffer.is_empty());
}
}
#[test]
#[cfg(feature = "gpu")]
fn test_unified_buffer_from_vec() {
        if let Ok(device) = CudaContext::new(0) {
let data = vec![1, 2, 3, 4, 5];
let buffer = UnifiedBuffer::from_vec(device, data.clone()).unwrap();
assert_eq!(buffer.as_slice(), &data[..]);
}
}
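    // Added check (a sketch; assumes a CUDA device at ordinal 0, skipped
    // otherwise): mutable access should clear the prefetch flag so the next
    // `prefetch_to_gpu` re-uploads. The private `prefetched` field is
    // reachable here because `tests` is a child module.
    #[test]
    #[cfg(feature = "gpu")]
    fn test_mut_access_resets_prefetch_flag() {
        if let Ok(device) = CudaContext::new(0) {
            let mut buffer = UnifiedBuffer::from_vec(device, vec![1i32, 2, 3]).unwrap();
            buffer.prefetch_to_gpu().unwrap();
            assert!(buffer.prefetched);
            buffer.as_mut_slice()[0] = 42;
            assert!(!buffer.prefetched);
        }
    }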
}