1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
//! GPU memory transfer operations
//!
//! Host-to-device, device-to-host, and device-to-device copy methods
//! for `GpuBuffer<T>`. Both synchronous and asynchronous variants.
use std::ffi::c_void;
use crate::driver::context::{get_driver, CudaContext};
use crate::driver::stream::CudaStream;
use crate::driver::sys::CudaDriver;
use crate::GpuError;
use super::buffer::GpuBuffer;
// ============================================================================
// Host <-> Device Transfers
// ============================================================================
impl<T: Copy> GpuBuffer<T> {
    /// Copy data from host to device (synchronous)
    ///
    /// # Arguments
    ///
    /// * `data` - Host data to copy (must have same length as buffer)
    ///
    /// # Errors
    ///
    /// Returns `Err(GpuError::Transfer)` if the lengths don't match or the
    /// copy fails. (All failure paths in this method use `Transfer`.)
    pub fn copy_from_host(&mut self, data: &[T]) -> Result<(), GpuError> {
        if data.len() != self.len {
            return Err(GpuError::Transfer(format!(
                "Length mismatch: host {} vs device {}",
                data.len(),
                self.len
            )));
        }
        // Zero-length transfer is a no-op; avoid calling the driver with size 0.
        if self.len == 0 {
            return Ok(());
        }
        let driver = get_driver()?;
        let size = self.size_bytes();
        // SAFETY: data is valid for size bytes, ptr is valid device pointer
        let result =
            unsafe { (driver.cuMemcpyHtoD)(self.ptr, data.as_ptr() as *const c_void, size) };
        CudaDriver::check(result).map_err(|e| GpuError::Transfer(e.to_string()))
    }

    /// Copy data from device to host (synchronous)
    ///
    /// # Arguments
    ///
    /// * `data` - Host buffer to copy into (must have same length as buffer)
    ///
    /// # Errors
    ///
    /// Returns `Err(GpuError::Transfer)` if the lengths don't match or the
    /// copy fails.
    pub fn copy_to_host(&self, data: &mut [T]) -> Result<(), GpuError> {
        if data.len() != self.len {
            return Err(GpuError::Transfer(format!(
                "Length mismatch: host {} vs device {}",
                data.len(),
                self.len
            )));
        }
        if self.len == 0 {
            return Ok(());
        }
        let driver = get_driver()?;
        let size = self.size_bytes();
        // SAFETY: data is valid for size bytes, ptr is valid device pointer
        let result =
            unsafe { (driver.cuMemcpyDtoH)(data.as_mut_ptr() as *mut c_void, self.ptr, size) };
        CudaDriver::check(result).map_err(|e| GpuError::Transfer(e.to_string()))
    }

    /// Copy data from host to device (asynchronous)
    ///
    /// # Arguments
    ///
    /// * `data` - Host data to copy (must have same length as buffer)
    /// * `stream` - Stream for async operation
    ///
    /// # Safety
    ///
    /// The host data must remain valid until the stream is synchronized.
    ///
    /// # Errors
    ///
    /// Returns `Err(GpuError::Transfer)` if the lengths don't match or the
    /// copy fails.
    pub unsafe fn copy_from_host_async(
        &mut self,
        data: &[T],
        stream: &CudaStream,
    ) -> Result<(), GpuError> {
        if data.len() != self.len {
            return Err(GpuError::Transfer(format!(
                "Length mismatch: host {} vs device {}",
                data.len(),
                self.len
            )));
        }
        if self.len == 0 {
            return Ok(());
        }
        let driver = get_driver()?;
        let size = self.size_bytes();
        // SAFETY: data is valid for size bytes, caller ensures data outlives stream ops
        let result = unsafe {
            (driver.cuMemcpyHtoDAsync)(self.ptr, data.as_ptr() as *const c_void, size, stream.raw())
        };
        CudaDriver::check(result).map_err(|e| GpuError::Transfer(e.to_string()))
    }

    /// Copy data from device to host (asynchronous)
    ///
    /// # Arguments
    ///
    /// * `data` - Host buffer to copy into
    /// * `stream` - Stream for async operation
    ///
    /// # Safety
    ///
    /// The host buffer must remain valid until the stream is synchronized.
    ///
    /// # Errors
    ///
    /// Returns `Err(GpuError::Transfer)` if the lengths don't match or the
    /// copy fails.
    pub unsafe fn copy_to_host_async(
        &self,
        data: &mut [T],
        stream: &CudaStream,
    ) -> Result<(), GpuError> {
        if data.len() != self.len {
            return Err(GpuError::Transfer(format!(
                "Length mismatch: host {} vs device {}",
                data.len(),
                self.len
            )));
        }
        if self.len == 0 {
            return Ok(());
        }
        let driver = get_driver()?;
        let size = self.size_bytes();
        // SAFETY: data is valid for size bytes, caller ensures data outlives stream ops
        let result = unsafe {
            (driver.cuMemcpyDtoHAsync)(
                data.as_mut_ptr() as *mut c_void,
                self.ptr,
                size,
                stream.raw(),
            )
        };
        CudaDriver::check(result).map_err(|e| GpuError::Transfer(e.to_string()))
    }

    /// Create buffer and initialize from host data
    ///
    /// Convenience method combining allocation and upload.
    ///
    /// # Arguments
    ///
    /// * `ctx` - CUDA context
    /// * `data` - Host data to upload
    ///
    /// # Errors
    ///
    /// Returns allocation or transfer errors.
    pub fn from_host(ctx: &CudaContext, data: &[T]) -> Result<Self, GpuError> {
        let mut buf = Self::new(ctx, data.len())?;
        buf.copy_from_host(data)?;
        Ok(buf)
    }

    /// Copy partial data from host to device at specific offset (PAR-018)
    ///
    /// # Arguments
    ///
    /// * `data` - Host data to copy
    /// * `offset` - Element offset in device buffer where copy begins
    ///
    /// # Errors
    ///
    /// Returns `Err(GpuError::Transfer)` if offset + data.len() exceeds buffer size
    /// (including the case where the sum overflows `usize`).
    pub fn copy_from_host_at(&mut self, data: &[T], offset: usize) -> Result<(), GpuError> {
        // checked_add: a plain `offset + data.len()` wraps in release builds,
        // letting a huge offset pass the bounds check and produce a wild
        // device pointer below. Overflow is treated as out-of-bounds.
        if offset
            .checked_add(data.len())
            .map_or(true, |end| end > self.len)
        {
            return Err(GpuError::Transfer(format!(
                "Partial copy out of bounds: offset {} + len {} > buffer {}",
                offset,
                data.len(),
                self.len
            )));
        }
        if data.is_empty() {
            return Ok(());
        }
        let driver = get_driver()?;
        let size = std::mem::size_of_val(data);
        // offset <= self.len here, so this byte offset stays within the allocation.
        let dst_ptr = self.ptr + (offset * std::mem::size_of::<T>()) as u64;
        // SAFETY: bounds checked above, data and ptr are valid
        let result =
            unsafe { (driver.cuMemcpyHtoD)(dst_ptr, data.as_ptr() as *const c_void, size) };
        CudaDriver::check(result).map_err(|e| GpuError::Transfer(e.to_string()))
    }

    /// Copy partial data from device to host at specific offset (PAR-018)
    ///
    /// # Arguments
    ///
    /// * `data` - Host buffer to copy into
    /// * `offset` - Element offset in device buffer where copy begins
    ///
    /// # Errors
    ///
    /// Returns `Err(GpuError::Transfer)` if offset + data.len() exceeds buffer size
    /// (including the case where the sum overflows `usize`).
    pub fn copy_to_host_at(&self, data: &mut [T], offset: usize) -> Result<(), GpuError> {
        // checked_add: see copy_from_host_at — overflow counts as out-of-bounds.
        if offset
            .checked_add(data.len())
            .map_or(true, |end| end > self.len)
        {
            return Err(GpuError::Transfer(format!(
                "Partial copy out of bounds: offset {} + len {} > buffer {}",
                offset,
                data.len(),
                self.len
            )));
        }
        if data.is_empty() {
            return Ok(());
        }
        let driver = get_driver()?;
        let size = std::mem::size_of_val(data);
        let src_ptr = self.ptr + (offset * std::mem::size_of::<T>()) as u64;
        // SAFETY: bounds checked above, data and ptr are valid
        let result =
            unsafe { (driver.cuMemcpyDtoH)(data.as_mut_ptr() as *mut c_void, src_ptr, size) };
        CudaDriver::check(result).map_err(|e| GpuError::Transfer(e.to_string()))
    }

    // =========================================================================
    // PAR-023: Device-to-Device Copy (Zero-Sync Pipeline)
    // =========================================================================

    /// Clone buffer to new GPU memory (device-to-device copy)
    ///
    /// Allocates new GPU memory and copies contents from self.
    ///
    /// # Arguments
    ///
    /// * `ctx` - CUDA context (must be current)
    ///
    /// # Errors
    ///
    /// Returns `Err(GpuError::MemoryAllocation)` if allocation fails.
    /// Returns `Err(GpuError::Transfer)` if copy fails.
    pub fn clone(&self, ctx: &CudaContext) -> Result<Self, GpuError> {
        let mut new_buffer = GpuBuffer::new(ctx, self.len)?;
        new_buffer.copy_from_buffer(self)?;
        Ok(new_buffer)
    }

    /// Copy data from another GPU buffer (device-to-device, synchronous)
    ///
    /// Enables zero-sync GPU pipelines by keeping data on device.
    ///
    /// # Arguments
    ///
    /// * `src` - Source GPU buffer (must have same length)
    ///
    /// # Errors
    ///
    /// Returns `Err(GpuError::Transfer)` if lengths don't match or copy fails.
    pub fn copy_from_buffer(&mut self, src: &GpuBuffer<T>) -> Result<(), GpuError> {
        if src.len != self.len {
            return Err(GpuError::Transfer(format!(
                "PAR-023: D2D length mismatch: src {} vs dst {}",
                src.len, self.len
            )));
        }
        if self.len == 0 {
            return Ok(());
        }
        let driver = get_driver()?;
        let size = self.size_bytes();
        // SAFETY: both buffers are valid, size is correct. `&mut self` + `&src`
        // guarantees the two buffers are distinct, so the regions cannot alias.
        let result = unsafe { (driver.cuMemcpyDtoD)(self.ptr, src.ptr, size) };
        CudaDriver::check(result).map_err(|e| GpuError::Transfer(e.to_string()))
    }

    /// Copy partial data from another GPU buffer at specific offset (PAR-023)
    ///
    /// Enables GPU-resident KV cache updates without host round-trip.
    ///
    /// # Arguments
    ///
    /// * `src` - Source GPU buffer
    /// * `dst_offset` - Element offset in destination (this buffer)
    /// * `src_offset` - Element offset in source buffer
    /// * `count` - Number of elements to copy
    ///
    /// # Errors
    ///
    /// Returns `Err(GpuError::Transfer)` if copy would exceed buffer bounds
    /// (including the case where `offset + count` overflows `usize`).
    pub fn copy_from_buffer_at(
        &mut self,
        src: &GpuBuffer<T>,
        dst_offset: usize,
        src_offset: usize,
        count: usize,
    ) -> Result<(), GpuError> {
        // checked_add: wrap-around on `offset + count` in release builds would
        // bypass these checks and yield out-of-bounds device pointers (UB).
        if dst_offset
            .checked_add(count)
            .map_or(true, |end| end > self.len)
        {
            return Err(GpuError::Transfer(format!(
                "PAR-023: D2D dst out of bounds: {} + {} > {}",
                dst_offset, count, self.len
            )));
        }
        if src_offset
            .checked_add(count)
            .map_or(true, |end| end > src.len)
        {
            return Err(GpuError::Transfer(format!(
                "PAR-023: D2D src out of bounds: {} + {} > {}",
                src_offset, count, src.len
            )));
        }
        if count == 0 {
            return Ok(());
        }
        let driver = get_driver()?;
        let size = count * std::mem::size_of::<T>();
        let dst_ptr = self.ptr + (dst_offset * std::mem::size_of::<T>()) as u64;
        let src_ptr = src.ptr + (src_offset * std::mem::size_of::<T>()) as u64;
        // SAFETY: bounds checked above, both ptrs are valid
        let result = unsafe { (driver.cuMemcpyDtoD)(dst_ptr, src_ptr, size) };
        CudaDriver::check(result).map_err(|e| GpuError::Transfer(e.to_string()))
    }

    /// Async copy from another GPU buffer (PAR-023)
    ///
    /// # Safety
    ///
    /// Both buffers must remain valid until stream is synchronized.
    ///
    /// # Errors
    ///
    /// Returns `Err(GpuError::Transfer)` if lengths don't match or copy fails.
    pub unsafe fn copy_from_buffer_async(
        &mut self,
        src: &GpuBuffer<T>,
        stream: &CudaStream,
    ) -> Result<(), GpuError> {
        if src.len != self.len {
            return Err(GpuError::Transfer(format!(
                "PAR-023: Async D2D length mismatch: src {} vs dst {}",
                src.len, self.len
            )));
        }
        if self.len == 0 {
            return Ok(());
        }
        let driver = get_driver()?;
        let size = self.size_bytes();
        // SAFETY: both buffers valid, caller ensures lifetime
        let result = unsafe { (driver.cuMemcpyDtoDAsync)(self.ptr, src.ptr, size, stream.raw()) };
        CudaDriver::check(result).map_err(|e| GpuError::Transfer(e.to_string()))
    }

    /// PAR-023: Async D2D copy with offsets
    ///
    /// Copies a region from source buffer to destination buffer asynchronously.
    /// Does not synchronize - caller must ensure stream sync before accessing data.
    ///
    /// # Arguments
    ///
    /// * `src` - Source GPU buffer
    /// * `dst_offset` - Element offset in destination (this buffer)
    /// * `src_offset` - Element offset in source buffer
    /// * `count` - Number of elements to copy
    /// * `stream` - CUDA stream for async operation
    ///
    /// # Safety
    ///
    /// Both buffers must remain valid until stream is synchronized.
    ///
    /// # Errors
    ///
    /// Returns `Err(GpuError::Transfer)` if copy would exceed buffer bounds
    /// (including the case where `offset + count` overflows `usize`).
    pub unsafe fn copy_from_buffer_at_async(
        &mut self,
        src: &GpuBuffer<T>,
        dst_offset: usize,
        src_offset: usize,
        count: usize,
        stream: &CudaStream,
    ) -> Result<(), GpuError> {
        // checked_add: overflow is treated as out-of-bounds (see copy_from_buffer_at).
        if dst_offset
            .checked_add(count)
            .map_or(true, |end| end > self.len)
        {
            return Err(GpuError::Transfer(format!(
                "PAR-023: Async D2D dst out of bounds: {} + {} > {}",
                dst_offset, count, self.len
            )));
        }
        if src_offset
            .checked_add(count)
            .map_or(true, |end| end > src.len)
        {
            return Err(GpuError::Transfer(format!(
                "PAR-023: Async D2D src out of bounds: {} + {} > {}",
                src_offset, count, src.len
            )));
        }
        if count == 0 {
            return Ok(());
        }
        let driver = get_driver()?;
        let size = count * std::mem::size_of::<T>();
        let dst_ptr = self.ptr + (dst_offset * std::mem::size_of::<T>()) as u64;
        let src_ptr = src.ptr + (src_offset * std::mem::size_of::<T>()) as u64;
        // SAFETY: bounds checked above, both ptrs are valid, caller ensures lifetime
        let result = unsafe { (driver.cuMemcpyDtoDAsync)(dst_ptr, src_ptr, size, stream.raw()) };
        CudaDriver::check(result).map_err(|e| GpuError::Transfer(e.to_string()))
    }

    /// PAR-023: Async D2D copy with raw stream handle
    ///
    /// Same as `copy_from_buffer_at_async` but takes raw CUstream handle.
    /// Useful when borrow checker prevents passing &CudaStream due to other borrows.
    ///
    /// # Safety
    ///
    /// - Both buffers must remain valid until stream is synchronized.
    /// - Stream handle must be valid.
    ///
    /// # Errors
    ///
    /// Returns `Err(GpuError::Transfer)` if copy would exceed buffer bounds
    /// (including the case where `offset + count` overflows `usize`).
    pub unsafe fn copy_from_buffer_at_async_raw(
        &mut self,
        src: &GpuBuffer<T>,
        dst_offset: usize,
        src_offset: usize,
        count: usize,
        stream_handle: crate::driver::sys::CUstream,
    ) -> Result<(), GpuError> {
        // checked_add: overflow is treated as out-of-bounds (see copy_from_buffer_at).
        if dst_offset
            .checked_add(count)
            .map_or(true, |end| end > self.len)
        {
            return Err(GpuError::Transfer(format!(
                "PAR-023: Async D2D dst out of bounds: {} + {} > {}",
                dst_offset, count, self.len
            )));
        }
        if src_offset
            .checked_add(count)
            .map_or(true, |end| end > src.len)
        {
            return Err(GpuError::Transfer(format!(
                "PAR-023: Async D2D src out of bounds: {} + {} > {}",
                src_offset, count, src.len
            )));
        }
        if count == 0 {
            return Ok(());
        }
        let driver = get_driver()?;
        let size = count * std::mem::size_of::<T>();
        let dst_ptr = self.ptr + (dst_offset * std::mem::size_of::<T>()) as u64;
        let src_ptr = src.ptr + (src_offset * std::mem::size_of::<T>()) as u64;
        // SAFETY: bounds checked above, both ptrs valid, caller ensures lifetime + stream valid
        let result = unsafe { (driver.cuMemcpyDtoDAsync)(dst_ptr, src_ptr, size, stream_handle) };
        CudaDriver::check(result).map_err(|e| GpuError::Transfer(e.to_string()))
    }
}