// tritonserver_rs/memory.rs
//! Module responsible for memory allocation and assignments.
//!
//! **NOTE**: some functions that use CUDA must be run in a synchronous context, and
//! [crate::context::Context] must be pushed as current. \
//! To simplify satisfying this requirement, those methods can be run with the [crate::run_in_context] or [crate::run_in_context_sync] macro.
//! ```
//! run_in_context!(0, Buffer::alloc::<f32>(10, MemoryType::Gpu))
//! ```
use core::slice;
use std::{
ffi::CStr,
fmt::Debug,
intrinsics::copy_nonoverlapping,
mem::{size_of_val, transmute},
ops::{Bound, RangeBounds},
};
#[cfg(feature = "gpu")]
use cuda_driver_sys::{
cuMemAllocHost_v2, cuMemAlloc_v2, cuMemFreeHost, cuMemFree_v2, cuMemcpyDtoD_v2,
cuMemcpyDtoH_v2, cuMemcpyHtoD_v2, CUdeviceptr,
};
use libc::{c_void, calloc, free};
use crate::{
error::{Error, ErrorCode, CSTR_CONVERT_ERROR_PLUG},
sys, to_cstring,
};
/// Registers `$sample` as a buffer element type.
///
/// Expands to the `private::Sealed` marker impl plus a [`Sample`] impl whose
/// `DATA_TYPE` is `$tag`.
macro_rules! impl_sample {
    ($sample:ty, $tag:expr) => {
        impl private::Sealed for $sample {}

        impl Sample for $sample {
            const DATA_TYPE: DataType = $tag;
        }
    };
}
/// Private home of the sealing marker: the module is not `pub`, so code outside the
/// parent module cannot name `Sealed` and therefore cannot implement traits that require it.
mod private {
    /// Marker supertrait; every implementor must be `Copy` (and `Clone`).
    pub trait Sealed: Copy + Clone {}
}
/// Trait of element types that can be stored in a [`Buffer`].
///
/// It requires the `Sealed` supertrait from the private `private` module, so the set of
/// implementors is fixed by this module.
pub trait Sample: private::Sealed {
/// Triton [`DataType`] tag associated with the implementing Rust type.
const DATA_TYPE: DataType;
}
/// Tensor data types recognized by TRITONSERVER.
///
/// Every discriminant is the matching `TRITONSERVER_TYPE_*` constant from the `sys` bindings,
/// which is why a variant is handed to the C API as `self as u32`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
#[repr(u32)]
pub enum DataType {
/// Boolean (paired with `bool`).
Bool = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_BOOL,
/// Unsigned 8-bit integer (paired with `u8`).
Uint8 = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_UINT8,
/// Unsigned 16-bit integer (paired with `u16`).
Uint16 = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_UINT16,
/// Unsigned 32-bit integer (paired with `u32`).
Uint32 = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_UINT32,
/// Unsigned 64-bit integer (paired with `u64`).
Uint64 = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_UINT64,
/// Signed 8-bit integer (paired with `i8`).
Int8 = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_INT8,
/// Signed 16-bit integer (paired with `i16`).
Int16 = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_INT16,
/// Signed 32-bit integer (paired with `i32`).
Int32 = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_INT32,
/// Signed 64-bit integer (paired with `i64`).
Int64 = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_INT64,
/// 16-bit floating point (paired with `half::f16`).
Fp16 = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_FP16,
/// 32-bit floating point (paired with `f32`).
Fp32 = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_FP32,
/// 64-bit floating point (paired with `f64`).
Fp64 = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_FP64,
/// Variable-size byte data (paired with [`Byte`]); `size()` is documented to return zero for it.
Bytes = sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_BYTES,
}
/// One raw byte of a [`DataType::Bytes`] buffer.
///
/// A newtype over `u8` so that `u8` can stay mapped to [`DataType::Uint8`] while raw byte
/// data gets its own [`Sample`] implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Byte(pub u8);
// `Sample` registrations: each Rust element type is tied to its Triton data type tag.

// Boolean and raw bytes.
impl_sample!(bool, DataType::Bool);
impl_sample!(Byte, DataType::Bytes);

// Unsigned integers.
impl_sample!(u8, DataType::Uint8);
impl_sample!(u16, DataType::Uint16);
impl_sample!(u32, DataType::Uint32);
impl_sample!(u64, DataType::Uint64);

// Signed integers.
impl_sample!(i8, DataType::Int8);
impl_sample!(i16, DataType::Int16);
impl_sample!(i32, DataType::Int32);
impl_sample!(i64, DataType::Int64);

// Floating point.
impl_sample!(half::f16, DataType::Fp16);
impl_sample!(f32, DataType::Fp32);
impl_sample!(f64, DataType::Fp64);
impl DataType {
    /// Returns the name Triton uses for this data type.
    ///
    /// The name comes from `TRITONSERVER_DataTypeString`; if it is not valid UTF-8,
    /// [`CSTR_CONVERT_ERROR_PLUG`] is returned instead.
    pub fn as_str(self) -> &'static str {
        let raw = self as u32;
        // SAFETY (assumed, confirm against the Triton C API): the returned pointer refers to a
        // NUL-terminated string that stays valid for the rest of the program.
        let c_name = unsafe { CStr::from_ptr(sys::TRITONSERVER_DataTypeString(raw)) };
        match c_name.to_str() {
            Ok(name) => name,
            Err(_) => CSTR_CONVERT_ERROR_PLUG,
        }
    }

    /// Returns the size of one element of this data type in bytes, as reported by
    /// `TRITONSERVER_DataTypeByteSize`.
    ///
    /// Zero is returned for [DataType::Bytes] because it has variable size.
    pub fn size(self) -> u32 {
        let raw = self as u32;
        unsafe { sys::TRITONSERVER_DataTypeByteSize(raw) }
    }
}
impl TryFrom<&str> for DataType {
type Error = Error;
/// Get the Triton datatype corresponding to a string representation of a datatype.
fn try_from(name: &str) -> Result<Self, Self::Error> {
let name = to_cstring(name)?;
let data_type = unsafe { sys::TRITONSERVER_StringToDataType(name.as_ptr()) };
if data_type != sys::TRITONSERVER_datatype_enum_TRITONSERVER_TYPE_INVALID {
Ok(unsafe { transmute::<u32, crate::memory::DataType>(data_type) })
} else {
Err(Error::new(ErrorCode::InvalidArg, ""))
}
}
}
/// Types of memory recognized by TRITONSERVER.
///
/// Every discriminant is the matching `TRITONSERVER_MEMORY_*` constant from the `sys` bindings.
#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[repr(u32)]
pub enum MemoryType {
/// Host memory; buffers of this type are allocated with `calloc` in this module.
Cpu = sys::TRITONSERVER_memorytype_enum_TRITONSERVER_MEMORY_CPU,
/// Pinned host memory; allocated with `cuMemAllocHost_v2` in this module (needs the `gpu` feature).
Pinned = sys::TRITONSERVER_memorytype_enum_TRITONSERVER_MEMORY_CPU_PINNED,
/// Device memory; allocated with `cuMemAlloc_v2` in this module (needs the `gpu` feature).
Gpu = sys::TRITONSERVER_memorytype_enum_TRITONSERVER_MEMORY_GPU,
}
impl MemoryType {
    /// Returns the name Triton uses for this memory type.
    ///
    /// The name comes from `TRITONSERVER_MemoryTypeString`; if it is not valid UTF-8,
    /// [`CSTR_CONVERT_ERROR_PLUG`] is returned instead.
    pub fn as_str(self) -> &'static str {
        let raw = self as u32;
        // SAFETY (assumed, confirm against the Triton C API): the returned pointer refers to a
        // NUL-terminated string that stays valid for the rest of the program.
        let c_name = unsafe { CStr::from_ptr(sys::TRITONSERVER_MemoryTypeString(raw)) };
        match c_name.to_str() {
            Ok(name) => name,
            Err(_) => CSTR_CONVERT_ERROR_PLUG,
        }
    }
}
/// Representation of a GPU-based CUDA array.
///
/// This is a plain pointer/length pair: it has no `Drop` implementation, so it does not
/// delete the array on drop.
#[cfg(feature = "gpu")]
pub struct CudaArray {
/// CUDA device pointer to the start of the array.
pub ptr: CUdeviceptr,
/// Length of the array in bytes (it is used as the byte count of the `cuMemcpy*` calls in this module).
pub len: usize,
}
/// Data storing buffer.
///
/// Deletes the data on drop when `owned` is set.
// # Safety
// As long as no one changes the `owned` field
// and there are no overlapping buffers, buffers are thread safe. \
// Since no one can change the state of a Buffer through a shared reference
// and `ptr` points to heap/GPU memory, it is Sync.
#[derive(Debug)]
pub struct Buffer {
// Start of the data: a host pointer for Cpu/Pinned buffers, a CUDA device pointer
// cast to a raw pointer for Gpu buffers.
pub(crate) ptr: *mut c_void,
// Size of the data in bytes (not in elements).
pub(crate) len: usize,
// Element type tag of the stored data.
pub(crate) data_type: DataType,
// Kind of memory `ptr` points into; selects the copy and free routines.
pub(crate) memory_type: MemoryType,
/// Whether `Drop` should release the memory behind `ptr`.
pub(crate) owned: bool,
}
// SAFETY: see the safety comment above the struct definition.
unsafe impl Send for Buffer {}
unsafe impl Sync for Buffer {}
/// Buffer creation section.
impl Buffer {
/// Try clone buffer content.
///
/// **Note**: If memory type is not Cpu, should be called in sync with cuda context pinned (check module level documentation for more info).
pub fn try_clone(&self) -> Result<Self, Error> {
self.check_mem_type_feature()?;
let sample_count = self.len / self.data_type.size() as usize;
let mut res = Buffer::alloc_with_data_type(sample_count, self.memory_type, self.data_type)?;
if self.memory_type == MemoryType::Gpu {
#[cfg(feature = "gpu")]
res.copy_from_cuda_array(0, unsafe { self.get_cuda_array() })?;
} else {
res.copy_from_slice(0, self.bytes())?;
}
Ok(res)
}
/// Allocate new buffer of requested memory type.\
/// `count`: size of buffer in `T` units (i.e. 128 chunks of f32 (that has byte size 512) should be allocated with `count=128`).\
/// `memory_type`: Cpu/Pinned/Gpu.
///
/// **Note**: If memory type is not Cpu, should be called in sync with cuda context pinned (check module level documentation for more info).
pub fn alloc<T: Sample>(count: usize, memory_type: MemoryType) -> Result<Self, Error> {
Self::alloc_with_data_type(count, memory_type, T::DATA_TYPE)
}
pub(crate) fn alloc_with_data_type(
count: usize,
memory_type: MemoryType,
data_type: DataType,
) -> Result<Self, Error> {
let data_type_size = data_type.size() as usize;
let size = count * data_type_size;
let ptr = match memory_type {
MemoryType::Cpu => Ok::<_, Error>(unsafe { calloc(count as _, data_type_size) }),
MemoryType::Pinned => {
#[cfg(not(feature = "gpu"))]
return Err(Error::wrong_type(memory_type));
#[cfg(feature = "gpu")]
{
let mut data = std::ptr::null_mut::<c_void>();
cuda_call!(cuMemAllocHost_v2(&mut data, size))?;
Ok(data)
}
}
MemoryType::Gpu => {
#[cfg(not(feature = "gpu"))]
return Err(Error::wrong_type(memory_type));
#[cfg(feature = "gpu")]
{
let mut data = 0;
cuda_call!(cuMemAlloc_v2(&mut data, size))?;
Ok(data as *mut c_void)
}
}
}?;
if ptr.is_null() {
Err(Error::new(
ErrorCode::Internal,
format!("OutOfMemory. {memory_type:?}"),
))
} else {
Ok(Buffer {
ptr,
len: size,
data_type,
memory_type,
owned: true,
})
}
}
/// Create CPU buffer of data type `T::DARA_TYPE` from `slice` of T.
pub fn from<T: Sample, S: AsRef<[T]>>(slice: S) -> Self {
let slice = slice.as_ref();
let ptr = unsafe {
let ptr = calloc(slice.len(), std::mem::size_of::<T>()) as *mut T;
copy_nonoverlapping(slice.as_ptr(), ptr, slice.len());
ptr
};
Buffer {
ptr: ptr as *mut _,
len: size_of_val(slice),
data_type: T::DATA_TYPE,
memory_type: MemoryType::Cpu,
owned: true,
}
}
}
/// Wrap a [CudaArray] into a GPU [Buffer] of [DataType::Uint8].
/// Result memory type will be [MemoryType::Gpu].
///
/// Nothing is allocated or copied by this call: the resulting buffer simply points at the
/// data described by the argument. The buffer is created with `owned` set, so the device
/// memory is released when the buffer is dropped.
#[cfg(feature = "gpu")]
impl From<CudaArray> for Buffer {
    fn from(value: CudaArray) -> Self {
        let CudaArray {
            ptr: device_ptr,
            len,
        } = value;
        Buffer {
            ptr: device_ptr as *mut c_void,
            len,
            data_type: DataType::Uint8,
            memory_type: MemoryType::Gpu,
            owned: true,
        }
    }
}
/// Turn a [Buffer] into a [CudaArray] describing the same memory.
///
/// The buffer destructor is not invoked, so the data stays alive; releasing it becomes
/// the responsibility of whoever holds the array.
/// The memory type of the buffer is not checked by this conversion.
#[cfg(feature = "gpu")]
impl From<Buffer> for CudaArray {
    fn from(value: Buffer) -> CudaArray {
        // Wrapping the buffer suppresses `Buffer::drop`, keeping the memory allocated.
        let detached = std::mem::ManuallyDrop::new(value);
        CudaArray {
            ptr: detached.ptr as _,
            len: detached.len,
        }
    }
}
/// Buffer metadata section.
impl Buffer {
    /// Kind of memory (Cpu/Pinned/Gpu) the stored data lives in.
    pub fn memory_type(&self) -> MemoryType {
        let Self { memory_type, .. } = self;
        *memory_type
    }

    /// Element type tag of the stored data.
    pub fn data_type(&self) -> DataType {
        let Self { data_type, .. } = self;
        *data_type
    }

    /// Size of the stored data in bytes.
    pub fn size(&self) -> usize {
        self.len
    }

    /// `true` when the buffer holds zero bytes.
    pub fn is_empty(&self) -> bool {
        matches!(self.size(), 0)
    }
}
/// Buffer data permutation section.
impl Buffer {
/// Copy `source` content to self from the `offset` position.\
/// Returns error if offset + size_of_val(source) > self.size().
///
/// `offset`: offset (in bytes) from the beginning of the Buffer to location to copy `source` to.
/// `source`: slice of Samples.
///
/// **Note**: If self.memory_type is not Cpu, should be called in sync with cuda context pinned (check module level documentation for more info).
pub fn copy_from_slice<S: AsRef<[T]>, T: Sample>(
&mut self,
offset: usize,
source: S,
) -> Result<(), Error> {
self.check_mem_type_feature()?;
let slice = source.as_ref();
let byte_size = size_of_val(slice);
if self.len < byte_size + offset {
return Err(Error::new(
ErrorCode::Internal,
format!(
"copy_from_slice error: size mismatch! (required {}, buffer len {})",
byte_size + offset,
self.len
),
));
}
match self.memory_type {
MemoryType::Cpu | MemoryType::Pinned => unsafe {
copy_nonoverlapping(slice.as_ptr(), self.ptr.byte_add(offset) as _, slice.len());
},
MemoryType::Gpu => {
#[cfg(feature = "gpu")]
cuda_call!(cuMemcpyHtoD_v2(
self.ptr as CUdeviceptr + offset as CUdeviceptr,
slice.as_ptr() as _,
byte_size
))?;
}
}
Ok(())
}
/// Copy `source` content to self from the `offset` position.\
/// Returns error if offset + source.len > self.size().
///
/// `offset`: offset (in bytes) from the beginning of the Buffer to location to copy `source` to.
/// `source`: cuda array.
///
/// **Note**: This method should be called in sync with cuda context pinned (check module level documentation for more info).
#[cfg(feature = "gpu")]
pub fn copy_from_cuda_array(&mut self, offset: usize, source: CudaArray) -> Result<(), Error> {
let CudaArray { ptr, len } = source;
if len + offset > self.len {
return Err(Error::new(
ErrorCode::Internal,
format!(
"copy_from_cuda_array error: size mismatch (buffer len {}, required {})",
self.len,
len + offset
),
));
}
match self.memory_type {
MemoryType::Pinned | MemoryType::Cpu => {
cuda_call!(cuMemcpyDtoH_v2(
self.ptr.byte_add(offset),
ptr as CUdeviceptr,
len
))?;
}
MemoryType::Gpu => {
cuda_call!(cuMemcpyDtoD_v2(
self.ptr as CUdeviceptr + offset as CUdeviceptr,
ptr as CUdeviceptr,
len
))?;
}
}
Ok(())
}
/// Move this Buffer content to CPU memory.
///
/// **Note**: If self.memory_type() is not Cpu, method should be called in sync with cuda context pinned (check module level documentation for more info).
pub fn into_cpu(self) -> Result<Self, Error> {
self.into_mem_type(MemoryType::Cpu)
}
/// Move this Buffer content to Pinned memory.
///
/// **Note**: This method should be called in sync with cuda context pinned (check module level documentation for more info).
#[cfg(feature = "gpu")]
pub fn into_pinned(self) -> Result<Self, Error> {
self.into_mem_type(MemoryType::Pinned)
}
/// Move this Buffer content to Gpu memory.
///
/// **Note**: This method should be called in sync with cuda context pinned (check module level documentation for more info).
#[cfg(feature = "gpu")]
pub fn into_gpu(self) -> Result<Self, Error> {
self.into_mem_type(MemoryType::Gpu)
}
fn into_mem_type(self, mem_type: MemoryType) -> Result<Self, Error> {
self.check_mem_type_feature()?;
if self.memory_type == mem_type {
return Ok(self);
}
let sample_count = self.len / self.data_type.size() as usize;
let mut res = Buffer::alloc_with_data_type(sample_count, mem_type, self.data_type)?;
if self.memory_type == MemoryType::Gpu {
#[cfg(feature = "gpu")]
res.copy_from_cuda_array(0, unsafe { self.get_cuda_array() })?;
} else {
res.copy_from_slice(0, self.bytes())?;
}
Ok(res)
}
}
/// Obtaining buffer content section.
impl Buffer {
    /// Get buffer content as bytes.
    ///
    /// Will return nothing if self.memory_type == Gpu. Use [Buffer::get_owned_slice] instead.
    pub fn bytes(&self) -> &[u8] {
        if self.memory_type == MemoryType::Gpu {
            log::warn!("Use bytes() on Gpu Buffer. empty slice will be returned");
            return &[];
        }
        unsafe { slice::from_raw_parts(self.ptr as *const u8, self.len) }
    }

    /// Get buffer mutable content as bytes.
    ///
    /// Will return nothing if self.memory_type == Gpu. [Buffer::get_cuda_array] can be used instead to implement this logic.
    pub fn bytes_mut(&mut self) -> &mut [u8] {
        if self.memory_type == MemoryType::Gpu {
            log::warn!("Use bytes_mut() on Gpu Buffer. empty slice will be returned");
            return &mut [];
        }
        unsafe { slice::from_raw_parts_mut(self.ptr as *mut u8, self.len) }
    }

    /// Get content of the buffer as host located bytes.\
    /// `range`: part of the buffer (in bytes) to return.
    ///
    /// # Errors
    /// Returns an error if the memory type is not available in this build, if `range` is
    /// reversed, overflows or reaches past the end of the buffer, or if the CUDA copy fails.
    pub fn get_owned_slice<Range: RangeBounds<usize> + Debug>(
        &self,
        range: Range,
    ) -> Result<Vec<u8>, Error> {
        self.check_mem_type_feature()?;
        // Normalise to a half-open `[left, right)` interval; `None` marks a bound that does
        // not fit into `usize`.
        let left = match range.start_bound() {
            Bound::Unbounded => Some(0),
            Bound::Included(pos) => Some(*pos),
            Bound::Excluded(pos) => pos.checked_add(1),
        };
        let right = match range.end_bound() {
            Bound::Unbounded => Some(self.len),
            Bound::Included(pos) => pos.checked_add(1),
            Bound::Excluded(pos) => Some(*pos),
        };
        let (left, right) = match (left, right) {
            (Some(left), Some(right)) if left <= right && right <= self.len => (left, right),
            _ => {
                return Err(Error::new(
                    ErrorCode::InvalidArg,
                    format!(
                        "get_slice invalid range: {range:?}, buffer len is: {}",
                        self.len
                    ),
                ))
            }
        };
        if self.memory_type != MemoryType::Gpu {
            Ok(self.bytes()[left..right].to_vec())
        } else {
            let count = right - left;
            #[cfg_attr(not(feature = "gpu"), allow(unused_mut))]
            let mut res = Vec::with_capacity(count);
            #[cfg(feature = "gpu")]
            {
                cuda_call!(cuMemcpyDtoH_v2(
                    res.as_mut_ptr() as _,
                    self.ptr as CUdeviceptr + left as CUdeviceptr,
                    count
                ))?;
                // SAFETY: the capacity is `count` and the copy above initialised exactly
                // `count` bytes. The length must be the number of copied bytes, not the
                // length of the whole buffer.
                unsafe { res.set_len(count) };
            }
            Ok(res)
        }
    }

    /// Get content of the GPU based buffer.
    /// # Panics
    /// Panics if self.memory_type != Gpu.
    /// # Safety
    /// Returned struct points to the same location as buffer, so the rules are the same as sharing *mut on an object. \
    /// Be careful: Buffer will delete data on drop, so beware of freeing the memory twice.
    /// Also any shenanigans with the data during the inference are forbidden: Triton must have exclusive write access to data during the inference.
    #[cfg(feature = "gpu")]
    pub unsafe fn get_cuda_array(&self) -> CudaArray {
        if self.memory_type != MemoryType::Gpu {
            panic!("Invoking get_cuda_array for non GPU-based buffer");
        }
        CudaArray {
            ptr: self.ptr as _,
            len: self.len,
        }
    }

    /// Reject Pinned/Gpu buffers when the crate is built without the `gpu` feature.
    fn check_mem_type_feature(&self) -> Result<(), Error> {
        #[cfg(not(feature = "gpu"))]
        if self.memory_type != MemoryType::Cpu {
            return Err(Error::wrong_type(self.memory_type));
        }
        Ok(())
    }
}
impl<T: Sample> AsRef<[T]> for Buffer {
    /// Converts this type into a shared reference on the slice of T.
    ///
    /// The slice holds `size() / size_of::<T>()` elements.
    /// Will return nothing if self.memory_type == Gpu.
    /// # Panics
    /// Panics if T does not match Buffer data type.
    fn as_ref(&self) -> &[T] {
        if T::DATA_TYPE != self.data_type {
            panic!(
                "Buffer data_type {:?} != target slice data_type: {:?}",
                self.data_type,
                T::DATA_TYPE
            )
        }
        if self.memory_type == MemoryType::Gpu {
            log::warn!("Use as_ref() on Gpu Buffer. empty slice will be returned");
            return &[];
        }
        // `self.len` is a size in bytes, while a slice length counts elements; using the
        // byte size directly would make the slice reach past the end of the allocation.
        let count = self.len / std::mem::size_of::<T>();
        if count == 0 {
            // Also avoids building a slice from a possibly null pointer.
            return &[];
        }
        // NOTE(review): assumes `ptr` is suitably aligned for `T` — true for the allocations
        // made in this module, to be confirmed for buffers created elsewhere.
        unsafe { slice::from_raw_parts(self.ptr as *const T, count) }
    }
}
impl<T: Sample> AsMut<[T]> for Buffer {
    /// Converts this type into a mutable reference on the slice of T.
    ///
    /// The slice holds `size() / size_of::<T>()` elements.
    /// Will return nothing if self.memory_type == Gpu.
    /// # Panics
    /// Panics if T does not match Buffer data type.
    fn as_mut(&mut self) -> &mut [T] {
        if T::DATA_TYPE != self.data_type {
            panic!(
                "Buffer data_type {:?} != target slice data_type: {:?}",
                self.data_type,
                T::DATA_TYPE
            )
        }
        if self.memory_type == MemoryType::Gpu {
            log::warn!("Use as_mut() on Gpu Buffer. empty slice will be returned");
            return &mut [];
        }
        // `self.len` is a size in bytes, while a slice length counts elements; using the
        // byte size directly would allow writes past the end of the allocation.
        let count = self.len / std::mem::size_of::<T>();
        if count == 0 {
            // Also avoids building a slice from a possibly null pointer.
            return &mut [];
        }
        // NOTE(review): assumes `ptr` is suitably aligned for `T` — true for the allocations
        // made in this module, to be confirmed for buffers created elsewhere.
        unsafe { slice::from_raw_parts_mut(self.ptr as *mut T, count) }
    }
}
impl Drop for Buffer {
    /// Releases the memory behind `ptr` with the routine matching `memory_type`.
    ///
    /// Nothing happens for buffers that are not owned or whose pointer is null. Results of
    /// the CUDA free calls are ignored. Without the `gpu` feature only Cpu memory is released.
    fn drop(&mut self) {
        // Borrowed or never-allocated data is not ours to free.
        if !self.owned || self.ptr.is_null() {
            return;
        }
        unsafe {
            match self.memory_type {
                MemoryType::Cpu => free(self.ptr),
                MemoryType::Pinned => {
                    #[cfg(feature = "gpu")]
                    cuMemFreeHost(self.ptr);
                }
                MemoryType::Gpu => {
                    #[cfg(feature = "gpu")]
                    cuMemFree_v2(self.ptr as CUdeviceptr);
                }
            }
        }
    }
}