use super::*;
use std::sync::Mutex;
use bytemuck::Zeroable;
use std::hash::{ Hash, Hasher };
use std::ops::{ Deref, DerefMut, Index, IndexMut };
use std::fmt;
#[cfg(feature = "cuda")]
use cust::memory::{ DeviceBuffer, DeviceSlice, CopyDestination };
#[cfg(feature = "cuda")]
use cust::context::Context;
/// A "universal" vector whose contents can live on the host and/or on
/// one or more CUDA devices, with copies migrated lazily on demand.
pub struct UVec<T: UniversalCopy> {
/// Host-side buffer; `None` until first needed on the CPU.
data_cpu: Option<Box<[T]>>,
/// Per-CUDA-device buffers, allocated lazily.
#[cfg(feature = "cuda")]
data_cuda: [Option<DeviceBuffer<T>>; MAX_NUM_CUDA_DEVICES],
/// `valid_flag[d]` is true iff the device with id `d` currently holds
/// an up-to-date copy of the data.
valid_flag: [bool; MAX_DEVICES],
/// Per-device locks serializing concurrent `&self` migrations
/// (see `schedule_device_read_ro`).
read_locks: [Mutex<()>; MAX_DEVICES],
/// Number of logically live elements.
size: usize,
/// Number of elements each allocated buffer can hold (`>= size`).
capacity: usize,
}
impl<T: UniversalCopy + fmt::Debug> fmt::Debug for UVec<T> {
    /// Formats as `uvec[len] = [e0, e1, ...]`, honoring the alternate
    /// (`{:#?}`) flag for each element. Forces a host copy via `as_ref`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let elems = self.as_ref();
        write!(f, "uvec[{}] = [", elems.len())?;
        let mut first = true;
        for item in elems.iter() {
            if !first {
                write!(f, ", ")?;
            }
            first = false;
            match f.alternate() {
                true => write!(f, "{:#?}", item)?,
                false => write!(f, "{:?}", item)?,
            }
        }
        write!(f, "]")
    }
}
impl<T: UniversalCopy> Default for UVec<T> {
    /// An empty vector: no buffers allocated, no device valid, zero length.
    #[inline]
    fn default() -> Self {
        Self {
            size: 0,
            capacity: 0,
            data_cpu: None,
            #[cfg(feature = "cuda")]
            data_cuda: Default::default(),
            valid_flag: [false; MAX_DEVICES],
            read_locks: Default::default(),
        }
    }
}
impl<T: UniversalCopy> From<Box<[T]>> for UVec<T> {
    /// Adopts a boxed slice as the host buffer; only the CPU copy is
    /// marked valid, and capacity equals the slice length.
    #[inline]
    fn from(b: Box<[T]>) -> UVec<T> {
        let n = b.len();
        let mut flags = [false; MAX_DEVICES];
        flags[Device::CPU.to_id()] = true;
        Self {
            size: n,
            capacity: n,
            data_cpu: Some(b),
            #[cfg(feature = "cuda")]
            data_cuda: Default::default(),
            valid_flag: flags,
            read_locks: Default::default(),
        }
    }
}
impl<T: UniversalCopy> From<Vec<T>> for UVec<T> {
#[inline]
fn from(v: Vec<T>) -> UVec<T> {
v.into_boxed_slice().into()
}
}
impl<T: UniversalCopy> FromIterator<T> for UVec<T> {
    /// Collects into a `Vec` first, then adopts it as the host buffer.
    #[inline]
    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
        iter.into_iter().collect::<Vec<T>>().into()
    }
}
impl<T: UniversalCopy + Zeroable> UVec<T> {
    /// Allocates a zero-filled buffer of `self.capacity` elements on
    /// `device` and installs it in that device's buffer slot.
    ///
    /// Does not touch `valid_flag`; callers mark the device valid
    /// themselves (see `new_zeroed`).
    #[inline]
    fn alloc_zeroed(&mut self, device: Device) {
        use Device::*;
        match device {
            CPU => {
                use std::alloc;
                let layout = alloc::Layout::array::<T>(self.capacity)
                    .unwrap();
                self.data_cpu = Some(if layout.size() == 0 {
                    // `alloc_zeroed` is UB for zero-sized layouts
                    // (capacity 0, or zero-sized T). A well-aligned
                    // dangling pointer is a valid basis for a
                    // zero-sized boxed slice and is never deallocated.
                    unsafe {
                        Box::from_raw(core::ptr::slice_from_raw_parts_mut(
                            core::ptr::NonNull::<T>::dangling().as_ptr(),
                            self.capacity))
                    }
                } else {
                    unsafe {
                        let ptr = alloc::alloc_zeroed(layout) as *mut T;
                        if ptr.is_null() {
                            alloc::handle_alloc_error(layout);
                        }
                        // The box length must equal the allocated element
                        // count (`capacity`, NOT `size`): Box's destructor
                        // deallocates with the layout derived from the
                        // slice length, which must match the allocation
                        // layout exactly.
                        Box::from_raw(core::ptr::slice_from_raw_parts_mut(
                            ptr, self.capacity))
                    }
                });
            },
            #[cfg(feature = "cuda")]
            CUDA(c) => {
                let _context = Context::new(
                    CUDA_DEVICES[c as usize].0).unwrap();
                self.data_cuda[c as usize] =
                    Some(DeviceBuffer::zeroed(self.capacity)
                         .unwrap());
            }
        }
    }
}
/// Allocates an uninitialized boxed slice of `sz` elements on the host.
///
/// # Safety
/// The returned contents are uninitialized; the caller must write every
/// element before it is read.
#[inline]
unsafe fn alloc_cpu_uninit<T: UniversalCopy>(
    sz: usize
) -> Box<[T]> {
    use std::alloc;
    let layout = alloc::Layout::array::<T>(sz).unwrap();
    if layout.size() == 0 {
        // `alloc` is UB for zero-sized layouts, and `UVec::new()` reaches
        // here with `sz == 0`. For a zero-sized allocation any
        // well-aligned dangling pointer is valid and never deallocated.
        let ptr = core::ptr::NonNull::<T>::dangling().as_ptr();
        return Box::from_raw(core::ptr::slice_from_raw_parts_mut(ptr, sz));
    }
    let ptr = alloc::alloc(layout) as *mut T;
    if ptr.is_null() {
        // Allocation failure: abort via the standard OOM hook instead of
        // constructing a Box from a null pointer (instant UB).
        alloc::handle_alloc_error(layout);
    }
    Box::from_raw(core::ptr::slice_from_raw_parts_mut(ptr, sz))
}
/// Allocates an uninitialized `DeviceBuffer` of `sz` elements on CUDA
/// device `dev`.
///
/// # Safety
/// The buffer contents are uninitialized; the caller must fill them
/// before any read.
#[cfg(feature = "cuda")]
#[inline]
unsafe fn alloc_cuda_uninit<T: UniversalCopy>(
sz: usize, dev: u8
) -> DeviceBuffer<T> {
// NOTE(review): the Context created here is dropped when this function
// returns while the returned buffer outlives it — presumably cust's
// context/refcount semantics keep the allocation usable; confirm.
let _context = Context::new(CUDA_DEVICES[dev as usize].0)
.unwrap();
DeviceBuffer::uninitialized(sz).unwrap()
}
impl<T: UniversalCopy> UVec<T> {
/// Allocates an uninitialized buffer of `self.capacity` elements in the
/// slot for `device`. Does not touch `valid_flag`.
///
/// # Safety
/// The buffer contents are uninitialized; callers must fill them or
/// copy valid data in before exposing them to readers.
#[inline]
unsafe fn alloc_uninitialized(&mut self, device: Device) {
use Device::*;
match device {
CPU => {
self.data_cpu = Some(alloc_cpu_uninit(
self.capacity));
},
#[cfg(feature = "cuda")]
CUDA(c) => {
self.data_cuda[c as usize] = Some(
alloc_cuda_uninit(self.capacity, c));
}
}
}
/// Returns a device that currently holds valid data (the one with the
/// lowest device id), or `None` if no copy is valid.
#[inline]
fn device_valid(&self) -> Option<Device> {
self.valid_flag.iter().enumerate().find(|(_i, v)| **v)
.map(|(i, _v)| Device::from_id(i))
}
/// Ensures `device` holds a valid copy: allocates its buffer if missing
/// and copies the first `size` elements from some currently valid
/// device. No-op if `device` is already valid.
///
/// Panics (via `expect`) if no device holds valid data at all.
#[inline]
fn schedule_device_read(&mut self, device: Device) {
if self.valid_flag[device.to_id()] {
return
}
use Device::*;
// Lazily allocate the destination buffer on first use.
let is_none = match device {
CPU => self.data_cpu.is_none(),
#[cfg(feature = "cuda")]
CUDA(c) => self.data_cuda[c as usize].is_none()
};
if is_none {
unsafe { self.alloc_uninitialized(device); }
}
let device_valid = self.device_valid().expect("no valid dev");
// Copy source -> destination; only the `size` prefix is meaningful.
match (device_valid, device) {
(CPU, CPU) => {},
#[cfg(feature = "cuda")]
(CPU, CUDA(c)) => {
// host -> device upload
let c = c as usize;
self.data_cuda[c].as_mut().unwrap().index(..self.size)
.copy_from(
&self.data_cpu.as_ref().unwrap()[..self.size]
).unwrap();
},
#[cfg(feature = "cuda")]
(CUDA(c), CPU) => {
// device -> host download
let c = c as usize;
self.data_cuda[c].as_ref().unwrap().index(..self.size)
.copy_to(
&mut self.data_cpu.as_mut().unwrap()[..self.size]
).unwrap();
},
#[cfg(feature = "cuda")]
(CUDA(c1), CUDA(c2)) => {
// device -> device copy. Source and destination are distinct
// entries of `data_cuda`, but the borrow checker cannot see
// that through two indexings, hence the raw-pointer cast for
// the destination (guarded by the assert below).
let (c1, c2) = (c1 as usize, c2 as usize);
assert_ne!(c1, c2);
let c2_mut = unsafe {
&mut *(self.data_cuda[c2].as_ref().unwrap()
as *const DeviceBuffer<T>
as *mut DeviceBuffer<T>)
};
self.data_cuda[c1].as_ref().unwrap().index(..self.size)
.copy_to(
&mut c2_mut.index(..self.size)
).unwrap();
}
}
self.valid_flag[device.to_id()] = true;
}
/// Read-only variant of `schedule_device_read`, callable through
/// `&self`. The unlocked flag check is a fast path;
/// `schedule_device_read` re-checks the flag after the per-device
/// mutex is taken.
///
/// NOTE(review): this mutates through a shared reference via a raw
/// pointer cast without `UnsafeCell`; the Rust aliasing model does not
/// sanction that — consider interior mutability here.
#[inline]
fn schedule_device_read_ro(&self, device: Device) {
if self.valid_flag[device.to_id()] {
return
}
let locked = self.read_locks[device.to_id()]
.lock().unwrap();
unsafe {
(&mut *(self as *const UVec<T> as *mut UVec<T>))
.schedule_device_read(device);
}
drop(locked);
}
/// Prepares `device` for writing: makes its copy valid, then
/// invalidates every other device so later reads must re-sync from it.
#[inline]
fn schedule_device_write(&mut self, device: Device) {
if !self.valid_flag[device.to_id()] {
self.schedule_device_read(device);
}
self.valid_flag[..].fill(false);
self.valid_flag[device.to_id()] = true;
}
/// Frees every buffer on every device. Validity flags are untouched;
/// callers reset them as appropriate.
#[inline]
fn drop_all_buf(&mut self) {
self.data_cpu = None;
#[cfg(feature = "cuda")]
for d in &mut self.data_cuda {
*d = None;
}
}
/// Reallocates an uninitialized buffer of the (already updated)
/// `self.capacity` on `device`, discarding all previous contents;
/// afterwards only `device` is marked valid.
///
/// # Safety
/// The new buffer is uninitialized; see `alloc_uninitialized`.
#[inline]
unsafe fn realloc_uninit_nopreserve(&mut self, device: Device) {
self.drop_all_buf();
// Trace unusually large (10M+ element) reallocations.
if self.capacity > 10000000 {
clilog::debug!("large realloc: capacity {}",
self.capacity);
}
self.alloc_uninitialized(device);
self.valid_flag.fill(false);
self.valid_flag[device.to_id()] = true;
}
/// Like `realloc_uninit_nopreserve`, but first copies the old `size`
/// prefix from `device`'s previous buffer into the new one.
///
/// # Safety
/// Elements beyond `self.size` in the new buffer are uninitialized;
/// `device` must already have an allocated buffer holding valid data.
#[inline]
unsafe fn realloc_uninit_preserve(&mut self, device: Device) {
use Device::*;
match device {
CPU => {
// Keep the old host buffer alive until the copy completes.
let old = self.data_cpu.take().unwrap();
self.drop_all_buf();
self.alloc_uninitialized(device);
self.data_cpu.as_mut().unwrap()[..self.size]
.copy_from_slice(&old[..self.size]);
},
#[cfg(feature = "cuda")]
CUDA(c) => {
let c = c as usize;
let old = self.data_cuda[c].take().unwrap();
self.drop_all_buf();
self.alloc_uninitialized(device);
self.data_cuda[c].as_mut().unwrap().index(..self.size)
.copy_from(&old.index(..self.size))
.unwrap();
}
}
self.valid_flag.fill(false);
self.valid_flag[device.to_id()] = true;
}
/// Reads the single element at `idx` from whichever device currently
/// holds valid data, without migrating the whole buffer.
///
/// Panics if no device is valid.
#[inline]
pub fn get(&self, idx: usize) -> T {
use Device::*;
match self.device_valid().unwrap() {
CPU => self.data_cpu.as_ref().unwrap()[idx],
#[cfg(feature = "cuda")]
CUDA(c) => {
// NOTE(review): `assume_init` on uninitialized memory is UB
// in general; the `MaybeUninit` should stay un-asserted
// until after `copy_to` fills it — confirm and fix.
let mut ret: [T; 1] = unsafe {
std::mem::MaybeUninit::uninit().assume_init()
};
self.data_cuda[c as usize].as_ref().unwrap()
.index(idx)
.copy_to(&mut ret)
.unwrap();
ret[0]
}
}
}
}
impl<T: UniversalCopy + Zeroable> UVec<T> {
#[inline]
pub fn new_zeroed(size: usize, device: Device) -> UVec<T> {
let mut v: UVec<T> = Default::default();
v.size = size;
v.capacity = size;
v.alloc_zeroed(device);
v.valid_flag[device.to_id()] = true;
v
}
}
impl<T: UniversalCopy> UVec<T> {
/// Number of elements currently in the vector.
#[inline]
pub fn len(&self) -> usize {
self.size
}
/// Number of elements the allocated buffers can hold.
#[inline]
pub fn capacity(&self) -> usize {
self.capacity
}
/// Creates an empty vector homed on the CPU.
#[inline]
pub fn new() -> UVec<T> {
unsafe { Self::new_uninitialized(0, Device::CPU) }
}
/// Creates a vector of `size` elements on `device` without
/// initializing the contents.
///
/// # Safety
/// Every element must be written before it is read.
#[inline]
pub unsafe fn new_uninitialized(
size: usize, device: Device
) -> UVec<T> {
let mut v: UVec<T> = Default::default();
v.size = size;
v.capacity = size;
v.alloc_uninitialized(device);
v.valid_flag[device.to_id()] = true;
v
}
/// Resizes to `size` without preserving contents; capacity grows by
/// 1.5x when exceeded. If no reallocation happens, existing validity
/// flags are left as-is.
///
/// # Safety
/// After a growth reallocation the contents are uninitialized and must
/// be rewritten before reading.
#[inline]
pub unsafe fn resize_uninit_nopreserve(&mut self, size: usize, device: Device) {
if self.capacity < size {
self.capacity = (size as f64 * 1.5).round() as usize;
self.realloc_uninit_nopreserve(device);
}
self.size = size;
}
/// Resizes to `size`, preserving the existing prefix on `device`;
/// capacity grows by 1.5x when exceeded. Afterwards only `device` is
/// marked valid.
///
/// # Safety
/// Elements beyond the preserved prefix are uninitialized.
#[inline]
pub unsafe fn resize_uninit_preserve(&mut self, size: usize, device: Device) {
// Bring current contents to `device` first so they survive the
// possible reallocation below.
if self.size != 0 {
self.schedule_device_read(device);
}
if self.capacity < size {
self.capacity = (size as f64 * 1.5).round() as usize;
self.realloc_uninit_preserve(device);
}
self.size = size;
self.valid_flag.fill(false);
self.valid_flag[device.to_id()] = true;
}
}
impl<T: UniversalCopy> AsRef<[T]> for UVec<T> {
    /// Borrows the data as a host slice, first migrating it to the CPU
    /// if the CPU copy is stale.
    #[inline]
    fn as_ref(&self) -> &[T] {
        self.schedule_device_read_ro(Device::CPU);
        let n = self.size;
        let host = self.data_cpu.as_ref().unwrap();
        &host[..n]
    }
}
impl<T: UniversalCopy> AsMut<[T]> for UVec<T> {
    /// Borrows the data mutably on the host, invalidating every other
    /// device's copy first.
    #[inline]
    fn as_mut(&mut self) -> &mut [T] {
        self.schedule_device_write(Device::CPU);
        let n = self.size;
        let host = self.data_cpu.as_mut().unwrap();
        &mut host[..n]
    }
}
impl<T: UniversalCopy> Deref for UVec<T> {
    type Target = [T];
    /// Delegates to `AsRef<[T]>` (host slice, migrating if needed).
    #[inline]
    fn deref(&self) -> &[T] {
        <Self as AsRef<[T]>>::as_ref(self)
    }
}
impl<T: UniversalCopy> DerefMut for UVec<T> {
    /// Delegates to `AsMut<[T]>` (host slice, invalidating other devices).
    #[inline]
    fn deref_mut(&mut self) -> &mut [T] {
        <Self as AsMut<[T]>>::as_mut(self)
    }
}
impl<T: UniversalCopy, I> Index<I> for UVec<T> where [T]: Index<I> {
    type Output = <[T] as Index<I>>::Output;
    /// Indexes the host slice view (migrating to the CPU if needed).
    #[inline]
    fn index(&self, i: I) -> &Self::Output {
        let host: &[T] = self.as_ref();
        <[T] as Index<I>>::index(host, i)
    }
}
impl<T: UniversalCopy, I> IndexMut<I> for UVec<T> where [T]: IndexMut<I> {
    /// Mutably indexes the host slice view (invalidating other devices).
    #[inline]
    fn index_mut(&mut self, i: I) -> &mut Self::Output {
        let host: &mut [T] = self.as_mut();
        <[T] as IndexMut<I>>::index_mut(host, i)
    }
}
#[cfg(feature = "cuda")]
impl<T: UniversalCopy> AsCUDASlice<T> for UVec<T> {
/// Returns a device-side slice view on `cuda_device`, first ensuring
/// that device holds a valid copy of the data.
///
/// Panics if `cuda_device` is not a CUDA device.
#[inline]
fn as_cuda_slice(&self, cuda_device: Device) -> DeviceSlice<T> {
use Device::*;
let c = match cuda_device {
CUDA(c) => c as usize,
_ => panic!("AsCUDASlice does not accept \
non-CUDA device {:?}", cuda_device)
};
// Migrate to the device if needed (logically-const, lock-protected).
self.schedule_device_read_ro(cuda_device);
// Expose only the first `size` elements of the capacity-sized buffer.
let ptr = self.data_cuda[c].as_ref().unwrap().as_device_ptr();
unsafe { DeviceSlice::from_raw_parts(ptr, self.size) }
}
}
#[cfg(feature = "cuda")]
impl<T: UniversalCopy> AsCUDASliceMut<T> for UVec<T> {
/// Returns a writable device-side slice on `cuda_device`, invalidating
/// every other device's copy (see `schedule_device_write`).
///
/// Panics if `cuda_device` is not a CUDA device.
#[inline]
fn as_cuda_slice_mut(&mut self, cuda_device: Device) -> DeviceSlice<T> {
use Device::*;
let c = match cuda_device {
CUDA(c) => c as usize,
_ => panic!("AsCUDASlice does not accept \
non-CUDA device {:?}", cuda_device)
};
self.schedule_device_write(cuda_device);
// Expose only the first `size` elements of the capacity-sized buffer.
let ptr = self.data_cuda[c].as_ref().unwrap().as_device_ptr();
unsafe { DeviceSlice::from_raw_parts(ptr, self.size) }
}
}
impl<T: UniversalCopy> AsUPtr<T> for UVec<T> {
    /// Returns a read pointer to the buffer on `device`, first ensuring
    /// that device holds a valid copy of the data.
    #[inline]
    fn as_uptr(&self, device: Device) -> *const T {
        self.schedule_device_read_ro(device);
        use Device::*;
        match device {
            CPU => {
                let host = self.data_cpu.as_ref().unwrap();
                host.as_ptr()
            },
            #[cfg(feature = "cuda")]
            CUDA(c) => {
                let buf = self.data_cuda[c as usize].as_ref().unwrap();
                buf.as_device_ptr().as_ptr()
            }
        }
    }
}
impl<T: UniversalCopy> AsUPtrMut<T> for UVec<T> {
    /// Returns a write pointer to the buffer on `device`, invalidating
    /// every other device's copy (see `schedule_device_write`).
    #[inline]
    fn as_mut_uptr(&mut self, device: Device) -> *mut T {
        self.schedule_device_write(device);
        use Device::*;
        match device {
            CPU => {
                let host = self.data_cpu.as_mut().unwrap();
                host.as_mut_ptr()
            },
            #[cfg(feature = "cuda")]
            CUDA(c) => {
                let buf = self.data_cuda[c as usize].as_mut().unwrap();
                buf.as_device_ptr().as_mut_ptr()
            }
        }
    }
}
impl<T, U: UniversalCopy> AsUPtr<U> for &T where T: AsUPtr<U> {
    /// Forwards to the underlying `T` implementation.
    #[inline]
    fn as_uptr(&self, device: Device) -> *const U {
        // BUGFIX: `(*self).as_uptr(device)` has receiver type `&T`, and
        // method resolution finds this very `impl ... for &T` before it
        // derefs down to `T` — an unconditional infinite recursion.
        // Deref twice so the call dispatches on `T` itself, as std's
        // forwarding impls do.
        (**self).as_uptr(device)
    }
}
impl<T, U: UniversalCopy> AsUPtrMut<U> for &mut T where T: AsUPtrMut<U> {
    /// Forwards to the underlying `T` implementation.
    #[inline]
    fn as_mut_uptr(&mut self, device: Device) -> *mut U {
        // BUGFIX: `(*self).as_mut_uptr(device)` has receiver type
        // `&mut T`, and method resolution finds this very
        // `impl ... for &mut T` before it derefs down to `T` — an
        // unconditional infinite recursion. Deref twice so the call
        // dispatches on `T` itself, as std's forwarding impls do.
        (**self).as_mut_uptr(device)
    }
}
impl<T: UniversalCopy + Hash> Hash for UVec<T> {
    /// Hashes exactly like the equivalent host slice `&[T]`.
    #[inline]
    fn hash<H: Hasher>(&self, state: &mut H) {
        let host: &[T] = self.as_ref();
        host.hash(state)
    }
}
impl<T: UniversalCopy, U: UniversalCopy> PartialEq<UVec<U>> for UVec<T>
where T: PartialEq<U>
{
    /// Element-wise comparison of the host copies of both vectors.
    #[inline]
    fn eq(&self, other: &UVec<U>) -> bool {
        let lhs: &[T] = self.as_ref();
        let rhs: &[U] = other.as_ref();
        lhs == rhs
    }
}
// Total equality: delegates to the slice `PartialEq`, which is a full
// equivalence relation when `T: Eq`.
impl<T: UniversalCopy + Eq> Eq for UVec<T> { }
impl<T: UniversalCopy> Clone for UVec<T> {
/// Deep-copies the data on every device that currently holds a valid
/// copy; buffers for invalid devices are not materialized in the clone.
fn clone(&self) -> Self {
let valid_flag = self.valid_flag.clone();
// Clone the host buffer only if the CPU copy is valid.
let data_cpu = match valid_flag[Device::CPU.to_id()] {
true => self.data_cpu.clone(),
false => None
};
#[cfg(feature = "cuda")]
let data_cuda = unsafe {
let mut data_cuda: [Option<DeviceBuffer<T>>; MAX_NUM_CUDA_DEVICES] = Default::default();
for i in 0..MAX_NUM_CUDA_DEVICES {
if valid_flag[Device::CUDA(i as u8).to_id()] {
// Allocate a fresh capacity-sized buffer, then copy only
// the meaningful `size` prefix device-to-device.
let dbuf = alloc_cuda_uninit(self.capacity, i as u8);
self.data_cuda[i].as_ref().unwrap().index(..self.size)
.copy_to(&mut dbuf.index(..self.size))
.unwrap();
data_cuda[i] = Some(dbuf);
}
}
data_cuda
};
UVec {
data_cpu,
#[cfg(feature = "cuda")] data_cuda,
valid_flag,
// Fresh, unlocked mutexes for the clone.
read_locks: Default::default(),
size: self.size,
capacity: self.capacity
}
}
}