ulib 0.3.3

Universal data storage library for CPU/GPU heterogeneous applications
//! Universal vector-like array storage [`UVec`].

use super::*;
use std::sync::Mutex;
use bytemuck::Zeroable;
use std::hash::{ Hash, Hasher };
use std::ops::{ Deref, DerefMut, Index, IndexMut };
use std::fmt;

#[cfg(feature = "cuda")]
use cust::memory::{ DeviceBuffer, DeviceSlice, CopyDestination };
#[cfg(feature = "cuda")]
use cust::context::Context;

/// Universal vector-like array storage.
///
/// `UVec` is thread-safe: a read-only reference can be shared
/// across different threads. This is nontrivial because a read
/// from a `UVec` might schedule a copy across devices; the
/// per-device `read_locks` below serialize such copies.
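///
/// A minimal usage sketch (doctest ignored; it assumes the
/// crate root re-exports `UVec` and `Device`, and a CPU-only
/// build):
///
/// ```ignore
/// use ulib::{Device, UVec};
///
/// // Construction from `Vec` leaves the data valid on the CPU.
/// let mut v: UVec<u32> = vec![1, 2, 3].into();
/// // `DerefMut`/`IndexMut` write through a CPU slice.
/// v[0] = 10;
/// // `Deref`/`AsRef` read back like a normal slice.
/// assert_eq!(&v[..], &[10, 2, 3]);
/// ```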
pub struct UVec<T: UniversalCopy> {
    data_cpu: Option<Box<[T]>>,
    #[cfg(feature = "cuda")]
    data_cuda: [Option<DeviceBuffer<T>>; MAX_NUM_CUDA_DEVICES],
    /// A flag array recording data presence and validity.
    /// A `true` entry means the data on that device is valid.
    valid_flag: [bool; MAX_DEVICES],
    /// Read locks for all devices.
    ///
    /// These are not locked for any operation originating
    /// from a write access -- no need to do so because Rust
    /// guarantees the mutable reference is exclusive.
    ///
    /// They are not locked for a read-only reference either,
    /// as long as the device of interest is already valid for
    /// reading -- no need to do so because Rust guarantees no
    /// mutation is possible while a read-only reference is
    /// alive.
    ///
    /// They are ONLY locked when a copy across devices needs
    /// to be launched through a read-only reference.
    /// The lock, in this case, is also per receiver device.
    read_locks: [Mutex<()>; MAX_DEVICES],
    /// the length of content
    size: usize,
    /// the length of buffer
    capacity: usize,
}

impl<T: UniversalCopy + fmt::Debug> fmt::Debug for UVec<T> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let slice = self.as_ref();
        write!(f, "uvec[{}] = [", slice.len())?;
        for (i, e) in slice.iter().enumerate() {
            if i != 0 {
                write!(f, ", ")?;
            }
            if f.alternate() {
                write!(f, "{:#?}", e)?;
            }
            else {
                write!(f, "{:?}", e)?;
            }
        }
        write!(f, "]")
    }
}

impl<T: UniversalCopy> Default for UVec<T> {
    #[inline]
    fn default() -> Self {
        Self {
            data_cpu: None,
            #[cfg(feature = "cuda")]
            data_cuda: Default::default(),
            valid_flag: [false; MAX_DEVICES],
            read_locks: Default::default(),
            size: 0,
            capacity: 0
        }
    }
}

impl<T: UniversalCopy> From<Box<[T]>> for UVec<T> {
    #[inline]
    fn from(b: Box<[T]>) -> UVec<T> {
        let len = b.len();
        let mut valid_flag = [false; MAX_DEVICES];
        valid_flag[Device::CPU.to_id()] = true;
        Self {
            data_cpu: Some(b),
            #[cfg(feature = "cuda")]
            data_cuda: Default::default(),
            valid_flag,
            read_locks: Default::default(),
            size: len,
            capacity: len
        }
    }
}

impl<T: UniversalCopy> From<Vec<T>> for UVec<T> {
    #[inline]
    fn from(v: Vec<T>) -> UVec<T> {
        v.into_boxed_slice().into()
    }
}

impl<T: UniversalCopy> FromIterator<T> for UVec<T> {
    #[inline]
    fn from_iter<I: IntoIterator<Item = T>>(iter: I) -> Self {
        Vec::from_iter(iter).into()
    }
}

impl<T: UniversalCopy + Zeroable> UVec<T> {
    /// private function to allocate space for one device.
    ///
    /// Guaranteed to only modify the buffer and the validity
    /// bit of the specified device.
    /// (which is useful in the safety of read-schedule
    /// interior mutability.)
    #[inline]
    fn alloc_zeroed(&mut self, device: Device) {
        use Device::*;
        match device {
            CPU => {
                use std::alloc;
                self.data_cpu = Some(unsafe {
                    let ptr = alloc::alloc_zeroed(
                        alloc::Layout::array::<T>(
                            self.capacity
                        ).unwrap()) as *mut T;
                    // the box must span the whole allocation
                    // (capacity, not size), so that its drop
                    // layout matches the layout allocated above.
                    Box::from_raw(
                        core::ptr::slice_from_raw_parts_mut(
                            ptr, self.capacity))
                });
            },
            #[cfg(feature = "cuda")]
            CUDA(c) => {
                let _context = Context::new(
                    CUDA_DEVICES[c as usize].0).unwrap();
                self.data_cuda[c as usize] =
                    Some(DeviceBuffer::zeroed(self.capacity)
                         .unwrap());
            }
        }
    }
}

#[inline]
unsafe fn alloc_cpu_uninit<T: UniversalCopy>(
    sz: usize
) -> Box<[T]> {
    use std::alloc;
    let ptr = alloc::alloc(alloc::Layout::array::<T>(sz).unwrap())
        as *mut T;
    Box::from_raw(core::ptr::slice_from_raw_parts_mut(ptr, sz))
}

#[cfg(feature = "cuda")]
#[inline]
unsafe fn alloc_cuda_uninit<T: UniversalCopy>(
    sz: usize, dev: u8
) -> DeviceBuffer<T> {
    let _context = Context::new(CUDA_DEVICES[dev as usize].0)
        .unwrap();
    DeviceBuffer::uninitialized(sz).unwrap()
}

impl<T: UniversalCopy> UVec<T> {
    /// private function to allocate space for one device.
    ///
    /// Guaranteed to only modify the buffer and the validity
    /// bit of the specified device.
    /// (which is useful in the safety of read-schedule
    /// interior mutability.)
    #[inline]
    unsafe fn alloc_uninitialized(&mut self, device: Device) {
        use Device::*;
        match device {
            CPU => {
                self.data_cpu = Some(alloc_cpu_uninit(
                    self.capacity));
            },
            #[cfg(feature = "cuda")]
            CUDA(c) => {
                self.data_cuda[c as usize] = Some(
                    alloc_cuda_uninit(self.capacity, c));
            }
        }
    }
    
    /// private function to get one device with valid data
    #[inline]
    fn device_valid(&self) -> Option<Device> {
        self.valid_flag.iter().enumerate().find(|(_i, v)| **v)
            .map(|(i, _v)| Device::from_id(i))
    }

    /// schedule a device to make its data available.
    ///
    /// Guaranteed to only modify the buffer and the validity
    /// bit of the specified device.
    /// (which is useful in the safety of read-schedule
    /// interior mutability.)
    #[inline]
    fn schedule_device_read(&mut self, device: Device) {
        if self.valid_flag[device.to_id()] {
            return
        }
        use Device::*;
        let is_none = match device {
            CPU => self.data_cpu.is_none(),
            #[cfg(feature = "cuda")]
            CUDA(c) => self.data_cuda[c as usize].is_none()
        };
        if is_none {
            unsafe { self.alloc_uninitialized(device); }
        }
        let device_valid = self.device_valid().expect("no valid dev");
        match (device_valid, device) {
            (CPU, CPU) => {},
            #[cfg(feature = "cuda")]
            (CPU, CUDA(c)) => {
                let c = c as usize;
                self.data_cuda[c].as_mut().unwrap().index(..self.size)
                    .copy_from(
                        &self.data_cpu.as_ref().unwrap()[..self.size]
                    ).unwrap();
            },
            #[cfg(feature = "cuda")]
            (CUDA(c), CPU) => {
                let c = c as usize;
                self.data_cuda[c].as_ref().unwrap().index(..self.size)
                    .copy_to(
                        &mut self.data_cpu.as_mut().unwrap()[..self.size]
                    ).unwrap();
            },
            #[cfg(feature = "cuda")]
            (CUDA(c1), CUDA(c2)) => {
                let (c1, c2) = (c1 as usize, c2 as usize);
                assert_ne!(c1, c2);
                // unsafe is used to access one mutable element.
                // safety guaranteed by the above `assert_ne!`.
                let c2_mut = unsafe {
                    &mut *(self.data_cuda[c2].as_ref().unwrap()
                           as *const DeviceBuffer<T>
                           as *mut DeviceBuffer<T>)
                };
                self.data_cuda[c1].as_ref().unwrap().index(..self.size)
                    .copy_to(
                        &mut c2_mut.index(..self.size)
                    ).unwrap();
            }
        }
        self.valid_flag[device.to_id()] = true;
    }

    /// schedule a device to make its data available
    /// THROUGH a read-only reference.
    ///
    /// will acquire a lock if it is necessary.
    /// If you have mutable reference, use the lock-free
    /// `schedule_device_read` instead.
    #[inline]
    fn schedule_device_read_ro(&self, device: Device) {
        if self.valid_flag[device.to_id()] {
            return
        }
        let locked = self.read_locks[device.to_id()]
            .lock().unwrap();
        // safety guaranteed by the lock, and by the
        // guarantee of `schedule_device_read` that only
        // writes to fields related to the specified device.
        unsafe {
            (&mut *(self as *const UVec<T> as *mut UVec<T>))
                .schedule_device_read(device);
        }
        drop(locked);
    }

    /// schedule a device write. invalidates all other devices.
    #[inline]
    fn schedule_device_write(&mut self, device: Device) {
        if !self.valid_flag[device.to_id()] {
            self.schedule_device_read(device);
        }
        // only this device remains valid.
        self.valid_flag.fill(false);
        self.valid_flag[device.to_id()] = true;
    }

    #[inline]
    fn drop_all_buf(&mut self) {
        self.data_cpu = None;
        #[cfg(feature = "cuda")]
        for d in &mut self.data_cuda {
            *d = None;
        }
    }

    #[inline]
    unsafe fn realloc_uninit_nopreserve(&mut self, device: Device) {
        self.drop_all_buf();
        if self.capacity > 10_000_000 {
            clilog::debug!("large realloc: capacity {}",
                           self.capacity);
        }
        self.alloc_uninitialized(device);
        self.valid_flag.fill(false);
        self.valid_flag[device.to_id()] = true;
    }
    
    #[inline]
    unsafe fn realloc_uninit_preserve(&mut self, device: Device) {
        use Device::*;
        match device {
            CPU => {
                let old = self.data_cpu.take().unwrap();
                self.drop_all_buf();
                self.alloc_uninitialized(device);
                self.data_cpu.as_mut().unwrap()[..self.size]
                    .copy_from_slice(&old[..self.size]);
            },
            #[cfg(feature = "cuda")]
            CUDA(c) => {
                let c = c as usize;
                let old = self.data_cuda[c].take().unwrap();
                self.drop_all_buf();
                self.alloc_uninitialized(device);
                self.data_cuda[c].as_mut().unwrap().index(..self.size)
                    .copy_from(&old.index(..self.size))
                    .unwrap();
            }
        }
        self.valid_flag.fill(false);
        self.valid_flag[device.to_id()] = true;
    }

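    /// Read a single element from whichever device currently
    /// holds valid data (copying one element back from a GPU
    /// if that is where the data lives).
    ///
    /// A sketch of the intended use (ignored; assumes `UVec`
    /// is re-exported at the crate root):
    ///
    /// ```ignore
    /// let v: ulib::UVec<i32> = vec![7, 8, 9].into();
    /// assert_eq!(v.get(1), 8);
    /// ```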
    #[inline]
    pub fn get(&self, idx: usize) -> T {
        use Device::*;
        match self.device_valid().unwrap() {
            CPU => self.data_cpu.as_ref().unwrap()[idx],
            #[cfg(feature = "cuda")]
            CUDA(c) => {
                // copy one element into a `MaybeUninit` buffer
                // instead of materializing an uninitialized
                // `[T; 1]`, which would be undefined behavior.
                let mut ret = std::mem::MaybeUninit::<[T; 1]>::uninit();
                self.data_cuda[c as usize].as_ref().unwrap()
                    .index(idx)
                    .copy_to(unsafe { &mut *ret.as_mut_ptr() })
                    .unwrap();
                // SAFETY: `copy_to` initialized the element above.
                unsafe { ret.assume_init() }[0]
            }
        }
    }
}

impl<T: UniversalCopy + Zeroable> UVec<T> {
    /// Create a new zeroed universal vector with specific size.
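    ///
    /// A sketch (ignored; assumes `UVec`/`Device` are exported
    /// at the crate root):
    ///
    /// ```ignore
    /// use ulib::{Device, UVec};
    /// let v: UVec<f32> = UVec::new_zeroed(8, Device::CPU);
    /// assert!(v.iter().all(|&x| x == 0.0));
    /// ```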
    #[inline]
    pub fn new_zeroed(size: usize, device: Device) -> UVec<T> {
        let mut v: UVec<T> = Default::default();
        v.size = size;
        v.capacity = size;
        v.alloc_zeroed(device);
        v.valid_flag[device.to_id()] = true;
        v
    }
}

impl<T: UniversalCopy> UVec<T> {
    /// Get length (size) of this vector.
    #[inline]
    pub fn len(&self) -> usize {
        self.size
    }
    
    /// Get capacity of this vector.
    #[inline]
    pub fn capacity(&self) -> usize {
        self.capacity
    }

    /// New empty vector (can be used as placeholder).
    #[inline]
    pub fn new() -> UVec<T> {
        unsafe { Self::new_uninitialized(0, Device::CPU) }
    }
    
    /// Create a new uninitialized universal vector with
    /// specific size.
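    ///
    /// # Safety
    ///
    /// The contents are uninitialized; the caller must write
    /// every element before reading it.
    ///
    /// A sketch (ignored; crate-root paths assumed):
    ///
    /// ```ignore
    /// use ulib::{Device, UVec};
    /// let mut v: UVec<u32> =
    ///     unsafe { UVec::new_uninitialized(4, Device::CPU) };
    /// v.as_mut().fill(42);          // initialize before reading
    /// assert_eq!(&v[..], &[42; 4]);
    /// ```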
    #[inline]
    pub unsafe fn new_uninitialized(
        size: usize, device: Device
    ) -> UVec<T> {
        let mut v: UVec<T> = Default::default();
        v.size = size;
        v.capacity = size;
        v.alloc_uninitialized(device);
        v.valid_flag[device.to_id()] = true;
        v
    }

    /// Resize the universal vector, but do **not** preserve the
    /// original content.
    /// The potential new elements are **uninitialized**.
    ///
    /// If the current capacity is sufficient, we do not
    /// reallocate; only the recorded size is updated and the
    /// validity flags are left unchanged.
    ///
    /// If the current capacity is insufficient, a reallocation
    /// is needed and all current allocations are dropped.
    /// (we maintain the invariant that all allocated buffers for
    /// all devices must all have the same length (= capacity).)
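    ///
    /// A sketch of the grow-then-overwrite pattern this is
    /// meant for (ignored; crate-root paths assumed):
    ///
    /// ```ignore
    /// use ulib::{Device, UVec};
    /// let mut v: UVec<u8> = vec![1, 2].into();
    /// unsafe { v.resize_uninit_nopreserve(100, Device::CPU); }
    /// v.as_mut().fill(0);           // rewrite everything
    /// assert_eq!(v.len(), 100);
    /// ```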
    #[inline]
    pub unsafe fn resize_uninit_nopreserve(&mut self, size: usize, device: Device) {
        if self.capacity < size {
            self.capacity = (size as f64 * 1.5).round() as usize;
            self.realloc_uninit_nopreserve(device);
        }
        self.size = size;
    }

    /// Resize the universal vector, and preserve all the
    /// original content.
    /// The potential new elements are **uninitialized**.
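    ///
    /// A sketch (ignored; crate-root paths assumed):
    ///
    /// ```ignore
    /// use ulib::{Device, UVec};
    /// let mut v: UVec<u8> = vec![1, 2, 3].into();
    /// unsafe { v.resize_uninit_preserve(5, Device::CPU); }
    /// v.as_mut()[3..].fill(0);      // initialize the new tail
    /// assert_eq!(&v[..], &[1, 2, 3, 0, 0]);
    /// ```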
    #[inline]
    pub unsafe fn resize_uninit_preserve(&mut self, size: usize, device: Device) {
        if self.size != 0 {
            self.schedule_device_read(device);
        }
        if self.capacity < size {
            self.capacity = (size as f64 * 1.5).round() as usize;
            self.realloc_uninit_preserve(device);
        }
        self.size = size;
        self.valid_flag.fill(false);
        self.valid_flag[device.to_id()] = true;
    }
}

impl<T: UniversalCopy> AsRef<[T]> for UVec<T> {
    /// Get a CPU slice reference.
    ///
    /// This can actually panic, namely when a copy from a GPU
    /// buffer to the CPU is needed and fails.
    /// That violates the convention that `as_ref` is
    /// infallible, but we have no better choice here.
    ///
    /// It locks only when a copy is needed.
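    ///
    /// A sketch of concurrent reads (ignored; assumes a build
    /// where `UVec<T>` ends up `Sync`, per the type-level docs):
    ///
    /// ```ignore
    /// let v: ulib::UVec<u64> = (0..1000).collect();
    /// std::thread::scope(|s| {
    ///     for _ in 0..4 {
    ///         s.spawn(|| assert_eq!(v.as_ref().len(), 1000));
    ///     }
    /// });
    /// ```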
    #[inline]
    fn as_ref(&self) -> &[T] {
        self.schedule_device_read_ro(Device::CPU);
        &self.data_cpu.as_ref().unwrap()[..self.size]
    }
}

impl<T: UniversalCopy> AsMut<[T]> for UVec<T> {
    /// Get a mutable CPU slice reference.
    ///
    /// This can actually panic, namely when a copy from a GPU
    /// buffer to the CPU is needed and fails.
    /// That violates the convention that `as_mut` is
    /// infallible, but we have no better choice here.
    ///
    /// It is lock-free.
    #[inline]
    fn as_mut(&mut self) -> &mut [T] {
        self.schedule_device_write(Device::CPU);
        &mut self.data_cpu.as_mut().unwrap()[..self.size]
    }
}

impl<T: UniversalCopy> Deref for UVec<T> {
    type Target = [T];
    /// `Deref` is implemented for `UVec` so that you can use
    /// it transparently as a slice.
    ///
    /// Internally it may panic, because it might schedule an
    /// inter-device copy to make the data available on the CPU.
    /// But it is thread-safe.
    #[inline]
    fn deref(&self) -> &[T] {
        self.as_ref()
    }
}

impl<T: UniversalCopy> DerefMut for UVec<T> {
    /// `DerefMut` is implemented for `UVec` so that you can
    /// use it transparently as a mutable slice.
    ///
    /// Internally it may panic, because it might schedule an
    /// inter-device copy to make the data available on the CPU.
    /// But it is thread-safe.
    #[inline]
    fn deref_mut(&mut self) -> &mut [T] {
        self.as_mut()
    }
}

impl<T: UniversalCopy, I> Index<I> for UVec<T> where [T]: Index<I> {
    type Output = <[T] as Index<I>>::Output;
    #[inline]
    fn index(&self, i: I) -> &Self::Output {
        self.as_ref().index(i)
    }
}

impl<T: UniversalCopy, I> IndexMut<I> for UVec<T> where [T]: IndexMut<I> {
    #[inline]
    fn index_mut(&mut self, i: I) -> &mut Self::Output {
        self.as_mut().index_mut(i)
    }
}

#[cfg(feature = "cuda")]
impl<T: UniversalCopy> AsCUDASlice<T> for UVec<T> {
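    /// Get a read-only CUDA device slice, scheduling a copy to
    /// that GPU first if its data is not yet valid there.
    ///
    /// A sketch (ignored; assumes a CUDA-enabled build and
    /// that CUDA device 0 exists):
    ///
    /// ```ignore
    /// use ulib::{Device, UVec};
    /// let v: UVec<f32> = vec![0.0f32; 1024].into();
    /// let d = v.as_cuda_slice(Device::CUDA(0));
    /// assert_eq!(d.len(), 1024);
    /// ```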
    #[inline]
    fn as_cuda_slice(&self, cuda_device: Device) -> DeviceSlice<T> {
        use Device::*;
        let c = match cuda_device {
            CUDA(c) => c as usize,
            _ => panic!("AsCUDASlice does not accept \
                         non-CUDA device {:?}", cuda_device)
        };
        self.schedule_device_read_ro(cuda_device);
        // construct a slice with only the first size elements.
        let ptr = self.data_cuda[c].as_ref().unwrap().as_device_ptr();
        unsafe { DeviceSlice::from_raw_parts(ptr, self.size) }
    }
}

#[cfg(feature = "cuda")]
impl<T: UniversalCopy> AsCUDASliceMut<T> for UVec<T> {
    #[inline]
    fn as_cuda_slice_mut(&mut self, cuda_device: Device) -> DeviceSlice<T> {
        use Device::*;
        let c = match cuda_device {
            CUDA(c) => c as usize,
            _ => panic!("AsCUDASlice does not accept \
                         non-CUDA device {:?}", cuda_device)
        };
        self.schedule_device_write(cuda_device);
        // construct a slice with only the first size elements.
        let ptr = self.data_cuda[c].as_ref().unwrap().as_device_ptr();
        unsafe { DeviceSlice::from_raw_parts(ptr, self.size) }
    }
}

impl<T: UniversalCopy> AsUPtr<T> for UVec<T> {
    #[inline]
    fn as_uptr(&self, device: Device) -> *const T {
        self.schedule_device_read_ro(device);
        use Device::*;
        match device {
            CPU => self.data_cpu.as_ref().unwrap().as_ptr(),
            #[cfg(feature = "cuda")]
            CUDA(c) => self.data_cuda[c as usize].as_ref().unwrap()
                .as_device_ptr().as_ptr()
        }
    }
}

impl<T: UniversalCopy> AsUPtrMut<T> for UVec<T> {
    #[inline]
    fn as_mut_uptr(&mut self, device: Device) -> *mut T {
        self.schedule_device_write(device);
        use Device::*;
        match device {
            CPU => self.data_cpu.as_mut().unwrap().as_mut_ptr(),
            #[cfg(feature = "cuda")]
            CUDA(c) => self.data_cuda[c as usize].as_mut().unwrap()
                .as_device_ptr().as_mut_ptr()
        }
    }
}

// Although convenient, the impls below get in the way of
// automatic type inference, so they are disabled.

// impl<T: UniversalCopy, const N: usize> AsUPtr<T> for UVec<[T; N]> {
//     /// convenient way to get flattened pointer
//     #[inline]
//     fn as_uptr(&self, device: Device) -> *const T {
//         AsUPtr::<[T; N]>::as_uptr(self, device) as *const T
//     }
// }

// impl<T: UniversalCopy, const N: usize> AsUPtrMut<T> for UVec<[T; N]> {
//     /// convenient way to get flattened pointer
//     #[inline]
//     fn as_mut_uptr(&mut self, device: Device) -> *mut T {
//         AsUPtrMut::<[T; N]>::as_mut_uptr(self, device) as *mut T
//     }
// }

impl<T, U: UniversalCopy> AsUPtr<U> for &T where T: AsUPtr<U> {
    #[inline]
    fn as_uptr(&self, device: Device) -> *const U {
        (*self).as_uptr(device)
    }
}

impl<T, U: UniversalCopy> AsUPtrMut<U> for &mut T where T: AsUPtrMut<U> {
    #[inline]
    fn as_mut_uptr(&mut self, device: Device) -> *mut U {
        (*self).as_mut_uptr(device)
    }
}

impl<T: UniversalCopy + Hash> Hash for UVec<T> {
    #[inline]
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.as_ref().hash(state)
    }
}

impl<T: UniversalCopy, U: UniversalCopy> PartialEq<UVec<U>> for UVec<T>
    where T: PartialEq<U>
{
    #[inline]
    fn eq(&self, other: &UVec<U>) -> bool {
        self.as_ref() == other.as_ref()
    }
}

impl<T: UniversalCopy + Eq> Eq for UVec<T> { }

impl<T: UniversalCopy> Clone for UVec<T> {
    fn clone(&self) -> Self {
        let valid_flag = self.valid_flag.clone();
        let data_cpu = match valid_flag[Device::CPU.to_id()] {
            true => self.data_cpu.clone(),
            false => None
        };
        #[cfg(feature = "cuda")]
        let data_cuda = unsafe {
            let mut data_cuda: [Option<DeviceBuffer<T>>; MAX_NUM_CUDA_DEVICES] = Default::default();
            for i in 0..MAX_NUM_CUDA_DEVICES {
                if valid_flag[Device::CUDA(i as u8).to_id()] {
                    let dbuf = alloc_cuda_uninit(self.capacity, i as u8);
                    self.data_cuda[i].as_ref().unwrap().index(..self.size)
                        .copy_to(&mut dbuf.index(..self.size))
                        .unwrap();
                    data_cuda[i] = Some(dbuf);
                }
            }
            data_cuda
        };
        UVec {
            data_cpu,
            #[cfg(feature = "cuda")] data_cuda,
            valid_flag,
            read_locks: Default::default(),
            size: self.size,
            capacity: self.capacity
        }
    }
}