#![allow(clippy::needless_range_loop)]
use crate::{
shapes::*,
tensor::{masks::triangle_mask, storage_traits::*, unique_id, Cpu, CpuError, NoneTape, Tensor},
};
use super::{device::CachableCudaSlice, Cuda, CudaError};
use cudarc::driver::{CudaSlice, DeviceSlice};
use rand::Rng;
use std::{sync::Arc, vec::Vec};
impl Cuda {
fn tensor_from_host_buf<S: Shape, E: Unit>(
&self,
shape: S,
buf: Vec<E>,
) -> Result<Tensor<S, E, Self>, CudaError> {
let mut slice = unsafe { self.alloc_empty(buf.len()) }?;
self.dev.htod_copy_into(buf, &mut slice)?;
Ok(self.build_tensor(shape, shape.strides(), slice))
}
pub(crate) fn build_tensor<S: Shape, E: Unit>(
&self,
shape: S,
strides: S::Concrete,
slice: CudaSlice<E>,
) -> Tensor<S, E, Self> {
let data = CachableCudaSlice {
data: slice,
cache: self.cache.clone(),
};
Tensor {
id: unique_id(),
data: Arc::new(data),
shape,
strides,
device: self.clone(),
tape: Default::default(),
}
}
}
impl<E: Unit> ZerosTensor<E> for Cuda {
fn try_zeros_like<S: HasShape>(&self, src: &S) -> Result<Tensor<S::Shape, E, Self>, Self::Err> {
let shape = *src.shape();
let strides = shape.strides();
let mut data = unsafe { self.alloc_empty(shape.num_elements()) }?;
self.dev.memset_zeros(&mut data)?;
Ok(self.build_tensor(shape, strides, data))
}
}
impl<E: Unit> ZeroFillStorage<E> for Cuda {
fn try_fill_with_zeros(&self, storage: &mut Self::Vec) -> Result<(), Self::Err> {
self.dev.memset_zeros(&mut storage.data)?;
Ok(())
}
}
impl<E: Unit> OnesTensor<E> for Cuda
where
Cpu: OnesTensor<E>,
{
fn try_ones_like<S: HasShape>(&self, src: &S) -> Result<Tensor<S::Shape, E, Self>, Self::Err> {
let shape = *src.shape();
let buf = std::vec![E::ONE; shape.num_elements()];
self.tensor_from_host_buf(shape, buf)
}
}
impl<E: Unit> TriangleTensor<E> for Cuda
where
Cpu: TriangleTensor<E>,
{
fn try_upper_tri_like<S: HasShape>(
&self,
src: &S,
val: E,
diagonal: impl Into<Option<isize>>,
) -> Result<Tensor<S::Shape, E, Self>, Self::Err> {
let shape = *src.shape();
let mut data = std::vec![val; shape.num_elements()];
let offset = diagonal.into().unwrap_or(0);
triangle_mask(&mut data, &shape, true, offset);
self.tensor_from_host_buf(shape, data)
}
fn try_lower_tri_like<S: HasShape>(
&self,
src: &S,
val: E,
diagonal: impl Into<Option<isize>>,
) -> Result<Tensor<S::Shape, E, Self>, Self::Err> {
let shape = *src.shape();
let mut data = std::vec![val; shape.num_elements()];
let offset = diagonal.into().unwrap_or(0);
triangle_mask(&mut data, &shape, false, offset);
self.tensor_from_host_buf(shape, data)
}
}
impl<E: Unit> OneFillStorage<E> for Cuda {
fn try_fill_with_ones(&self, storage: &mut Self::Vec) -> Result<(), Self::Err> {
self.dev
.htod_copy_into(std::vec![E::ONE; storage.len()], &mut storage.data)?;
Ok(())
}
}
impl<E: Unit> SampleTensor<E> for Cuda
where
Cpu: SampleTensor<E>,
{
fn try_sample_like<S: HasShape, D: rand_distr::Distribution<E>>(
&self,
src: &S,
distr: D,
) -> Result<Tensor<S::Shape, E, Self>, Self::Err> {
let shape = *src.shape();
let mut buf = Vec::with_capacity(shape.num_elements());
{
#[cfg(not(feature = "no-std"))]
let mut rng = self.cpu.rng.lock().unwrap();
#[cfg(feature = "no-std")]
let mut rng = self.cpu.rng.lock();
buf.resize_with(shape.num_elements(), || rng.sample(&distr));
}
self.tensor_from_host_buf::<S::Shape, E>(shape, buf)
}
fn try_fill_with_distr<D: rand_distr::Distribution<E>>(
&self,
storage: &mut Self::Vec,
distr: D,
) -> Result<(), Self::Err> {
let mut buf = Vec::with_capacity(storage.len());
{
#[cfg(not(feature = "no-std"))]
let mut rng = self.cpu.rng.lock().unwrap();
#[cfg(feature = "no-std")]
let mut rng = self.cpu.rng.lock();
buf.resize_with(storage.len(), || rng.sample(&distr));
}
self.dev.htod_copy_into(buf, &mut storage.data)?;
Ok(())
}
}
impl<E: Unit> CopySlice<E> for Cuda {
fn copy_from<S: Shape, T>(dst: &mut Tensor<S, E, Self, T>, src: &[E]) {
assert_eq!(
dst.data.len(),
src.len(),
"Slices must have same number of elements as *physical* Storage<E> of tensors."
);
let storage = Arc::make_mut(&mut dst.data);
dst.device
.dev
.htod_sync_copy_into(src, &mut storage.data)
.unwrap();
}
fn copy_into<S: Shape, T>(src: &Tensor<S, E, Self, T>, dst: &mut [E]) {
assert_eq!(
src.data.len(),
dst.len(),
"Slices must have same number of elements as *physical* Storage<E> of tensors."
);
let storage: &Self::Vec = src.data.as_ref();
src.device
.dev
.dtoh_sync_copy_into(&storage.data, dst)
.unwrap();
}
}
impl<E: Unit> TensorFromVec<E> for Cuda {
fn try_tensor_from_vec<S: Shape>(
&self,
src: Vec<E>,
shape: S,
) -> Result<Tensor<S, E, Self>, Self::Err> {
let num_elements = shape.num_elements();
if src.len() != num_elements {
Err(CudaError::Cpu(CpuError::WrongNumElements))
} else {
self.tensor_from_host_buf(shape, src)
}
}
}
impl<S: Shape, E: Unit> TensorToArray<S, E> for Cuda
where
Cpu: TensorToArray<S, E> + Storage<E>,
{
type Array = <Cpu as TensorToArray<S, E>>::Array;
fn tensor_to_array<T>(&self, tensor: &Tensor<S, E, Self, T>) -> Self::Array {
let buf = tensor.as_vec();
let cpu_tensor = self.cpu.tensor_from_vec(buf, tensor.shape);
self.cpu.tensor_to_array::<NoneTape>(&cpu_tensor)
}
}