cudarc 0.19.4 - Docs.rs

use super::core::*;
use crate::{
    cudnn::{result, result::CudnnError, sys},
    driver::{DevicePtr, DevicePtrMut},
};

use crate::cudnn::safe::activation::ActivationDescriptor;
use std::{marker::PhantomData, sync::Arc};

/// A descriptor of the filters for conv operation. Create with [`Cudnn::create_4d_filter()`]
#[derive(Debug)]
pub struct FilterDescriptor<T> {
    pub(crate) desc: sys::cudnnFilterDescriptor_t,
    #[allow(unused)]
    pub(crate) handle: Arc<Cudnn>,
    pub(crate) marker: PhantomData<T>,
}

impl Cudnn {
    /// Create a filter 4d descriptor.
    pub fn create_4d_filter<T: CudnnDataType>(
        self: &Arc<Cudnn>,
        format: sys::cudnnTensorFormat_t,
        dims: [std::ffi::c_int; 4],
    ) -> Result<FilterDescriptor<T>, CudnnError> {
        let desc = result::create_filter_descriptor()?;
        let desc = FilterDescriptor {
            desc,
            handle: self.clone(),
            marker: PhantomData,
        };
        unsafe { result::set_filter4d_descriptor(desc.desc, T::DATA_TYPE, format, dims) }?;
        Ok(desc)
    }

    /// Create a filter Nd descriptor.
    pub fn create_nd_filter<T: CudnnDataType>(
        self: &Arc<Cudnn>,
        format: sys::cudnnTensorFormat_t,
        dims: &[std::ffi::c_int],
    ) -> Result<FilterDescriptor<T>, CudnnError> {
        let desc = result::create_filter_descriptor()?;
        let desc = FilterDescriptor {
            desc,
            handle: self.clone(),
            marker: PhantomData,
        };
        unsafe {
            result::set_filternd_descriptor(
                desc.desc,
                T::DATA_TYPE,
                format,
                dims.len() as std::ffi::c_int,
                dims.as_ptr(),
            )
        }?;
        Ok(desc)
    }

    /// Create a filter 3d descriptor.
    pub fn create_3d_filter<T: CudnnDataType>(
        self: &Arc<Cudnn>,
        format: sys::cudnnTensorFormat_t,
        dims: [std::ffi::c_int; 3],
    ) -> Result<FilterDescriptor<T>, CudnnError> {
        self.create_nd_filter(format, &dims)
    }

    /// Create a filter 5d descriptor.
    pub fn create_5d_filter<T: CudnnDataType>(
        self: &Arc<Cudnn>,
        format: sys::cudnnTensorFormat_t,
        dims: [std::ffi::c_int; 5],
    ) -> Result<FilterDescriptor<T>, CudnnError> {
        self.create_nd_filter(format, &dims)
    }
}

impl<T> Drop for FilterDescriptor<T> {
    fn drop(&mut self) {
        let desc = std::mem::replace(&mut self.desc, std::ptr::null_mut());
        if !desc.is_null() {
            unsafe { result::destroy_filter_descriptor(desc) }.unwrap()
        }
    }
}

/// A descriptor for a conv operation holding stride, padding, and dilation.
#[derive(Debug)]
pub struct ConvDescriptor<T> {
    pub(crate) desc: sys::cudnnConvolutionDescriptor_t,
    pub(crate) handle: Arc<Cudnn>,
    pub(crate) marker: PhantomData<T>,
}
#[deprecated(note = "use ConvDescriptor instead. This will be removed in future versions")]
pub type Conv2dDescriptor<T> = ConvDescriptor<T>;

impl Cudnn {
    /// Creates a conv2d descriptor.
    /// - `pad` is the padding to apply to height and width of tensor
    /// - `stride` is the kernel strides
    /// - `dilation` is the kernel dilation
    /// - `mode` - CROSS_CORRELATION is standard convolution
    pub fn create_conv2d<T: CudnnDataType>(
        self: &Arc<Cudnn>,
        pad: [std::ffi::c_int; 2],
        stride: [std::ffi::c_int; 2],
        dilation: [std::ffi::c_int; 2],
        mode: sys::cudnnConvolutionMode_t,
    ) -> Result<ConvDescriptor<T>, CudnnError> {
        let [pad_h, pad_w] = pad;
        let [stride_h, stride_w] = stride;
        let [dilation_h, dilation_w] = dilation;
        let desc = result::create_convolution_descriptor()?;
        let desc = ConvDescriptor {
            desc,
            handle: self.clone(),
            marker: PhantomData,
        };
        unsafe {
            result::set_convolution2d_descriptor(
                desc.desc,
                pad_h,
                pad_w,
                stride_h,
                stride_w,
                dilation_h,
                dilation_w,
                mode,
                T::DATA_TYPE,
            )
        }?;
        Ok(desc)
    }

    /// Creates a ConvDescription for Nd convolution operations.
    /// - `pads` is an array the zero-padding size for each dimension
    /// - `strides` is an array for the kernel strides for each dimension
    /// - `dilations` is an array for the kernel dilation for each dimension
    /// - `mode` - CROSS_CORRELATION is standard convolution
    pub fn create_convnd<T: CudnnDataType>(
        self: &Arc<Cudnn>,
        pads: &[std::ffi::c_int],
        strides: &[std::ffi::c_int],
        dilations: &[std::ffi::c_int],
        mode: sys::cudnnConvolutionMode_t,
    ) -> Result<ConvDescriptor<T>, CudnnError> {
        let desc = result::create_convolution_descriptor()?;
        let desc = ConvDescriptor {
            desc,
            handle: self.clone(),
            marker: PhantomData,
        };
        unsafe {
            result::set_convolutionnd_descriptor(
                desc.desc,
                pads.len() as std::ffi::c_int,
                pads.as_ptr(),
                strides.as_ptr(),
                dilations.as_ptr(),
                mode,
                T::DATA_TYPE,
            )
        }?;
        Ok(desc)
    }
}

impl<T> ConvDescriptor<T> {
    /// Set's the math type for this convolution. Refer to [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetConvolutionMathType)
    /// for more information.
    pub fn set_math_type(&mut self, math_type: sys::cudnnMathType_t) -> Result<(), CudnnError> {
        unsafe { result::set_convolution_math_type(self.desc, math_type) }
    }

    /// Set's the group count for this convolution. Refer to [nvidia docs](https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnSetConvolutionGroupCount)
    /// for more information.
    pub fn set_group_count(&mut self, group_count: i32) -> Result<(), CudnnError> {
        unsafe { result::set_convolution_group_count(self.desc, group_count) }
    }
}

impl<T> Drop for ConvDescriptor<T> {
    fn drop(&mut self) {
        let desc = std::mem::replace(&mut self.desc, std::ptr::null_mut());
        if !desc.is_null() {
            unsafe { result::destroy_convolution_descriptor(desc) }.unwrap()
        }
    }
}

/// The convolution 2d forward operation. Pass in references to descriptors
/// directly, and then call:
/// 1. [`ConvForward::pick_algorithm()`] to use cudnn heuristics to select the algorithm
/// 2. [`ConvForward::get_workspace_size()`] to get required workspace size.
/// 3. [`ConvForward::launch()`] to execute it
#[derive(Debug)]
pub struct ConvForward<'a, X: CudnnDataType, C: CudnnDataType, Y: CudnnDataType> {
    /// Conv parameters
    pub conv: &'a ConvDescriptor<C>,
    /// Input tensor descriptor
    pub x: &'a TensorDescriptor<X>,
    /// Filter descriptor
    pub w: &'a FilterDescriptor<X>,
    /// Output tensor descriptor
    pub y: &'a TensorDescriptor<Y>,
}
#[deprecated(note = "use ConvForward instead. This will be removed in future versions")]
pub type Conv2dForward<'a, X, C, Y> = ConvForward<'a, X, C, Y>;

impl<X: CudnnDataType, C: CudnnDataType, Y: CudnnDataType> ConvForward<'_, X, C, Y> {
    /// Picks the fastest algorithm from all available cuDNN algorithms based on cudnn heuristics.
    pub fn pick_algorithm(&self) -> Result<sys::cudnnConvolutionFwdAlgo_t, CudnnError> {
        const NUM_ALGOS: usize = 8;
        debug_assert_eq!(
            sys::cudnnConvolutionFwdAlgo_t::CUDNN_CONVOLUTION_FWD_ALGO_COUNT as u32,
            NUM_ALGOS as u32
        );
        let mut returned_count = [0; 1];
        let mut perf_results = [unsafe { std::mem::zeroed() }; NUM_ALGOS];
        unsafe {
            result::get_convolution_forward_algorithm(
                self.conv.handle.handle,
                self.x.desc,
                self.w.desc,
                self.conv.desc,
                self.y.desc,
                NUM_ALGOS as std::ffi::c_int,
                returned_count.as_mut_ptr(),
                perf_results.as_mut_ptr(),
            )
        }?;
        assert!(returned_count[0] > 0);
        perf_results[0].status.result()?;
        Ok(perf_results[0].algo)
    }

    /// Returns size in **bytes** to execute the selected algorithm.
    pub fn get_workspace_size(
        &self,
        algo: sys::cudnnConvolutionFwdAlgo_t,
    ) -> Result<usize, CudnnError> {
        unsafe {
            result::get_convolution_forward_workspace_size(
                self.conv.handle.handle,
                self.x.desc,
                self.w.desc,
                self.conv.desc,
                self.y.desc,
                algo,
            )
        }
    }

    /// Launches the operation.
    ///
    /// - `src` is the input tensor
    /// - `filter` is the convolution kernels
    /// - `y` is the output
    ///
    /// # Safety
    /// The src/filter/y arguments must match the data type/layout specified in the
    /// descriptors in `self.
    pub unsafe fn launch<Workspace, Src, Filter, Dst>(
        &self,
        algo: sys::cudnnConvolutionFwdAlgo_t,
        workspace: Option<&mut Workspace>,
        (alpha, beta): (Y, Y),
        src: &Src,
        filter: &Filter,
        y: &mut Dst,
    ) -> Result<(), CudnnError>
    where
        Workspace: DevicePtrMut<u8>,
        Src: DevicePtr<X>,
        Filter: DevicePtr<X>,
        Dst: DevicePtrMut<Y>,
    {
        let stream = &self.x.handle.stream;
        let alpha = alpha.into_scaling_parameter();
        let beta = beta.into_scaling_parameter();
        let (src, _record_src) = src.device_ptr(stream);
        let (filter, _record_f) = filter.device_ptr(stream);
        let (y, _record_y) = y.device_ptr_mut(stream);
        let workspace_size_in_bytes = workspace.as_ref().map(|w| w.num_bytes()).unwrap_or(0);
        let (w, _record_w) = workspace.map(|w| w.device_ptr_mut(stream)).unzip();
        result::convolution_forward(
            self.conv.handle.handle,
            (&alpha) as *const Y::Scalar as *const std::ffi::c_void,
            self.x.desc,
            src as *const X as *const std::ffi::c_void,
            self.w.desc,
            filter as *const X as *const std::ffi::c_void,
            self.conv.desc,
            algo,
            w.map(|ptr| ptr as _).unwrap_or(std::ptr::null_mut()),
            workspace_size_in_bytes,
            (&beta) as *const Y::Scalar as *const std::ffi::c_void,
            self.y.desc,
            y as *mut Y as *mut std::ffi::c_void,
        )
    }
}

/// The convolution backward operation for the input tensor. Pass in references to descriptors
/// directly, and then call:
/// 1. [`ConvBackwardData::pick_algorithm()`] to use cudnn heuristics to select the algorithm
/// 2. [`ConvBackwardData::get_workspace_size()`] to get required workspace size.
/// 3. [`ConvBackwardData::launch()`] to execute it
#[derive(Debug)]
pub struct ConvBackwardData<'a, X: CudnnDataType, C: CudnnDataType, Y: CudnnDataType> {
    /// Conv descriptor
    pub conv: &'a ConvDescriptor<C>,
    /// Input tensor descriptor
    pub dx: &'a TensorDescriptor<X>,
    /// Filter descriptor
    pub w: &'a FilterDescriptor<X>,
    /// Output tensor descriptor
    pub dy: &'a TensorDescriptor<Y>,
}
#[deprecated(note = "use ConvBackwardData instead. This will be removed in future versions")]
pub type Conv2dBackwardData<'a, X, C, Y> = ConvBackwardData<'a, X, C, Y>;

impl<X: CudnnDataType, C: CudnnDataType, Y: CudnnDataType> ConvBackwardData<'_, X, C, Y> {
    /// Picks the fastest algorithm from all available cuDNN algorithms based on cudnn heuristics.
    pub fn pick_algorithm(&self) -> Result<sys::cudnnConvolutionBwdDataAlgo_t, CudnnError> {
        const NUM_ALGOS: usize = 6;
        debug_assert_eq!(
            sys::cudnnConvolutionBwdDataAlgo_t::CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT as u32,
            NUM_ALGOS as u32
        );
        let mut returned_count = [0; 1];
        let mut perf_results = [unsafe { std::mem::zeroed() }; NUM_ALGOS];
        unsafe {
            result::get_convolution_backward_data_algorithm(
                self.conv.handle.handle,
                self.w.desc,
                self.dy.desc,
                self.conv.desc,
                self.dx.desc,
                NUM_ALGOS as std::ffi::c_int,
                returned_count.as_mut_ptr(),
                perf_results.as_mut_ptr(),
            )
        }?;
        assert!(returned_count[0] > 0);
        perf_results[0].status.result()?;
        Ok(perf_results[0].algo)
    }

    /// Returns size in **bytes** to execute the selected algorithm.
    pub fn get_workspace_size(
        &self,
        algo: sys::cudnnConvolutionBwdDataAlgo_t,
    ) -> Result<usize, CudnnError> {
        unsafe {
            result::get_convolution_backward_data_workspace_size(
                self.conv.handle.handle,
                self.w.desc,
                self.dy.desc,
                self.conv.desc,
                self.dx.desc,
                algo,
            )
        }
    }

    /// Launches the operation.
    ///
    /// - `dx` is the gradient of the input tensor to populate
    /// - `filter` is the convolution kernels
    /// - `dy` is the gradient of the output tensor
    ///
    /// # Safety
    /// The arguments must match the data type/layout specified in the
    /// descriptors in `self.
    pub unsafe fn launch<Workspace, Src, Filter, Dst>(
        &self,
        algo: sys::cudnnConvolutionBwdDataAlgo_t,
        workspace: Option<&mut Workspace>,
        (alpha, beta): (Y, Y),
        dx: &mut Src,
        filter: &Filter,
        dy: &Dst,
    ) -> Result<(), CudnnError>
    where
        Workspace: DevicePtrMut<u8>,
        Src: DevicePtrMut<X>,
        Filter: DevicePtr<X>,
        Dst: DevicePtr<Y>,
    {
        let stream = &self.dx.handle.stream;
        let alpha = alpha.into_scaling_parameter();
        let beta = beta.into_scaling_parameter();
        let (dx, _record_dx) = dx.device_ptr_mut(stream);
        let (filter, _record_f) = filter.device_ptr(stream);
        let (dy, _record_dy) = dy.device_ptr(stream);
        let workspace_size_in_bytes = workspace.as_ref().map(|w| w.num_bytes()).unwrap_or(0);
        let (w, _record_w) = workspace.map(|w| w.device_ptr_mut(stream)).unzip();
        result::convolution_backward_data(
            self.conv.handle.handle,
            (&alpha) as *const Y::Scalar as *const std::ffi::c_void,
            self.w.desc,
            filter as *const X as *const std::ffi::c_void,
            self.dy.desc,
            dy as *const Y as *const std::ffi::c_void,
            self.conv.desc,
            algo,
            w.map(|ptr| ptr as _).unwrap_or(std::ptr::null_mut()),
            workspace_size_in_bytes,
            (&beta) as *const Y::Scalar as *const std::ffi::c_void,
            self.dx.desc,
            dx as *mut X as *mut std::ffi::c_void,
        )
    }
}

/// The convolution 2d backward operation for the filters. Pass in references to descriptors
/// directly, and then call:
/// 1. [`ConvBackwardFilter::pick_algorithm()`] to use cudnn heuristics to select the algorithm
/// 2. [`ConvBackwardFilter::get_workspace_size()`] to get required workspace size.
/// 3. [`ConvBackwardFilter::launch()`] to execute it
#[derive(Debug)]
pub struct ConvBackwardFilter<'a, X: CudnnDataType, C: CudnnDataType, Y: CudnnDataType> {
    /// Conv descriptor
    pub conv: &'a ConvDescriptor<C>,
    /// Input tensor descriptor
    pub x: &'a TensorDescriptor<X>,
    /// Filter descriptor
    pub dw: &'a FilterDescriptor<X>,
    /// Output tensor descriptor
    pub dy: &'a TensorDescriptor<Y>,
}
#[deprecated(note = "use ConvBackwardFilter instead. This will be removed in future versions")]
pub type Conv2dBackwardFilter<'a, X, C, Y> = ConvBackwardFilter<'a, X, C, Y>;

impl<X: CudnnDataType, C: CudnnDataType, Y: CudnnDataType> ConvBackwardFilter<'_, X, C, Y> {
    /// Picks the fastest algorithm from all available cuDNN algorithms based on cudnn heuristics.
    pub fn pick_algorithm(&self) -> Result<sys::cudnnConvolutionBwdFilterAlgo_t, CudnnError> {
        const NUM_ALGOS: usize = 7;
        debug_assert_eq!(
            sys::cudnnConvolutionBwdFilterAlgo_t::CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT as u32,
            NUM_ALGOS as u32
        );
        let mut returned_count = [0; 1];
        let mut perf_results = [unsafe { std::mem::zeroed() }; NUM_ALGOS];
        unsafe {
            result::get_convolution_backward_filter_algorithm(
                self.conv.handle.handle,
                self.x.desc,
                self.dy.desc,
                self.conv.desc,
                self.dw.desc,
                NUM_ALGOS as std::ffi::c_int,
                returned_count.as_mut_ptr(),
                perf_results.as_mut_ptr(),
            )
        }?;
        assert!(returned_count[0] > 0);
        perf_results[0].status.result()?;
        Ok(perf_results[0].algo)
    }

    /// Returns size in **bytes** to execute the selected algorithm.
    pub fn get_workspace_size(
        &self,
        algo: sys::cudnnConvolutionBwdFilterAlgo_t,
    ) -> Result<usize, CudnnError> {
        unsafe {
            result::get_convolution_backward_filter_workspace_size(
                self.conv.handle.handle,
                self.x.desc,
                self.dy.desc,
                self.conv.desc,
                self.dw.desc,
                algo,
            )
        }
    }

    /// Launches the operation.
    ///
    /// - `x` is the input tensor
    /// - `dfilter` is the gradient of the convolution kernels
    /// - `dy` is the gradient of the output tensor
    ///
    /// # Safety
    /// The arguments must match the data type/layout specified in the
    /// descriptors in `self.
    pub unsafe fn launch<Workspace, Src, Filter, Dst>(
        &self,
        algo: sys::cudnnConvolutionBwdFilterAlgo_t,
        workspace: Option<&mut Workspace>,
        (alpha, beta): (Y, Y),
        x: &Src,
        dfilter: &mut Filter,
        dy: &Dst,
    ) -> Result<(), CudnnError>
    where
        Workspace: DevicePtrMut<u8>,
        Src: DevicePtr<X>,
        Filter: DevicePtrMut<X>,
        Dst: DevicePtr<Y>,
    {
        let stream = &self.x.handle.stream;
        let alpha = alpha.into_scaling_parameter();
        let beta = beta.into_scaling_parameter();
        let (x, _record_x) = x.device_ptr(stream);
        let (dfilter, _record_f) = dfilter.device_ptr_mut(stream);
        let (dy, _record_dy) = dy.device_ptr(stream);
        let workspace_size_in_bytes = workspace.as_ref().map(|w| w.num_bytes()).unwrap_or(0);
        let (w, _record_w) = workspace.map(|w| w.device_ptr_mut(stream)).unzip();
        result::convolution_backward_filter(
            self.conv.handle.handle,
            (&alpha) as *const Y::Scalar as *const std::ffi::c_void,
            self.x.desc,
            x as *const _,
            self.dy.desc,
            dy as *const _,
            self.conv.desc,
            algo,
            w.map(|ptr| ptr as _).unwrap_or(std::ptr::null_mut()),
            workspace_size_in_bytes,
            (&beta) as *const Y::Scalar as *const std::ffi::c_void,
            self.dw.desc,
            dfilter as *mut _,
        )
    }
}

/// The bias + convolution + activation forward operation.
/// The full computation follows the equation `y = act (alpha1 * conv(x) + alpha2 * z + bias)`.
/// Pass in references to descriptors directly, and then call:
/// 1. [`ConvForward::pick_algorithm()`] to use cudnn heuristics to select the algorithm
/// 2. [`ConvForward::get_workspace_size()`] to get required workspace size.
/// 3. [`ConvForward::launch()`] to execute it
#[derive(Debug)]
pub struct ConvBiasActivationForward<
    'a,
    X: CudnnDataType,
    C: CudnnDataType,
    A: CudnnDataType,
    Y: CudnnDataType,
> {
    /// Conv parameters.
    pub conv: &'a ConvDescriptor<C>,
    /// Activation function.
    pub act: &'a ActivationDescriptor<A>,
    /// Input tensor descriptor
    pub x: &'a TensorDescriptor<X>,
    /// Filter descriptor
    pub w: &'a FilterDescriptor<X>,
    /// Z descriptor
    pub z: &'a TensorDescriptor<X>,
    /// Bias descriptor
    pub bias: &'a TensorDescriptor<X>,
    /// Output tensor descriptor
    pub y: &'a TensorDescriptor<Y>,
}

impl<X, C, A, Y> ConvBiasActivationForward<'_, X, C, A, Y>
where
    X: CudnnDataType,
    C: CudnnDataType,
    A: CudnnDataType,
    Y: CudnnDataType,
{
    /// Picks the fastest algorithm from all available cuDNN algorithms based on cudnn heuristics.
    pub fn pick_algorithm(&self) -> Result<sys::cudnnConvolutionFwdAlgo_t, CudnnError> {
        let conv = ConvForward {
            conv: self.conv,
            x: self.x,
            w: self.w,
            y: self.y,
        };
        conv.pick_algorithm()
    }

    /// Returns size in **bytes** to execute the selected algorithm.
    pub fn get_workspace_size(
        &self,
        algo: sys::cudnnConvolutionFwdAlgo_t,
    ) -> Result<usize, CudnnError> {
        let conv = ConvForward {
            conv: self.conv,
            x: self.x,
            w: self.w,
            y: self.y,
        };
        conv.get_workspace_size(algo)
    }

    /// Launches the operation.
    ///
    /// - `src` is the input tensor
    /// - `filter` is the convolution kernels
    /// - `y` is the output
    ///
    /// # Safety
    /// The src/filter/y arguments must match the data type/layout specified in the
    /// descriptors in `self.
    #[allow(clippy::too_many_arguments)]
    pub unsafe fn launch<Workspace, Src, Filter, Dst>(
        &self,
        algo: sys::cudnnConvolutionFwdAlgo_t,
        workspace: Option<&mut Workspace>,
        (alpha1, alpha2): (Y, Y),
        src: &Src,
        filter: &Filter,
        z: &Src,
        bias: &Src,
        y: &mut Dst,
    ) -> Result<(), CudnnError>
    where
        Workspace: DevicePtrMut<u8>,
        Src: DevicePtr<X>,
        Filter: DevicePtr<X>,
        Dst: DevicePtrMut<Y>,
    {
        let alpha1 = alpha1.into_scaling_parameter();
        let alpha2 = alpha2.into_scaling_parameter();
        let stream = &self.x.handle.stream;
        let (src, _record_src) = src.device_ptr(stream);
        let (filter, _record_f) = filter.device_ptr(stream);
        let (z, _record_z) = z.device_ptr(stream);
        let (bias, _record_bias) = bias.device_ptr(stream);
        let (y, _record_y) = y.device_ptr_mut(stream);
        let workspace_size_in_bytes = workspace.as_ref().map(|w| w.num_bytes()).unwrap_or(0);
        let (w, _record_w) = workspace.map(|w| w.device_ptr_mut(stream)).unzip();
        result::convolution_bias_activation_forward(
            self.conv.handle.handle,
            (&alpha1) as *const Y::Scalar as *const std::ffi::c_void,
            self.x.desc,
            src as *const X as *const std::ffi::c_void,
            self.w.desc,
            filter as *const X as *const std::ffi::c_void,
            self.conv.desc,
            algo,
            w.map(|ptr| ptr as _).unwrap_or(std::ptr::null_mut()),
            workspace_size_in_bytes,
            (&alpha2) as *const Y::Scalar as *const std::ffi::c_void,
            self.z.desc,
            z as *const X as *const std::ffi::c_void,
            self.bias.desc,
            bias as *const X as *const std::ffi::c_void,
            self.act.desc,
            self.y.desc,
            y as *mut Y as *mut std::ffi::c_void,
        )
    }
}