tenflowers-core 0.1.1

//! cuDNN integration for optimized neural network operations on NVIDIA GPUs
//!
//! This module provides cuDNN bindings and high-level abstractions for GPU-accelerated
//! neural network operations including convolution, pooling, normalization, and activation
//! functions.

#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
use crate::{Device, Result, Tensor, TensorError};
use std::collections::HashMap;
use std::sync::Arc;

/// cuDNN handle wrapper for managing cuDNN context
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
#[derive(Debug)]
pub struct CudnnHandle {
    /// cuDNN handle (placeholder for actual cudnnHandle_t)
    handle: *mut std::ffi::c_void,
    device_id: usize,
}

// SAFETY: CudnnHandle is safe to send across threads because cuDNN handles
// are thread-safe when proper synchronization is used. We protect access
// to the global context with a Mutex to ensure thread-safe access.
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
unsafe impl Send for CudnnHandle {}

// SAFETY: CudnnHandle is safe to share between threads when synchronized.
// The cuDNN library is thread-safe, and we use a Mutex to synchronize access.
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
unsafe impl Sync for CudnnHandle {}

/// cuDNN tensor descriptor for describing tensor layout
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
#[derive(Debug, Clone)]
pub struct CudnnTensorDescriptor {
    /// Tensor data type (float, half, etc.)
    data_type: CudnnDataType,
    /// Tensor format (NCHW, NHWC, etc.)
    format: CudnnTensorFormat,
    /// Tensor dimensions
    dimensions: Vec<i32>,
    /// Tensor strides
    strides: Vec<i32>,
}

/// cuDNN data types
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudnnDataType {
    Float,
    Double,
    Half,
    Int8,
    Int32,
    Int8x4,
    Uint8,
    Uint8x4,
    Int8x32,
}

/// cuDNN tensor formats
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudnnTensorFormat {
    NCHW,
    NHWC,
    NCHWVectC,
    NHWCVectC,
}

/// cuDNN activation functions
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudnnActivationMode {
    Sigmoid,
    Relu,
    Tanh,
    ClippedRelu,
    Elu,
    Identity,
    Swish,
}

/// cuDNN convolution descriptor
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
#[derive(Debug, Clone)]
pub struct CudnnConvolutionDescriptor {
    /// Padding in each dimension
    padding: Vec<i32>,
    /// Stride in each dimension
    stride: Vec<i32>,
    /// Dilation in each dimension
    dilation: Vec<i32>,
    /// Convolution mode (cross-correlation vs convolution)
    mode: CudnnConvolutionMode,
    /// Data type for computation
    compute_type: CudnnDataType,
}

/// cuDNN convolution modes
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudnnConvolutionMode {
    Convolution,
    CrossCorrelation,
}

/// cuDNN pooling descriptor
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
#[derive(Debug, Clone)]
pub struct CudnnPoolingDescriptor {
    /// Pooling mode (max, average, etc.)
    mode: CudnnPoolingMode,
    /// NaN propagation mode
    nan_opt: CudnnNanPropagation,
    /// Window size
    window_size: Vec<i32>,
    /// Padding
    padding: Vec<i32>,
    /// Stride
    stride: Vec<i32>,
}

/// cuDNN pooling modes
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudnnPoolingMode {
    Max,
    AverageCountIncludePadding,
    AverageCountExcludePadding,
    MaxDeterministic,
}

/// cuDNN NaN propagation options
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudnnNanPropagation {
    NotPropagateNan,
    PropagateNan,
}

/// High-level cuDNN operations manager
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
pub struct CudnnContext {
    /// cuDNN handles per device
    handles: HashMap<usize, Arc<CudnnHandle>>,
    /// Cached descriptors for performance
    tensor_descriptors: HashMap<String, CudnnTensorDescriptor>,
    convolution_descriptors: HashMap<String, CudnnConvolutionDescriptor>,
    pooling_descriptors: HashMap<String, CudnnPoolingDescriptor>,
}

#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
impl CudnnHandle {
    /// Create a new cuDNN handle for the specified device
    pub fn new(device_id: usize) -> Result<Self> {
        // Placeholder implementation
        // Real implementation would:
        // 1. Call cudaSetDevice(device_id)
        // 2. Call cudnnCreate(&handle)
        // 3. Call cudnnSetStream(handle, stream) if using custom stream

        println!("Creating cuDNN handle for device {}", device_id);

        Ok(CudnnHandle {
            handle: std::ptr::null_mut(), // Placeholder
            device_id,
        })
    }

    /// Get the device ID for this handle
    pub fn device_id(&self) -> usize {
        self.device_id
    }

    /// Set the CUDA stream for this cuDNN handle
    pub fn set_stream(&mut self, stream: *mut std::ffi::c_void) -> Result<()> {
        // Placeholder implementation
        // Real implementation would call cudnnSetStream(self.handle, stream)
        println!("Setting cuDNN stream for device {}", self.device_id);
        Ok(())
    }
}

#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
impl Drop for CudnnHandle {
    fn drop(&mut self) {
        // Placeholder implementation
        // Real implementation would call cudnnDestroy(self.handle)
        println!("Destroying cuDNN handle for device {}", self.device_id);
    }
}

#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
impl CudnnTensorDescriptor {
    /// Create a new tensor descriptor
    pub fn new(
        data_type: CudnnDataType,
        format: CudnnTensorFormat,
        dimensions: Vec<i32>,
    ) -> Result<Self> {
        // Calculate strides based on format
        let strides = Self::calculate_strides(&dimensions, format);

        Ok(CudnnTensorDescriptor {
            data_type,
            format,
            dimensions,
            strides,
        })
    }

    /// Create tensor descriptor from TenfloweRS tensor
    pub fn from_tensor<T>(tensor: &Tensor<T>) -> Result<Self>
    where
        T: Clone + Send + Sync + 'static,
    {
        let shape = tensor.shape();
        let dimensions: Vec<i32> = shape.iter().map(|&x| x as i32).collect();

        // Determine data type from T
        let data_type = Self::infer_data_type::<T>()?;

        // Default to NCHW format for now
        let format = CudnnTensorFormat::NCHW;

        Self::new(data_type, format, dimensions)
    }

    /// Calculate strides for the given dimensions and format
    fn calculate_strides(dimensions: &[i32], format: CudnnTensorFormat) -> Vec<i32> {
        let mut strides = vec![0; dimensions.len()];

        match format {
            CudnnTensorFormat::NCHW => {
                // NCHW: N×C×H×W
                if dimensions.len() >= 4 {
                    strides[3] = 1; // W stride
                    strides[2] = dimensions[3]; // H stride
                    strides[1] = dimensions[2] * dimensions[3]; // C stride
                    strides[0] = dimensions[1] * dimensions[2] * dimensions[3]; // N stride
                }
            }
            CudnnTensorFormat::NHWC => {
                // NHWC: N×H×W×C
                if dimensions.len() >= 4 {
                    strides[3] = 1; // C stride
                    strides[2] = dimensions[3]; // W stride
                    strides[1] = dimensions[2] * dimensions[3]; // H stride
                    strides[0] = dimensions[1] * dimensions[2] * dimensions[3]; // N stride
                }
            }
            _ => {
                // For other formats, use default row-major ordering
                let mut stride = 1;
                for i in (0..dimensions.len()).rev() {
                    strides[i] = stride;
                    stride *= dimensions[i];
                }
            }
        }

        strides
    }

    /// Infer cuDNN data type from Rust type
    fn infer_data_type<T>() -> Result<CudnnDataType>
    where
        T: 'static,
    {
        let type_id = std::any::TypeId::of::<T>();

        if type_id == std::any::TypeId::of::<f32>() {
            Ok(CudnnDataType::Float)
        } else if type_id == std::any::TypeId::of::<f64>() {
            Ok(CudnnDataType::Double)
        } else if type_id == std::any::TypeId::of::<i32>() {
            Ok(CudnnDataType::Int32)
        } else if type_id == std::any::TypeId::of::<i8>() {
            Ok(CudnnDataType::Int8)
        } else if type_id == std::any::TypeId::of::<u8>() {
            Ok(CudnnDataType::Uint8)
        } else {
            Err(TensorError::unsupported_operation_simple(format!(
                "Unsupported data type for cuDNN: {:?}",
                std::any::type_name::<T>()
            )))
        }
    }

    /// Get tensor element count
    pub fn element_count(&self) -> usize {
        self.dimensions.iter().map(|&x| x as usize).product()
    }

    /// Get tensor size in bytes
    pub fn size_in_bytes(&self) -> usize {
        let element_size = match self.data_type {
            CudnnDataType::Float => 4,
            CudnnDataType::Double => 8,
            CudnnDataType::Half => 2,
            CudnnDataType::Int8 | CudnnDataType::Uint8 => 1,
            CudnnDataType::Int32 => 4,
            _ => 4, // Default to 4 bytes
        };

        self.element_count() * element_size
    }
}

#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
impl CudnnConvolutionDescriptor {
    /// Create a new convolution descriptor
    pub fn new(
        padding: Vec<i32>,
        stride: Vec<i32>,
        dilation: Vec<i32>,
        mode: CudnnConvolutionMode,
        compute_type: CudnnDataType,
    ) -> Result<Self> {
        Ok(CudnnConvolutionDescriptor {
            padding,
            stride,
            dilation,
            mode,
            compute_type,
        })
    }

    /// Create convolution descriptor for 2D convolution
    pub fn conv2d(
        pad_h: i32,
        pad_w: i32,
        stride_h: i32,
        stride_w: i32,
        dilation_h: i32,
        dilation_w: i32,
        compute_type: CudnnDataType,
    ) -> Result<Self> {
        Self::new(
            vec![pad_h, pad_w],
            vec![stride_h, stride_w],
            vec![dilation_h, dilation_w],
            CudnnConvolutionMode::CrossCorrelation,
            compute_type,
        )
    }
}

#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
impl CudnnPoolingDescriptor {
    /// Create a new pooling descriptor
    pub fn new(
        mode: CudnnPoolingMode,
        nan_opt: CudnnNanPropagation,
        window_size: Vec<i32>,
        padding: Vec<i32>,
        stride: Vec<i32>,
    ) -> Result<Self> {
        Ok(CudnnPoolingDescriptor {
            mode,
            nan_opt,
            window_size,
            padding,
            stride,
        })
    }

    /// Create pooling descriptor for 2D max pooling
    pub fn max_pool2d(
        window_h: i32,
        window_w: i32,
        pad_h: i32,
        pad_w: i32,
        stride_h: i32,
        stride_w: i32,
    ) -> Result<Self> {
        Self::new(
            CudnnPoolingMode::Max,
            CudnnNanPropagation::NotPropagateNan,
            vec![window_h, window_w],
            vec![pad_h, pad_w],
            vec![stride_h, stride_w],
        )
    }

    /// Create pooling descriptor for 2D average pooling
    pub fn avg_pool2d(
        window_h: i32,
        window_w: i32,
        pad_h: i32,
        pad_w: i32,
        stride_h: i32,
        stride_w: i32,
        count_include_pad: bool,
    ) -> Result<Self> {
        let mode = if count_include_pad {
            CudnnPoolingMode::AverageCountIncludePadding
        } else {
            CudnnPoolingMode::AverageCountExcludePadding
        };

        Self::new(
            mode,
            CudnnNanPropagation::NotPropagateNan,
            vec![window_h, window_w],
            vec![pad_h, pad_w],
            vec![stride_h, stride_w],
        )
    }
}

#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
impl CudnnContext {
    /// Create a new cuDNN context
    pub fn new() -> Self {
        CudnnContext {
            handles: HashMap::new(),
            tensor_descriptors: HashMap::new(),
            convolution_descriptors: HashMap::new(),
            pooling_descriptors: HashMap::new(),
        }
    }

    /// Get or create cuDNN handle for device
    pub fn get_handle(&mut self, device_id: usize) -> Result<Arc<CudnnHandle>> {
        if let Some(handle) = self.handles.get(&device_id) {
            Ok(Arc::clone(handle))
        } else {
            let handle = Arc::new(CudnnHandle::new(device_id)?);
            self.handles.insert(device_id, Arc::clone(&handle));
            Ok(handle)
        }
    }

    /// Check if cuDNN is available
    pub fn is_available() -> bool {
        // Placeholder implementation
        // Real implementation would check for cuDNN library and compatible CUDA driver
        #[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
        {
            // Check environment variables or library availability
            std::env::var("CUDNN_LIBRARY_PATH").is_ok() || std::env::var("CUDA_PATH").is_ok()
        }
        #[cfg(not(feature = "cudnn"))]
        {
            false
        }
    }

    /// Get cuDNN version information
    pub fn version_info() -> Result<String> {
        // Placeholder implementation
        // Real implementation would call cudnnGetVersion()
        Ok("cuDNN 8.x.x (placeholder)".to_string())
    }

    /// Perform cuDNN convolution forward pass
    pub fn convolution_forward<T>(
        &mut self,
        input: &Tensor<T>,
        weights: &Tensor<T>,
        bias: Option<&Tensor<T>>,
        conv_desc: &CudnnConvolutionDescriptor,
        output_desc: &CudnnTensorDescriptor,
    ) -> Result<Tensor<T>>
    where
        T: Clone + Send + Sync + 'static,
    {
        let device_id = match input.device() {
            Device::Gpu(id) => *id,
            Device::Cpu => {
                return Err(TensorError::device_error_simple(
                    "cuDNN requires GPU device".to_string(),
                ))
            }
            #[cfg(feature = "rocm")]
            Device::Rocm(_) => {
                return Err(TensorError::device_error_simple(
                    "cuDNN not supported on ROCm devices".to_string(),
                ))
            }
        };

        let handle = self.get_handle(device_id)?;

        // Placeholder implementation
        // Real implementation would:
        // 1. Create tensor descriptors for input, weights, bias, output
        // 2. Find best convolution algorithm using cudnnFindConvolutionForwardAlgorithm
        // 3. Allocate workspace memory if needed
        // 4. Call cudnnConvolutionForward
        // 5. Add bias if present using cudnnAddTensor

        println!(
            "cuDNN convolution forward: device={}, input_shape={:?}",
            device_id,
            input.shape()
        );

        // For now, fall back to existing implementation
        Err(TensorError::unsupported_operation_simple(
            "cuDNN convolution not yet implemented - falling back to WGPU".to_string(),
        ))
    }

    /// Perform cuDNN pooling forward pass
    pub fn pooling_forward<T>(
        &mut self,
        input: &Tensor<T>,
        pooling_desc: &CudnnPoolingDescriptor,
        output_desc: &CudnnTensorDescriptor,
    ) -> Result<Tensor<T>>
    where
        T: Clone + Send + Sync + 'static,
    {
        let device_id = match input.device() {
            Device::Gpu(id) => *id,
            Device::Cpu => {
                return Err(TensorError::device_error_simple(
                    "cuDNN requires GPU device".to_string(),
                ))
            }
            #[cfg(feature = "rocm")]
            Device::Rocm(_) => {
                return Err(TensorError::device_error_simple(
                    "cuDNN not supported on ROCm devices".to_string(),
                ))
            }
        };

        let handle = self.get_handle(device_id)?;

        // Placeholder implementation
        // Real implementation would:
        // 1. Create tensor descriptors for input and output
        // 2. Call cudnnPoolingForward

        println!(
            "cuDNN pooling forward: device={}, input_shape={:?}",
            device_id,
            input.shape()
        );

        // For now, fall back to existing implementation
        Err(TensorError::unsupported_operation_simple(
            "cuDNN pooling not yet implemented - falling back to WGPU".to_string(),
        ))
    }

    /// Perform cuDNN activation forward pass
    pub fn activation_forward<T>(
        &mut self,
        input: &Tensor<T>,
        activation_mode: CudnnActivationMode,
        alpha: f64,
        beta: f64,
    ) -> Result<Tensor<T>>
    where
        T: Clone + Send + Sync + 'static,
    {
        let device_id = match input.device() {
            Device::Gpu(id) => *id,
            Device::Cpu => {
                return Err(TensorError::device_error_simple(
                    "cuDNN requires GPU device".to_string(),
                ))
            }
            #[cfg(feature = "rocm")]
            Device::Rocm(_) => {
                return Err(TensorError::device_error_simple(
                    "cuDNN not supported on ROCm devices".to_string(),
                ))
            }
        };

        let handle = self.get_handle(device_id)?;

        // Placeholder implementation
        // Real implementation would:
        // 1. Create activation descriptor using cudnnCreateActivationDescriptor
        // 2. Set activation descriptor using cudnnSetActivationDescriptor
        // 3. Call cudnnActivationForward

        println!(
            "cuDNN activation forward: device={}, mode={:?}",
            device_id, activation_mode
        );

        // For now, fall back to existing implementation
        Err(TensorError::unsupported_operation_simple(
            "cuDNN activation not yet implemented - falling back to WGPU".to_string(),
        ))
    }

    /// Clear cached descriptors
    pub fn clear_cache(&mut self) {
        self.tensor_descriptors.clear();
        self.convolution_descriptors.clear();
        self.pooling_descriptors.clear();
    }
}

#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
impl Default for CudnnContext {
    fn default() -> Self {
        Self::new()
    }
}

/// Global cuDNN context instance
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
static GLOBAL_CUDNN_CONTEXT: std::sync::OnceLock<std::sync::Mutex<CudnnContext>> =
    std::sync::OnceLock::new();

/// Get global cuDNN context
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
pub fn global_cudnn_context() -> std::sync::MutexGuard<'static, CudnnContext> {
    GLOBAL_CUDNN_CONTEXT
        .get_or_init(|| std::sync::Mutex::new(CudnnContext::new()))
        .lock()
        .expect("cuDNN context mutex poisoned")
}

/// Utility functions for cuDNN integration
#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
pub mod utils {
    use super::*;

    /// Check if a tensor is compatible with cuDNN
    pub fn is_tensor_cudnn_compatible<T>(tensor: &Tensor<T>) -> bool
    where
        T: 'static,
    {
        // Check if tensor is on GPU
        #[cfg(feature = "gpu")]
        {
            if !tensor.device().is_gpu() {
                return false;
            }
        }
        #[cfg(not(feature = "gpu"))]
        {
            return false;
        }

        // Check if data type is supported
        CudnnTensorDescriptor::infer_data_type::<T>().is_ok()
    }

    /// Convert tensor format between NCHW and NHWC
    pub fn convert_tensor_format<T>(
        tensor: &Tensor<T>,
        target_format: CudnnTensorFormat,
    ) -> Result<Tensor<T>>
    where
        T: Clone + Send + Sync + 'static,
    {
        // Placeholder implementation
        // Real implementation would perform tensor transpose to convert formats
        println!("Converting tensor format to {:?}", target_format);
        Ok(tensor.clone())
    }

    /// Get optimal cuDNN algorithm for convolution
    pub fn find_best_convolution_algorithm(
        input_desc: &CudnnTensorDescriptor,
        filter_desc: &CudnnTensorDescriptor,
        conv_desc: &CudnnConvolutionDescriptor,
        output_desc: &CudnnTensorDescriptor,
    ) -> Result<i32> {
        // Placeholder implementation
        // Real implementation would call cudnnFindConvolutionForwardAlgorithm
        println!("Finding best cuDNN convolution algorithm");
        Ok(0) // Return placeholder algorithm ID
    }

    /// Calculate workspace size needed for operation
    pub fn get_convolution_workspace_size(
        input_desc: &CudnnTensorDescriptor,
        filter_desc: &CudnnTensorDescriptor,
        conv_desc: &CudnnConvolutionDescriptor,
        output_desc: &CudnnTensorDescriptor,
        algorithm: i32,
    ) -> Result<usize> {
        // Placeholder implementation
        // Real implementation would call cudnnGetConvolutionForwardWorkspaceSize
        println!(
            "Calculating cuDNN workspace size for algorithm {}",
            algorithm
        );
        Ok(1024 * 1024) // Return 1MB placeholder
    }
}

#[cfg(all(feature = "cudnn", any(target_os = "linux", target_os = "windows")))]
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cudnn_context_creation() {
        let context = CudnnContext::new();
        assert!(context.handles.is_empty());
    }

    #[test]
    fn test_tensor_descriptor_creation() {
        let desc = CudnnTensorDescriptor::new(
            CudnnDataType::Float,
            CudnnTensorFormat::NCHW,
            vec![1, 3, 224, 224],
        )
        .expect("test: operation should succeed");

        assert_eq!(desc.data_type, CudnnDataType::Float);
        assert_eq!(desc.format, CudnnTensorFormat::NCHW);
        assert_eq!(desc.dimensions, vec![1, 3, 224, 224]);
        assert_eq!(desc.element_count(), 150528);
    }

    #[test]
    fn test_convolution_descriptor_creation() {
        let desc = CudnnConvolutionDescriptor::conv2d(
            1,
            1, // padding
            1,
            1, // stride
            1,
            1, // dilation
            CudnnDataType::Float,
        )
        .expect("test: operation should succeed");

        assert_eq!(desc.padding, vec![1, 1]);
        assert_eq!(desc.stride, vec![1, 1]);
        assert_eq!(desc.dilation, vec![1, 1]);
    }

    #[test]
    fn test_pooling_descriptor_creation() {
        let desc = CudnnPoolingDescriptor::max_pool2d(
            2, 2, // window
            0, 0, // padding
            2, 2, // stride
        )
        .expect("test: operation should succeed");

        assert_eq!(desc.mode, CudnnPoolingMode::Max);
        assert_eq!(desc.window_size, vec![2, 2]);
        assert_eq!(desc.stride, vec![2, 2]);
    }

    #[test]
    fn test_cudnn_availability() {
        // This test will vary based on system configuration
        let available = CudnnContext::is_available();
        println!("cuDNN available: {}", available);
        // We don't assert since availability depends on system setup
    }

    #[test]
    fn test_version_info() {
        let version = CudnnContext::version_info().expect("test: version_info should succeed");
        assert!(version.contains("cuDNN"));
    }

    #[test]
    fn test_stride_calculation() {
        let dims = vec![1, 3, 224, 224];
        let strides = CudnnTensorDescriptor::calculate_strides(&dims, CudnnTensorFormat::NCHW);

        // For NCHW: strides should be [150528, 50176, 224, 1]
        assert_eq!(strides, vec![150528, 50176, 224, 1]);
    }

    #[test]
    fn test_data_type_inference() {
        assert_eq!(
            CudnnTensorDescriptor::infer_data_type::<f32>()
                .expect("test: operation should succeed"),
            CudnnDataType::Float
        );
        assert_eq!(
            CudnnTensorDescriptor::infer_data_type::<f64>()
                .expect("test: operation should succeed"),
            CudnnDataType::Double
        );
        assert_eq!(
            CudnnTensorDescriptor::infer_data_type::<i32>()
                .expect("test: operation should succeed"),
            CudnnDataType::Int32
        );
    }
}