use alloc::vec::Vec;
use core::ffi::c_void;
use core::sync::atomic::{AtomicBool, Ordering};
use spin::Mutex;
use crate::compress::gpu_compress::GpuComputeProvider;
// Raw type aliases mirroring the CUDA driver API's C typedefs.
// Context/stream/module/function handles are opaque pointers owned by the driver.
pub type CuResult = i32;
pub type CuDevice = i32;
pub type CuContext = *mut c_void;
pub type CuStream = *mut c_void;
// Device memory address (CUdeviceptr); 64 bits wide on all supported platforms.
pub type CuDevicePtr = u64;
pub type CuModule = *mut c_void;
pub type CuFunction = *mut c_void;
// Subset of CUresult error codes this module recognizes explicitly;
// anything else is surfaced as CudaError::Unknown.
pub const CUDA_SUCCESS: CuResult = 0;
pub const CUDA_ERROR_NOT_INITIALIZED: CuResult = 3;
pub const CUDA_ERROR_DEINITIALIZED: CuResult = 4;
pub const CUDA_ERROR_NO_DEVICE: CuResult = 100;
pub const CUDA_ERROR_INVALID_DEVICE: CuResult = 101;
pub const CUDA_ERROR_INVALID_CONTEXT: CuResult = 201;
pub const CUDA_ERROR_CONTEXT_ALREADY_CURRENT: CuResult = 202;
pub const CUDA_ERROR_OUT_OF_MEMORY: CuResult = 2;
pub const CUDA_ERROR_INVALID_VALUE: CuResult = 1;
pub const CUDA_ERROR_INVALID_HANDLE: CuResult = 400;
pub const CUDA_ERROR_NOT_READY: CuResult = 600;
// CUdevice_attribute ordinals passed to cuDeviceGetAttribute.
pub const CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: i32 = 16;
pub const CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: i32 = 13;
pub const CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: i32 = 36;
pub const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: i32 = 75;
pub const CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: i32 = 76;
pub const CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: i32 = 8;
pub const CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: i32 = 9;
// Scheduling flags accepted by cuCtxCreate.
pub const CU_CTX_SCHED_AUTO: u32 = 0x00;
pub const CU_CTX_SCHED_SPIN: u32 = 0x01;
pub const CU_CTX_SCHED_YIELD: u32 = 0x02;
pub const CU_CTX_SCHED_BLOCKING_SYNC: u32 = 0x04;
pub const CU_CTX_MAP_HOST: u32 = 0x08;
// CUmemorytype values.
pub const CU_MEMORYTYPE_HOST: u32 = 0x01;
pub const CU_MEMORYTYPE_DEVICE: u32 = 0x02;
pub const CU_MEMORYTYPE_ARRAY: u32 = 0x03;
pub const CU_MEMORYTYPE_UNIFIED: u32 = 0x04;
// FFI declarations for the CUDA driver API (libcuda). The `_v2` suffixes
// select the current ABI revisions of the corresponding entry points.
unsafe extern "C" {
    // Driver bootstrap and device discovery.
    pub fn cuInit(flags: u32) -> CuResult;
    pub fn cuDeviceGetCount(count: *mut i32) -> CuResult;
    pub fn cuDeviceGet(device: *mut CuDevice, ordinal: i32) -> CuResult;
    // Writes a NUL-terminated device name into `name` (at most `len` bytes).
    pub fn cuDeviceGetName(name: *mut u8, len: i32, device: CuDevice) -> CuResult;
    pub fn cuDeviceTotalMem_v2(bytes: *mut usize, device: CuDevice) -> CuResult;
    pub fn cuDeviceGetAttribute(value: *mut i32, attrib: i32, device: CuDevice) -> CuResult;
    // Context lifecycle and current-context management.
    pub fn cuCtxCreate_v2(ctx: *mut CuContext, flags: u32, device: CuDevice) -> CuResult;
    pub fn cuCtxDestroy_v2(ctx: CuContext) -> CuResult;
    pub fn cuCtxPushCurrent_v2(ctx: CuContext) -> CuResult;
    pub fn cuCtxPopCurrent_v2(ctx: *mut CuContext) -> CuResult;
    pub fn cuCtxSetCurrent(ctx: CuContext) -> CuResult;
    pub fn cuCtxSynchronize() -> CuResult;
    // Device memory management and synchronous host<->device copies.
    pub fn cuMemAlloc_v2(dptr: *mut CuDevicePtr, bytesize: usize) -> CuResult;
    pub fn cuMemFree_v2(dptr: CuDevicePtr) -> CuResult;
    pub fn cuMemcpyHtoD_v2(dst: CuDevicePtr, src: *const c_void, bytecount: usize) -> CuResult;
    pub fn cuMemcpyDtoH_v2(dst: *mut c_void, src: CuDevicePtr, bytecount: usize) -> CuResult;
    // Streams.
    pub fn cuStreamCreate(stream: *mut CuStream, flags: u32) -> CuResult;
    pub fn cuStreamDestroy_v2(stream: CuStream) -> CuResult;
    pub fn cuStreamSynchronize(stream: CuStream) -> CuResult;
    // Module loading and kernel launch (not currently used by the provider
    // below, but part of the binding surface).
    pub fn cuModuleLoad(module: *mut CuModule, fname: *const u8) -> CuResult;
    pub fn cuModuleLoadData(module: *mut CuModule, image: *const c_void) -> CuResult;
    pub fn cuModuleUnload(module: CuModule) -> CuResult;
    pub fn cuModuleGetFunction(
        hfunc: *mut CuFunction,
        module: CuModule,
        name: *const u8,
    ) -> CuResult;
    pub fn cuLaunchKernel(
        f: CuFunction,
        grid_dim_x: u32,
        grid_dim_y: u32,
        grid_dim_z: u32,
        block_dim_x: u32,
        block_dim_y: u32,
        block_dim_z: u32,
        shared_mem_bytes: u32,
        stream: CuStream,
        kernel_params: *mut *mut c_void,
        extra: *mut *mut c_void,
    ) -> CuResult;
}
// Status code returned by every nvCOMP batched-API entry point.
pub type NvcompStatus = i32;
pub const NVCOMP_SUCCESS: NvcompStatus = 0;
pub const NVCOMP_ERROR_INVALID_VALUE: NvcompStatus = 1;
pub const NVCOMP_ERROR_NOT_SUPPORTED: NvcompStatus = 2;
pub const NVCOMP_ERROR_CUDA_ERROR: NvcompStatus = 3;
pub const NVCOMP_ERROR_INTERNAL: NvcompStatus = 4;
// Compression-algorithm selector values.
pub type NvcompType = i32;
pub const NVCOMP_TYPE_LZ4: NvcompType = 0;
pub const NVCOMP_TYPE_ZSTD: NvcompType = 5;
/// Format options passed by value to the batched LZ4 FFI entry points.
/// Layout must stay in sync with the C struct of the linked nvCOMP build —
/// NOTE(review): verify against that version's `nvcompBatchedLZ4Opts_t`.
#[repr(C)]
#[derive(Debug, Clone, Copy)]
pub struct NvcompBatchedLz4Opts {
    // 0 selects the default (byte) data type.
    pub data_type: i32,
    pub chunk_size: usize,
}
impl Default for NvcompBatchedLz4Opts {
    fn default() -> Self {
        Self {
            data_type: 0,
            // 64 KiB chunks — the default used throughout this module.
            chunk_size: 65536,
        }
    }
}
/// Format options for the batched Zstd compressor. The C-side struct carries
/// no tunables here, so a single reserved field keeps the ABI non-empty.
#[repr(C)]
#[derive(Debug, Clone, Copy, Default)]
pub struct NvcompBatchedZstdOpts {
    pub reserved: i32,
}
// FFI declarations for nvCOMP's batched LZ4/Zstd compression API.
// The *Async entry points enqueue work on the given stream; their results
// (compressed sizes) are valid only after the stream is synchronized.
unsafe extern "C" {
    /// Queries the scratch ("temp") buffer size required to compress a batch.
    pub fn nvcompBatchedLZ4CompressGetTempSize(
        batch_size: usize,
        max_uncompressed_chunk_bytes: usize,
        format_opts: NvcompBatchedLz4Opts,
        temp_bytes: *mut usize,
    ) -> NvcompStatus;
    /// Worst-case compressed size of a single chunk; output buffers must be
    /// at least this large.
    pub fn nvcompBatchedLZ4CompressGetMaxOutputChunkSize(
        max_uncompressed_chunk_bytes: usize,
        format_opts: NvcompBatchedLz4Opts,
        max_compressed_bytes: *mut usize,
    ) -> NvcompStatus;
    /// Enqueues batched LZ4 compression on `stream`.
    pub fn nvcompBatchedLZ4CompressAsync(
        device_uncompressed_ptrs: *const CuDevicePtr,
        device_uncompressed_bytes: *const usize,
        max_uncompressed_chunk_bytes: usize,
        batch_size: usize,
        device_temp_ptr: CuDevicePtr,
        temp_bytes: usize,
        device_compressed_ptrs: *const CuDevicePtr,
        device_compressed_bytes: *mut usize,
        format_opts: NvcompBatchedLz4Opts,
        stream: CuStream,
    ) -> NvcompStatus;
    /// Zstd analogue of the LZ4 temp-size query.
    pub fn nvcompBatchedZstdCompressGetTempSize(
        batch_size: usize,
        max_uncompressed_chunk_bytes: usize,
        format_opts: NvcompBatchedZstdOpts,
        temp_bytes: *mut usize,
    ) -> NvcompStatus;
    /// Zstd analogue of the LZ4 max-output-size query.
    pub fn nvcompBatchedZstdCompressGetMaxOutputChunkSize(
        max_uncompressed_chunk_bytes: usize,
        format_opts: NvcompBatchedZstdOpts,
        max_compressed_bytes: *mut usize,
    ) -> NvcompStatus;
    /// Enqueues batched Zstd compression on `stream`.
    pub fn nvcompBatchedZstdCompressAsync(
        device_uncompressed_ptrs: *const CuDevicePtr,
        device_uncompressed_bytes: *const usize,
        max_uncompressed_chunk_bytes: usize,
        batch_size: usize,
        device_temp_ptr: CuDevicePtr,
        temp_bytes: usize,
        device_compressed_ptrs: *const CuDevicePtr,
        device_compressed_bytes: *mut usize,
        format_opts: NvcompBatchedZstdOpts,
        stream: CuStream,
    ) -> NvcompStatus;
}
/// Typed view of the CUDA driver result codes this module cares about.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudaError {
    NotInitialized,
    Deinitialized,
    NoDevice,
    InvalidDevice,
    InvalidContext,
    OutOfMemory,
    InvalidValue,
    InvalidHandle,
    NotReady,
    // Preserves the raw CUresult for codes without a dedicated variant.
    Unknown(CuResult),
}
impl CudaError {
pub fn from_result(result: CuResult) -> Option<Self> {
if result == CUDA_SUCCESS {
return None;
}
Some(match result {
CUDA_ERROR_NOT_INITIALIZED => CudaError::NotInitialized,
CUDA_ERROR_DEINITIALIZED => CudaError::Deinitialized,
CUDA_ERROR_NO_DEVICE => CudaError::NoDevice,
CUDA_ERROR_INVALID_DEVICE => CudaError::InvalidDevice,
CUDA_ERROR_INVALID_CONTEXT => CudaError::InvalidContext,
CUDA_ERROR_OUT_OF_MEMORY => CudaError::OutOfMemory,
CUDA_ERROR_INVALID_VALUE => CudaError::InvalidValue,
CUDA_ERROR_INVALID_HANDLE => CudaError::InvalidHandle,
CUDA_ERROR_NOT_READY => CudaError::NotReady,
code => CudaError::Unknown(code),
})
}
pub fn description(&self) -> &'static str {
match self {
CudaError::NotInitialized => "CUDA driver not initialized",
CudaError::Deinitialized => "CUDA driver deinitialized",
CudaError::NoDevice => "No CUDA-capable device found",
CudaError::InvalidDevice => "Invalid device ordinal",
CudaError::InvalidContext => "Invalid CUDA context",
CudaError::OutOfMemory => "Out of device memory",
CudaError::InvalidValue => "Invalid value or parameter",
CudaError::InvalidHandle => "Invalid handle",
CudaError::NotReady => "Operation not ready",
CudaError::Unknown(_) => "Unknown CUDA error",
}
}
}
/// Typed view of nvCOMP status codes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NvcompError {
    InvalidValue,
    NotSupported,
    CudaError,
    Internal,
    // Preserves the raw status for codes without a dedicated variant.
    Unknown(NvcompStatus),
}
impl NvcompError {
pub fn from_status(status: NvcompStatus) -> Option<Self> {
if status == NVCOMP_SUCCESS {
return None;
}
Some(match status {
NVCOMP_ERROR_INVALID_VALUE => NvcompError::InvalidValue,
NVCOMP_ERROR_NOT_SUPPORTED => NvcompError::NotSupported,
NVCOMP_ERROR_CUDA_ERROR => NvcompError::CudaError,
NVCOMP_ERROR_INTERNAL => NvcompError::Internal,
code => NvcompError::Unknown(code),
})
}
pub fn description(&self) -> &'static str {
match self {
NvcompError::InvalidValue => "Invalid parameter",
NvcompError::NotSupported => "Operation not supported",
NvcompError::CudaError => "CUDA error occurred",
NvcompError::Internal => "Internal nvCOMP error",
NvcompError::Unknown(_) => "Unknown nvCOMP error",
}
}
}
/// Thin wrapper around the raw context pointer so it can be stored in the
/// provider's `Mutex`-guarded state.
#[derive(Clone, Copy)]
struct CudaContextHandle(CuContext);
// SAFETY: the pointer is an opaque driver handle that this module never
// dereferences; all uses go through the provider's mutex, which serializes
// access. NOTE(review): this also assumes the CUDA driver tolerates the
// handle being used from different threads — confirm for the target driver.
unsafe impl Send for CudaContextHandle {}
unsafe impl Sync for CudaContextHandle {}
impl CudaContextHandle {
    /// Wraps a raw handle. Caller must pass either null or a pointer obtained
    /// from a successful `cuCtxCreate_v2`.
    const unsafe fn from_raw(ctx: CuContext) -> Self {
        Self(ctx)
    }
    /// Returns the raw pointer for passing back to the driver.
    fn as_raw(self) -> CuContext {
        self.0
    }
    /// True when no context is held.
    fn is_null(self) -> bool {
        self.0.is_null()
    }
}
/// Thin wrapper around the raw stream pointer, mirroring `CudaContextHandle`.
#[derive(Clone, Copy)]
struct CudaStreamHandle(CuStream);
// SAFETY: opaque driver handle, never dereferenced here; access is serialized
// by the provider's mutex. NOTE(review): same thread-use assumption as the
// context handle — confirm for the target driver.
unsafe impl Send for CudaStreamHandle {}
unsafe impl Sync for CudaStreamHandle {}
impl CudaStreamHandle {
    /// Wraps a raw handle. Caller must pass either null or a pointer obtained
    /// from a successful `cuStreamCreate`.
    const unsafe fn from_raw(stream: CuStream) -> Self {
        Self(stream)
    }
    /// Returns the raw pointer for passing back to the driver.
    fn as_raw(self) -> CuStream {
        self.0
    }
    /// True when no stream is held.
    fn is_null(self) -> bool {
        self.0.is_null()
    }
}
/// Static properties of a CUDA device, captured once at query time.
#[derive(Debug, Clone)]
pub struct CudaDeviceInfo {
    pub ordinal: i32,
    pub name: [u8; 256],
    pub total_memory: usize,
    pub sm_count: i32,
    pub compute_major: i32,
    pub compute_minor: i32,
    pub memory_clock_khz: i32,
    pub memory_bus_width: i32,
}
impl CudaDeviceInfo {
    /// Device name as a `&str`, cut at the first NUL byte.
    /// Falls back to "Unknown" if the bytes are not valid UTF-8.
    pub fn name_str(&self) -> &str {
        let terminator = self.name.iter().position(|&b| b == 0);
        let raw = &self.name[..terminator.unwrap_or(self.name.len())];
        core::str::from_utf8(raw).unwrap_or("Unknown")
    }
    /// Theoretical peak memory bandwidth in GB/s, assuming double-data-rate
    /// memory (hence the factor of 2).
    pub fn memory_bandwidth_gbps(&self) -> f32 {
        let clock_hz = f64::from(self.memory_clock_khz) * 1000.0;
        let bus_bytes = f64::from(self.memory_bus_width) / 8.0;
        ((2.0 * clock_hz * bus_bytes) / 1e9) as f32
    }
}
/// Mutable provider state guarded by the `spin::Mutex` below: the owned
/// context, its stream, and the cached device properties.
struct CudaProviderState {
    context: CudaContextHandle,
    stream: CudaStreamHandle,
    device_info: CudaDeviceInfo,
}
/// GPU compute provider backed by the CUDA driver API and nvCOMP.
pub struct CudaComputeProvider {
    // Every driver call goes through this lock, serializing GPU access.
    state: Mutex<CudaProviderState>,
    // Set on successful construction, cleared in `drop`.
    initialized: AtomicBool,
}
// Process-wide flag recording whether cuInit has been attempted successfully.
static CUDA_INITIALIZED: AtomicBool = AtomicBool::new(false);
impl CudaComputeProvider {
pub fn new(device_ordinal: i32) -> Result<Self, CudaError> {
if !CUDA_INITIALIZED.swap(true, Ordering::SeqCst) {
let result = unsafe { cuInit(0) };
if let Some(err) = CudaError::from_result(result) {
CUDA_INITIALIZED.store(false, Ordering::SeqCst);
return Err(err);
}
}
let mut device_count: i32 = 0;
let result = unsafe { cuDeviceGetCount(&mut device_count) };
if let Some(err) = CudaError::from_result(result) {
return Err(err);
}
if device_count == 0 {
return Err(CudaError::NoDevice);
}
if device_ordinal >= device_count {
return Err(CudaError::InvalidDevice);
}
let mut device: CuDevice = 0;
let result = unsafe { cuDeviceGet(&mut device, device_ordinal) };
if let Some(err) = CudaError::from_result(result) {
return Err(err);
}
let device_info = Self::query_device_info(device, device_ordinal)?;
let mut context: CuContext = core::ptr::null_mut();
let result = unsafe { cuCtxCreate_v2(&mut context, CU_CTX_SCHED_AUTO, device) };
if let Some(err) = CudaError::from_result(result) {
return Err(err);
}
let mut stream: CuStream = core::ptr::null_mut();
let result = unsafe { cuStreamCreate(&mut stream, 0) };
if let Some(err) = CudaError::from_result(result) {
unsafe { cuCtxDestroy_v2(context) };
return Err(err);
}
let context_handle = unsafe { CudaContextHandle::from_raw(context) };
let stream_handle = unsafe { CudaStreamHandle::from_raw(stream) };
Ok(Self {
state: Mutex::new(CudaProviderState {
context: context_handle,
stream: stream_handle,
device_info,
}),
initialized: AtomicBool::new(true),
})
}
fn query_device_info(device: CuDevice, ordinal: i32) -> Result<CudaDeviceInfo, CudaError> {
let mut info = CudaDeviceInfo {
ordinal,
name: [0u8; 256],
total_memory: 0,
sm_count: 0,
compute_major: 0,
compute_minor: 0,
memory_clock_khz: 0,
memory_bus_width: 0,
};
let result = unsafe { cuDeviceGetName(info.name.as_mut_ptr(), 256, device) };
if let Some(err) = CudaError::from_result(result) {
return Err(err);
}
let result = unsafe { cuDeviceTotalMem_v2(&mut info.total_memory, device) };
if let Some(err) = CudaError::from_result(result) {
return Err(err);
}
let result = unsafe {
cuDeviceGetAttribute(
&mut info.sm_count,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
device,
)
};
if let Some(err) = CudaError::from_result(result) {
return Err(err);
}
let result = unsafe {
cuDeviceGetAttribute(
&mut info.compute_major,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
device,
)
};
if let Some(err) = CudaError::from_result(result) {
return Err(err);
}
let result = unsafe {
cuDeviceGetAttribute(
&mut info.compute_minor,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
device,
)
};
if let Some(err) = CudaError::from_result(result) {
return Err(err);
}
let result = unsafe {
cuDeviceGetAttribute(
&mut info.memory_clock_khz,
CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
device,
)
};
if let Some(err) = CudaError::from_result(result) {
return Err(err);
}
let result = unsafe {
cuDeviceGetAttribute(
&mut info.memory_bus_width,
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
device,
)
};
if let Some(err) = CudaError::from_result(result) {
return Err(err);
}
Ok(info)
}
pub fn device_info(&self) -> CudaDeviceInfo {
let state = self.state.lock();
state.device_info.clone()
}
pub fn is_initialized(&self) -> bool {
self.initialized.load(Ordering::SeqCst)
}
fn ensure_context_current(state: &CudaProviderState) -> bool {
if state.context.is_null() {
return false;
}
let result = unsafe { cuCtxSetCurrent(state.context.as_raw()) };
CudaError::from_result(result).is_none()
}
}
impl Drop for CudaComputeProvider {
fn drop(&mut self) {
let state = self.state.lock();
if !state.stream.is_null() {
unsafe { cuStreamDestroy_v2(state.stream.as_raw()) };
}
if !state.context.is_null() {
unsafe { cuCtxDestroy_v2(state.context.as_raw()) };
}
self.initialized.store(false, Ordering::SeqCst);
}
}
impl GpuComputeProvider for CudaComputeProvider {
fn allocate(&self, size: usize) -> Option<u64> {
if size == 0 {
return None;
}
let state = self.state.lock();
if !Self::ensure_context_current(&state) {
return None;
}
let mut device_ptr: CuDevicePtr = 0;
let result = unsafe { cuMemAlloc_v2(&mut device_ptr, size) };
if CudaError::from_result(result).is_some() {
return None;
}
Some(device_ptr)
}
fn free(&self, device_addr: u64) {
if device_addr == 0 {
return;
}
let state = self.state.lock();
if !Self::ensure_context_current(&state) {
return;
}
unsafe { cuMemFree_v2(device_addr) };
}
fn copy_to_device(&self, device_addr: u64, data: &[u8]) -> bool {
if device_addr == 0 || data.is_empty() {
return false;
}
let state = self.state.lock();
if !Self::ensure_context_current(&state) {
return false;
}
let result =
unsafe { cuMemcpyHtoD_v2(device_addr, data.as_ptr() as *const c_void, data.len()) };
CudaError::from_result(result).is_none()
}
fn copy_from_device(&self, device_addr: u64, data: &mut [u8]) -> bool {
if device_addr == 0 || data.is_empty() {
return false;
}
let state = self.state.lock();
if !Self::ensure_context_current(&state) {
return false;
}
let result =
unsafe { cuMemcpyDtoH_v2(data.as_mut_ptr() as *mut c_void, device_addr, data.len()) };
CudaError::from_result(result).is_none()
}
fn compress_lz4(
&self,
input_addr: u64,
input_size: usize,
output_addr: u64,
output_size: usize,
) -> usize {
if input_addr == 0 || output_addr == 0 || input_size == 0 || output_size == 0 {
return 0;
}
let state = self.state.lock();
if !Self::ensure_context_current(&state) {
return 0;
}
let opts = NvcompBatchedLz4Opts::default();
let mut temp_bytes: usize = 0;
let status =
unsafe { nvcompBatchedLZ4CompressGetTempSize(1, input_size, opts, &mut temp_bytes) };
if NvcompError::from_status(status).is_some() {
return 0;
}
let mut temp_ptr: CuDevicePtr = 0;
if temp_bytes > 0 {
let result = unsafe { cuMemAlloc_v2(&mut temp_ptr, temp_bytes) };
if CudaError::from_result(result).is_some() {
return 0;
}
}
let input_ptrs = [input_addr];
let input_sizes = [input_size];
let output_ptrs = [output_addr];
let mut output_sizes = [0usize];
let status = unsafe {
nvcompBatchedLZ4CompressAsync(
input_ptrs.as_ptr(),
input_sizes.as_ptr(),
input_size,
1, temp_ptr,
temp_bytes,
output_ptrs.as_ptr(),
output_sizes.as_mut_ptr(),
opts,
state.stream.as_raw(),
)
};
unsafe { cuStreamSynchronize(state.stream.as_raw()) };
if temp_ptr != 0 {
unsafe { cuMemFree_v2(temp_ptr) };
}
if NvcompError::from_status(status).is_some() {
return 0;
}
output_sizes[0].min(output_size)
}
fn compress_zstd(
&self,
input_addr: u64,
input_size: usize,
output_addr: u64,
output_size: usize,
_level: u8,
) -> usize {
if input_addr == 0 || output_addr == 0 || input_size == 0 || output_size == 0 {
return 0;
}
let state = self.state.lock();
if !Self::ensure_context_current(&state) {
return 0;
}
let opts = NvcompBatchedZstdOpts::default();
let mut temp_bytes: usize = 0;
let status =
unsafe { nvcompBatchedZstdCompressGetTempSize(1, input_size, opts, &mut temp_bytes) };
if NvcompError::from_status(status).is_some() {
return 0;
}
let mut temp_ptr: CuDevicePtr = 0;
if temp_bytes > 0 {
let result = unsafe { cuMemAlloc_v2(&mut temp_ptr, temp_bytes) };
if CudaError::from_result(result).is_some() {
return 0;
}
}
let input_ptrs = [input_addr];
let input_sizes = [input_size];
let output_ptrs = [output_addr];
let mut output_sizes = [0usize];
let status = unsafe {
nvcompBatchedZstdCompressAsync(
input_ptrs.as_ptr(),
input_sizes.as_ptr(),
input_size,
1, temp_ptr,
temp_bytes,
output_ptrs.as_ptr(),
output_sizes.as_mut_ptr(),
opts,
state.stream.as_raw(),
)
};
unsafe { cuStreamSynchronize(state.stream.as_raw()) };
if temp_ptr != 0 {
unsafe { cuMemFree_v2(temp_ptr) };
}
if NvcompError::from_status(status).is_some() {
return 0;
}
output_sizes[0].min(output_size)
}
fn synchronize(&self) {
let state = self.state.lock();
if !Self::ensure_context_current(&state) {
return;
}
unsafe { cuStreamSynchronize(state.stream.as_raw()) };
}
}
/// Returns the number of CUDA-capable devices visible to the driver.
///
/// `cuInit` is documented as idempotent, so it is called unconditionally
/// instead of being gated on `CUDA_INITIALIZED`: the old swap-based guard let
/// a second thread race past the flag before the first thread's `cuInit` had
/// actually completed.
///
/// # Errors
/// Propagates any driver error from `cuInit` or `cuDeviceGetCount`.
pub fn get_device_count() -> Result<i32, CudaError> {
    let result = unsafe { cuInit(0) };
    if let Some(err) = CudaError::from_result(result) {
        CUDA_INITIALIZED.store(false, Ordering::SeqCst);
        return Err(err);
    }
    CUDA_INITIALIZED.store(true, Ordering::SeqCst);
    let mut count: i32 = 0;
    let result = unsafe { cuDeviceGetCount(&mut count) };
    if let Some(err) = CudaError::from_result(result) {
        return Err(err);
    }
    Ok(count)
}
/// Queries the static properties of the device at `ordinal`.
///
/// `cuInit` is called unconditionally (it is documented as idempotent) to
/// avoid the initialization race described on `get_device_count`.
///
/// # Errors
/// Propagates any driver error from `cuInit`, `cuDeviceGet`, or the
/// individual property queries.
pub fn get_device_info(ordinal: i32) -> Result<CudaDeviceInfo, CudaError> {
    let result = unsafe { cuInit(0) };
    if let Some(err) = CudaError::from_result(result) {
        CUDA_INITIALIZED.store(false, Ordering::SeqCst);
        return Err(err);
    }
    CUDA_INITIALIZED.store(true, Ordering::SeqCst);
    let mut device: CuDevice = 0;
    let result = unsafe { cuDeviceGet(&mut device, ordinal) };
    if let Some(err) = CudaError::from_result(result) {
        return Err(err);
    }
    CudaComputeProvider::query_device_info(device, ordinal)
}
/// Enumerates every visible CUDA device, silently skipping devices whose
/// property queries fail. Returns an empty list when the driver itself
/// cannot be initialized or counted.
pub fn list_devices() -> Vec<CudaDeviceInfo> {
    match get_device_count() {
        Ok(count) => (0..count).filter_map(|i| get_device_info(i).ok()).collect(),
        Err(_) => Vec::new(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    // Result-code mapping: success yields None, known codes map to variants.
    #[test]
    fn test_cuda_error_codes() {
        assert!(CudaError::from_result(CUDA_SUCCESS).is_none());
        assert_eq!(
            CudaError::from_result(CUDA_ERROR_NOT_INITIALIZED),
            Some(CudaError::NotInitialized)
        );
        assert_eq!(
            CudaError::from_result(CUDA_ERROR_OUT_OF_MEMORY),
            Some(CudaError::OutOfMemory)
        );
        assert_eq!(
            CudaError::from_result(CUDA_ERROR_NO_DEVICE),
            Some(CudaError::NoDevice)
        );
    }
    #[test]
    fn test_cuda_error_descriptions() {
        assert_eq!(
            CudaError::NotInitialized.description(),
            "CUDA driver not initialized"
        );
        assert_eq!(CudaError::OutOfMemory.description(), "Out of device memory");
        assert_eq!(
            CudaError::NoDevice.description(),
            "No CUDA-capable device found"
        );
    }
    // Same mapping checks for the nvCOMP status codes.
    #[test]
    fn test_nvcomp_error_codes() {
        assert!(NvcompError::from_status(NVCOMP_SUCCESS).is_none());
        assert_eq!(
            NvcompError::from_status(NVCOMP_ERROR_INVALID_VALUE),
            Some(NvcompError::InvalidValue)
        );
        assert_eq!(
            NvcompError::from_status(NVCOMP_ERROR_NOT_SUPPORTED),
            Some(NvcompError::NotSupported)
        );
    }
    #[test]
    fn test_nvcomp_error_descriptions() {
        assert_eq!(NvcompError::InvalidValue.description(), "Invalid parameter");
        assert_eq!(
            NvcompError::NotSupported.description(),
            "Operation not supported"
        );
        assert_eq!(NvcompError::CudaError.description(), "CUDA error occurred");
    }
    // Default options: byte data type, 64 KiB chunks.
    #[test]
    fn test_lz4_opts_default() {
        let opts = NvcompBatchedLz4Opts::default();
        assert_eq!(opts.data_type, 0);
        assert_eq!(opts.chunk_size, 65536);
    }
    #[test]
    fn test_zstd_opts_default() {
        let opts = NvcompBatchedZstdOpts::default();
        assert_eq!(opts.reserved, 0);
    }
    // Null handles must report as null without touching the driver.
    #[test]
    fn test_context_handle_null_check() {
        let handle = unsafe { CudaContextHandle::from_raw(core::ptr::null_mut()) };
        assert!(handle.is_null());
    }
    #[test]
    fn test_stream_handle_null_check() {
        let handle = unsafe { CudaStreamHandle::from_raw(core::ptr::null_mut()) };
        assert!(handle.is_null());
    }
    // RTX-4090-like figures: 10.501 GHz effective/2 clock, 384-bit bus
    // => roughly 1008 GB/s peak bandwidth.
    #[test]
    fn test_device_info_bandwidth_calculation() {
        let info = CudaDeviceInfo {
            ordinal: 0,
            name: [0u8; 256],
            total_memory: 24 * 1024 * 1024 * 1024,
            sm_count: 128,
            compute_major: 8,
            compute_minor: 9,
            memory_clock_khz: 10501000,
            memory_bus_width: 384,
        };
        let bandwidth = info.memory_bandwidth_gbps();
        assert!(bandwidth > 900.0 && bandwidth < 1100.0);
    }
    // name_str must stop at the first NUL terminator.
    #[test]
    fn test_device_info_name_str() {
        let mut info = CudaDeviceInfo {
            ordinal: 0,
            name: [0u8; 256],
            total_memory: 0,
            sm_count: 0,
            compute_major: 0,
            compute_minor: 0,
            memory_clock_khz: 0,
            memory_bus_width: 0,
        };
        let name = b"Test GPU";
        info.name[..name.len()].copy_from_slice(name);
        assert_eq!(info.name_str(), "Test GPU");
    }
}