use alloc::collections::BTreeMap;
use alloc::vec::Vec;
use lazy_static::lazy_static;
use spin::Mutex;
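/// Compute backend used to execute compression kernels. `Cpu` is the
/// software fallback used when no GPU backend is available.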
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuBackend {
Cpu,
Cuda,
Rocm,
Vulkan,
}
impl GpuBackend {
pub fn name(&self) -> &'static str {
match self {
GpuBackend::Cpu => "CPU (fallback)",
GpuBackend::Cuda => "NVIDIA CUDA",
GpuBackend::Rocm => "AMD ROCm",
GpuBackend::Vulkan => "Vulkan Compute",
}
}
pub fn is_gpu(&self) -> bool {
!matches!(self, GpuBackend::Cpu)
}
}
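/// Compression algorithms that can be offloaded to the GPU.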
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuCompressAlgo {
Lz4,
Zstd,
}
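/// Static description of a compression-capable device: identity, backend,
/// compute resources, and which algorithms its kernels support.
/// `memory_bandwidth` is in GB/s (as used by the reference profiles below)
/// and `max_buffer_size` is in bytes.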
#[derive(Debug, Clone)]
pub struct GpuCapabilities {
pub device_id: u32,
pub name: &'static str,
pub backend: GpuBackend,
pub compute_units: u32,
pub memory_bandwidth: u32,
pub max_buffer_size: u64,
pub supports_lz4: bool,
pub supports_zstd: bool,
}
impl GpuCapabilities {
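/// Default profile: CPU-fallback backend, 16 GiB maximum buffer, and both
/// LZ4 and Zstd enabled.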
pub fn new(
device_id: u32,
name: &'static str,
compute_units: u32,
memory_bandwidth: u32,
) -> Self {
Self {
device_id,
name,
backend: GpuBackend::Cpu,
compute_units,
memory_bandwidth,
max_buffer_size: 16 * 1024 * 1024 * 1024,
supports_lz4: true,
supports_zstd: true,
}
}
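/// Capabilities reported by a real CUDA device; only available with the
/// `gpu-compute` feature.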
#[cfg(feature = "gpu-compute")]
pub fn from_cuda(
device_id: u32,
name: &'static str,
compute_units: u32,
memory_bandwidth: u32,
max_buffer_size: u64,
) -> Self {
Self {
device_id,
name,
backend: GpuBackend::Cuda,
compute_units,
memory_bandwidth,
max_buffer_size,
supports_lz4: true,
supports_zstd: true,
}
}
pub fn supports_algo(&self, algo: GpuCompressAlgo) -> bool {
match algo {
GpuCompressAlgo::Lz4 => self.supports_lz4,
GpuCompressAlgo::Zstd => self.supports_zstd,
}
}
pub fn is_real_gpu(&self) -> bool {
self.backend.is_gpu()
}
}
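/// A compression request queued on the device; `submitted` records the
/// caller-supplied timestamp at submission time.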
#[derive(Debug, Clone)]
pub struct GpuCompressCommand {
pub cmd_id: u64,
pub algo: GpuCompressAlgo,
pub input_size: u64,
pub output_size: u64,
pub level: u8,
pub submitted: u64,
}
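/// Outcome of a completed compression command, including the achieved
/// ratio and the GPU/total execution times in microseconds.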
#[derive(Debug, Clone)]
pub struct GpuCompressResult {
pub cmd_id: u64,
pub compressed_size: u64,
pub ratio: f32,
pub gpu_time_us: u64,
pub total_time_us: u64,
}
impl GpuCompressResult {
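/// Effective input throughput in gigabytes per second.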
pub fn throughput_gbps(&self, input_size: u64) -> f32 {
if self.total_time_us == 0 {
return 0.0;
}
(input_size as f64 / (self.total_time_us as f64 / 1_000_000.0) / 1e9) as f32
}
pub fn speedup_vs_cpu(&self, input_size: u64, algo: GpuCompressAlgo) -> f32 {
// Convert GB/s to MB/s so it can be compared with the CPU baselines below.
let gpu_throughput = self.throughput_gbps(input_size) * 1000.0;
// Rough single-threaded CPU baselines in MB/s.
let cpu_throughput = match algo {
GpuCompressAlgo::Lz4 => 500.0,
GpuCompressAlgo::Zstd => 200.0,
};
gpu_throughput / cpu_throughput
}
}
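/// Running counters for all compression activity, covering both GPU
/// offloads and CPU fallbacks.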
#[derive(Debug, Clone, Default)]
pub struct GpuCompressStats {
pub total_ops: u64,
pub gpu_ops: u64,
pub cpu_fallback: u64,
pub total_bytes: u64,
pub compressed_bytes: u64,
pub total_gpu_time_us: u64,
pub total_time_us: u64,
}
impl GpuCompressStats {
pub fn offload_ratio(&self) -> f32 {
if self.total_ops == 0 {
return 0.0;
}
self.gpu_ops as f32 / self.total_ops as f32
}
pub fn avg_throughput_gbps(&self) -> f32 {
if self.total_time_us == 0 {
return 0.0;
}
(self.total_bytes as f64 / (self.total_time_us as f64 / 1_000_000.0) / 1e9) as f32
}
pub fn avg_compression_ratio(&self) -> f32 {
if self.compressed_bytes == 0 {
return 1.0;
}
self.total_bytes as f32 / self.compressed_bytes as f32
}
pub fn avg_speedup(&self) -> f32 {
// Weighted CPU baseline in MB/s, assuming a 70% LZ4 / 30% Zstd workload mix.
let cpu_throughput = 0.7 * 500.0 + 0.3 * 200.0;
let gpu_throughput = self.avg_throughput_gbps() * 1000.0;
if cpu_throughput == 0.0 {
return 1.0;
}
gpu_throughput / cpu_throughput
}
}
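/// Tracks the registered device, in-flight commands, and aggregate
/// statistics. Commands stay in `pending` until `complete` is called.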
pub struct GpuCompressManager {
capabilities: Option<GpuCapabilities>,
pending: BTreeMap<u64, GpuCompressCommand>,
next_cmd_id: u64,
stats: GpuCompressStats,
}
impl Default for GpuCompressManager {
fn default() -> Self {
Self::new()
}
}
impl GpuCompressManager {
pub fn new() -> Self {
Self {
capabilities: None,
pending: BTreeMap::new(),
next_cmd_id: 1,
stats: GpuCompressStats::default(),
}
}
pub fn register_device(&mut self, caps: GpuCapabilities) {
self.capabilities = Some(caps);
}
pub fn is_available(&self) -> bool {
self.capabilities.is_some()
}
pub fn submit(
&mut self,
algo: GpuCompressAlgo,
input_size: u64,
level: u8,
timestamp: u64,
) -> Option<u64> {
if let Some(caps) = &self.capabilities {
if !caps.supports_algo(algo) {
self.stats.cpu_fallback += 1;
return None;
}
if input_size > caps.max_buffer_size {
self.stats.cpu_fallback += 1;
return None;
}
let cmd_id = self.next_cmd_id;
self.next_cmd_id += 1;
let cmd = GpuCompressCommand {
cmd_id,
algo,
input_size,
output_size: input_size,
level,
submitted: timestamp,
};
self.pending.insert(cmd_id, cmd);
self.stats.gpu_ops += 1;
self.stats.total_ops += 1;
Some(cmd_id)
} else {
self.stats.cpu_fallback += 1;
self.stats.total_ops += 1;
None
}
}
pub fn complete(
&mut self,
cmd_id: u64,
compressed_size: u64,
current_time: u64,
) -> Option<GpuCompressResult> {
if let Some(cmd) = self.pending.remove(&cmd_id) {
// Simulated timings in microseconds: estimate a CPU baseline from the
// algorithm, model the GPU kernel as ~10x faster, and charge a transfer
// cost in each direction.
let cpu_time = match cmd.algo {
GpuCompressAlgo::Lz4 => cmd.input_size / 500_000,
GpuCompressAlgo::Zstd => cmd.input_size / 200_000,
};
let gpu_time = cpu_time / 10;
let transfer_overhead = cmd.input_size / 50_000_000;
let total_time = gpu_time + transfer_overhead * 2;
let result = GpuCompressResult {
cmd_id,
compressed_size,
ratio: cmd.input_size as f32 / compressed_size as f32,
gpu_time_us: gpu_time,
total_time_us: total_time.max(1),
};
self.stats.total_bytes += cmd.input_size;
self.stats.compressed_bytes += compressed_size;
self.stats.total_gpu_time_us += gpu_time;
self.stats.total_time_us += total_time;
Some(result)
} else {
None
}
}
pub fn stats(&self) -> GpuCompressStats {
self.stats.clone()
}
pub fn capabilities(&self) -> Option<&GpuCapabilities> {
self.capabilities.as_ref()
}
}
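// Global engine instance shared by the static `GpuCompressEngine` facade.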
lazy_static! {
static ref GPU_COMPRESS_ENGINE: Mutex<GpuCompressManager> =
Mutex::new(GpuCompressManager::new());
}
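/// Stateless facade over the global `GPU_COMPRESS_ENGINE`; each method
/// takes the spinlock and delegates to `GpuCompressManager`.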
pub struct GpuCompressEngine;
impl GpuCompressEngine {
pub fn register_device(caps: GpuCapabilities) {
let mut engine = GPU_COMPRESS_ENGINE.lock();
engine.register_device(caps);
}
pub fn is_available() -> bool {
let engine = GPU_COMPRESS_ENGINE.lock();
engine.is_available()
}
pub fn submit(
algo: GpuCompressAlgo,
input_size: u64,
level: u8,
timestamp: u64,
) -> Option<u64> {
let mut engine = GPU_COMPRESS_ENGINE.lock();
engine.submit(algo, input_size, level, timestamp)
}
pub fn complete(
cmd_id: u64,
compressed_size: u64,
current_time: u64,
) -> Option<GpuCompressResult> {
let mut engine = GPU_COMPRESS_ENGINE.lock();
engine.complete(cmd_id, compressed_size, current_time)
}
pub fn stats() -> GpuCompressStats {
let engine = GPU_COMPRESS_ENGINE.lock();
engine.stats()
}
pub fn capabilities() -> Option<GpuCapabilities> {
let engine = GPU_COMPRESS_ENGINE.lock();
engine.capabilities().cloned()
}
}
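/// Reference capability profile for an NVIDIA RTX 4090 (128 SMs,
/// 1008 GB/s). Built with the default constructor, so the backend is
/// reported as CPU fallback until a real backend is registered.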
pub fn create_nvidia_rtx4090() -> GpuCapabilities {
GpuCapabilities::new(
0,
"NVIDIA RTX 4090",
128,
1008,
)
}
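/// Reference capability profile for an AMD MI300X (304 compute units,
/// 5300 GB/s), likewise using the default CPU-fallback backend.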
pub fn create_amd_mi300x() -> GpuCapabilities {
GpuCapabilities::new(
1,
"AMD MI300X",
304,
5300,
)
}
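/// Low-level interface a GPU driver implements to expose device memory
/// management and compression kernels. Addresses are opaque device
/// pointers; the compression entry points return the compressed size,
/// or 0 on failure.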
pub trait GpuComputeProvider: Send + Sync {
fn allocate(&self, size: usize) -> Option<u64>;
fn free(&self, device_addr: u64);
fn copy_to_device(&self, device_addr: u64, data: &[u8]) -> bool;
fn copy_from_device(&self, device_addr: u64, data: &mut [u8]) -> bool;
fn compress_lz4(
&self,
input_addr: u64,
input_size: usize,
output_addr: u64,
output_size: usize,
) -> usize;
fn compress_zstd(
&self,
input_addr: u64,
input_size: usize,
output_addr: u64,
output_size: usize,
level: u8,
) -> usize;
fn synchronize(&self);
}
#[cfg(feature = "gpu-compute")]
static GPU_COMPUTE_PROVIDER: spin::Once<&'static dyn GpuComputeProvider> = spin::Once::new();
#[cfg(feature = "gpu-compute")]
pub fn register_compute_provider(provider: &'static dyn GpuComputeProvider) {
GPU_COMPUTE_PROVIDER.call_once(|| provider);
}
#[cfg(feature = "gpu-compute")]
pub fn get_compute_provider() -> Option<&'static dyn GpuComputeProvider> {
GPU_COMPUTE_PROVIDER.get().copied()
}
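/// Compress `input` on the registered compute provider: allocate device
/// buffers, copy the input in, run the kernel, and copy the result back.
/// Returns `None` if no provider is registered or any step fails.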
#[cfg(feature = "gpu-compute")]
pub fn gpu_compress(algo: GpuCompressAlgo, input: &[u8], level: u8) -> Option<Vec<u8>> {
let provider = get_compute_provider()?;
let input_addr = provider.allocate(input.len())?;
// Reserve some headroom for incompressible inputs.
let output_size = input.len() + 1024;
let output_addr = match provider.allocate(output_size) {
Some(addr) => addr,
None => {
// Don't leak the input buffer if the output allocation fails.
provider.free(input_addr);
return None;
}
};
if !provider.copy_to_device(input_addr, input) {
provider.free(input_addr);
provider.free(output_addr);
return None;
}
let compressed_size = match algo {
GpuCompressAlgo::Lz4 => {
provider.compress_lz4(input_addr, input.len(), output_addr, output_size)
}
GpuCompressAlgo::Zstd => {
provider.compress_zstd(input_addr, input.len(), output_addr, output_size, level)
}
};
if compressed_size == 0 {
provider.free(input_addr);
provider.free(output_addr);
return None;
}
let mut output = alloc::vec![0u8; compressed_size];
if !provider.copy_from_device(output_addr, &mut output) {
provider.free(input_addr);
provider.free(output_addr);
return None;
}
provider.free(input_addr);
provider.free(output_addr);
Some(output)
}
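/// Compress `input`, preferring the GPU path when the `gpu-compute`
/// feature is enabled and a provider is registered, and otherwise falling
/// back to CPU implementations (LZ4 via `lz4_flex`; Zstd via the `zstd`
/// crate when `std` is available, else LZ4 as a stand-in).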
pub fn compress(algo: GpuCompressAlgo, input: &[u8], level: u8) -> Vec<u8> {
#[cfg(feature = "gpu-compute")]
if let Some(result) = gpu_compress(algo, input, level) {
return result;
}
match algo {
GpuCompressAlgo::Lz4 => lz4_flex::compress_prepend_size(input),
GpuCompressAlgo::Zstd => {
#[cfg(feature = "std")]
{
let mut encoder = zstd::stream::Encoder::new(Vec::new(), level as i32).unwrap();
std::io::copy(&mut std::io::Cursor::new(input), &mut encoder).unwrap();
encoder.finish().unwrap()
}
#[cfg(not(feature = "std"))]
{
lz4_flex::compress_prepend_size(input)
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_gpu_capabilities() {
let caps = create_nvidia_rtx4090();
assert_eq!(caps.name, "NVIDIA RTX 4090");
assert_eq!(caps.compute_units, 128);
assert_eq!(caps.memory_bandwidth, 1008);
assert!(caps.supports_lz4);
assert!(caps.supports_zstd);
}
#[test]
fn test_device_registration() {
let mut mgr = GpuCompressManager::new();
assert!(!mgr.is_available());
mgr.register_device(create_nvidia_rtx4090());
assert!(mgr.is_available());
}
#[test]
fn test_submit_command() {
let mut mgr = GpuCompressManager::new();
mgr.register_device(create_nvidia_rtx4090());
let cmd_id = mgr.submit(GpuCompressAlgo::Lz4, 1_000_000, 0, 0);
assert!(cmd_id.is_some());
let stats = mgr.stats();
assert_eq!(stats.gpu_ops, 1);
assert_eq!(stats.total_ops, 1);
}
#[test]
fn test_cpu_fallback() {
let mut mgr = GpuCompressManager::new();
let cmd_id = mgr.submit(GpuCompressAlgo::Lz4, 1_000_000, 0, 0);
assert!(cmd_id.is_none());
let stats = mgr.stats();
assert_eq!(stats.cpu_fallback, 1);
assert_eq!(stats.gpu_ops, 0);
}
#[test]
fn test_complete_command() {
let mut mgr = GpuCompressManager::new();
mgr.register_device(create_nvidia_rtx4090());
let cmd_id = mgr
.submit(GpuCompressAlgo::Lz4, 10_000_000, 0, 0)
.expect("test: operation should succeed");
let result = mgr
.complete(cmd_id, 5_000_000, 100)
.expect("test: operation should succeed");
assert_eq!(result.cmd_id, cmd_id);
assert_eq!(result.compressed_size, 5_000_000);
assert_eq!(result.ratio, 2.0);
}
#[test]
fn test_throughput_calculation() {
let result = GpuCompressResult {
cmd_id: 1,
compressed_size: 5_000_000,
ratio: 2.0,
gpu_time_us: 1000,
total_time_us: 2000,
};
let throughput = result.throughput_gbps(10_000_000);
assert!((throughput - 5.0).abs() < 0.1);
}
#[test]
fn test_speedup_calculation() {
let result = GpuCompressResult {
cmd_id: 1,
compressed_size: 5_000_000,
ratio: 2.0,
gpu_time_us: 1000,
total_time_us: 2000,
};
let speedup = result.speedup_vs_cpu(10_000_000, GpuCompressAlgo::Lz4);
assert!((speedup - 10.0).abs() < 0.5);
}
#[test]
fn test_statistics() {
let mut mgr = GpuCompressManager::new();
mgr.register_device(create_nvidia_rtx4090());
for i in 0..10 {
let cmd_id = mgr
.submit(GpuCompressAlgo::Lz4, 1_000_000, 0, i)
.expect("test: operation should succeed");
mgr.complete(cmd_id, 500_000, i + 100);
}
let stats = mgr.stats();
assert_eq!(stats.total_ops, 10);
assert_eq!(stats.gpu_ops, 10);
assert_eq!(stats.total_bytes, 10_000_000);
assert_eq!(stats.compressed_bytes, 5_000_000);
assert_eq!(stats.avg_compression_ratio(), 2.0);
}
#[test]
fn test_offload_ratio() {
let mut mgr = GpuCompressManager::new();
for _ in 0..5 {
mgr.submit(GpuCompressAlgo::Lz4, 1_000_000, 0, 0);
}
mgr.register_device(create_nvidia_rtx4090());
for _ in 0..15 {
mgr.submit(GpuCompressAlgo::Lz4, 1_000_000, 0, 0);
}
let stats = mgr.stats();
assert_eq!(stats.total_ops, 20);
assert_eq!(stats.gpu_ops, 15);
assert_eq!(stats.cpu_fallback, 5);
assert_eq!(stats.offload_ratio(), 0.75);
}
#[test]
fn test_large_buffer_fallback() {
let mut mgr = GpuCompressManager::new();
let mut caps = create_nvidia_rtx4090();
caps.max_buffer_size = 1_000_000;
mgr.register_device(caps);
let cmd_id = mgr.submit(GpuCompressAlgo::Lz4, 10_000_000, 0, 0);
assert!(cmd_id.is_none());
let stats = mgr.stats();
assert_eq!(stats.cpu_fallback, 1);
}
}