// lcpfs 2026.1.102

// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0
//
// QLoRA Compression Provider Interface
// Provides integration point for LunaOS libluna's QLoRA implementation.

//! # QLoRA Compression Provider
//!
//! This module provides an interface for QLoRA (Quantized Low-Rank Adaptation)
//! compression, which achieves 16-32x compression ratios for AI model weights
//! and embeddings while preserving semantic quality.
//!
//! ## Architecture
//!
//! LCPFS defines the `QLoraProvider` trait that LunaOS's libluna implements.
//! This allows the filesystem to automatically detect and compress AI/ML data
//! using QLoRA when running on LunaOS, while falling back to standard
//! compression on other platforms.
//!
//! ## How QLoRA Works
//!
//! QLoRA combines two techniques:
//! 1. **Quantization**: Reduces precision from FP32/FP16 to 4-bit or 8-bit
//! 2. **Low-Rank Adaptation**: Captures the essential structure of weight matrices
//!
//! This is particularly effective for:
//! - Transformer model weights
//! - Embedding vectors
//! - Neural network parameters
//! - Activation tensors
//!
//! ## LunaOS Integration
//!
//! On LunaOS, libluna registers its QLoRA provider at boot:
//!
//! ```rust,ignore
//! // In libluna initialization:
//! use lcpfs::lcpfs_qlora::{register_qlora_provider, QLoraProvider};
//!
//! struct LiblunaQLoraProvider { /* ... */ }
//!
//! impl QLoraProvider for LiblunaQLoraProvider {
//!     fn is_qlora_candidate(&self, data: &[u8]) -> bool {
//!         // Detect AI/embedding data via magic bytes, structure, or entropy
//!         // Returns true for: .safetensors, .gguf, raw FP16/FP32 tensors
//!         detect_tensor_data(data)
//!     }
//!
//!     fn compress(&self, data: &[u8], config: &QLoraConfig) -> Option<(Vec<u8>, QLoraMetadata)> {
//!         // Use CUDA/ROCm for GPU-accelerated quantization
//!         let (quantized, scales) = quantize_tensor(data, config.bits)?;
//!         let (lora_a, lora_b) = compute_low_rank_factors(&quantized, config.rank)?;
//!         // ... pack and return
//!     }
//!
//!     fn decompress(&self, compressed: &[u8], meta: &QLoraMetadata) -> Option<Vec<u8>> {
//!         // Reconstruct: dequantize(lora_a @ lora_b) * scale
//!     }
//! }
//!
//! static PROVIDER: LiblunaQLoraProvider = LiblunaQLoraProvider::new();
//!
//! pub fn init() {
//!     register_qlora_provider(&PROVIDER);
//! }
//! ```
//!
//! ## Compression Ratios
//!
//! | Original Format | QLoRA Bits | Typical Ratio |
//! |-----------------|------------|---------------|
//! | FP32            | 4-bit      | 32x           |
//! | FP32            | 8-bit      | 16x           |
//! | FP16            | 4-bit      | 16x           |
//! | FP16            | 8-bit      | 8x            |
//! | BF16            | 4-bit      | 16x           |
//!
//! ## Quality Preservation
//!
//! QLoRA is designed to preserve semantic quality:
//! - Cosine similarity > 0.99 for embeddings
//! - Perplexity increase < 0.5% for language models
//! - mAP drop < 0.1% for vision models
//!
//! The low-rank factors capture the principal components of weight matrices,
//! allowing reconstruction with minimal information loss.

use alloc::string::String;
use alloc::vec::Vec;

// ═══════════════════════════════════════════════════════════════════════════════
// QLORA METADATA
// ═══════════════════════════════════════════════════════════════════════════════

/// QLoRA compression metadata.
///
/// This metadata is stored alongside compressed data and is required
/// for decompression. It records the quantization parameters, the layout
/// of the original tensor and a checksum of the original bytes.
///
/// The binary form produced by [`QLoraMetadata::to_bytes`] does not include
/// the `extra` field; it is always `None` after [`QLoraMetadata::from_bytes`].
#[derive(Debug, Clone)]
pub struct QLoraMetadata {
    /// Original uncompressed size in bytes.
    pub original_size: u64,

    /// Quantization bit width (typically 4 or 8).
    ///
    /// - 4-bit: Maximum compression, slight quality loss
    /// - 8-bit: Balanced compression and quality
    pub bits: u8,

    /// LoRA rank (dimensionality of low-rank factors).
    ///
    /// Higher rank = better quality but larger size.
    /// Typical values: 8, 16, 32, 64, 128.
    pub rank: u16,

    /// Original tensor dimensions.
    ///
    /// Format: [dim0, dim1, ...] for multi-dimensional tensors.
    /// May be empty, in which case `is_valid` skips the size cross-check.
    pub dimensions: Vec<u32>,

    /// Original data type.
    pub dtype: TensorDtype,

    /// Per-group scale factors for dequantization.
    ///
    /// Quantization uses group-wise scaling to preserve dynamic range.
    /// Each group (typically 32-128 elements) has its own scale factor.
    pub scale_factors: Vec<f32>,

    /// Zero points for asymmetric quantization (optional).
    ///
    /// Used when the tensor has a non-zero mean.
    pub zero_points: Option<Vec<i8>>,

    /// Checksum of original data for integrity verification.
    pub original_checksum: u64,

    /// Compression timestamp (Unix epoch seconds).
    pub timestamp: u64,

    /// Additional metadata (e.g., model name, layer info).
    ///
    /// Not part of the serialized form written by `to_bytes`.
    pub extra: Option<String>,
}

impl QLoraMetadata {
    /// Magic bytes that open every serialized metadata record.
    const MAGIC: [u8; 4] = *b"QLOR";

    /// The only serialization format version this code reads and writes.
    const FORMAT_VERSION: u8 = 1;

    /// Create new metadata with minimal fields.
    ///
    /// All remaining fields start empty/zero and `dtype` defaults to
    /// `Float32`. The result does not pass `is_valid` until at least one
    /// scale factor has been added.
    pub fn new(original_size: u64, bits: u8, rank: u16) -> Self {
        Self {
            original_size,
            bits,
            rank,
            dimensions: Vec::new(),
            dtype: TensorDtype::Float32,
            scale_factors: Vec::new(),
            zero_points: None,
            original_checksum: 0,
            timestamp: 0,
            extra: None,
        }
    }

    /// Calculate the achieved compression ratio for a given compressed size.
    ///
    /// Returns `original_size / compressed_size`, or `0.0` when
    /// `compressed_size` is zero.
    pub fn compression_ratio(&self, compressed_size: usize) -> f64 {
        if compressed_size == 0 {
            return 0.0;
        }
        self.original_size as f64 / compressed_size as f64
    }

    /// Estimate expected compression ratio based on parameters.
    ///
    /// The estimate is the bit-width reduction (`dtype.bits() / bits`)
    /// scaled by a fixed 0.85 factor that approximates the storage overhead
    /// of the low-rank factors. The result is not finite when `bits` is 0.
    pub fn expected_ratio(&self) -> f64 {
        let original_bits = self.dtype.bits() as f64;
        let quantized_bits = self.bits as f64;

        // Base ratio from bit reduction
        let bit_ratio = original_bits / quantized_bits;

        // LoRA overhead reduction (approximate)
        // Full matrix: m*n elements
        // LoRA: m*r + r*n elements (where r << min(m,n))
        // For typical cases, this adds ~10-20% overhead
        bit_ratio * 0.85
    }

    /// Validate metadata consistency.
    ///
    /// Requires a non-zero `original_size`, a bit width of 4 or 8, a rank in
    /// `1..=1024` and at least one scale factor. When `dimensions` is
    /// non-empty, the product of the dimensions times the element size must
    /// equal `original_size`; dimensions whose byte size does not fit in a
    /// `u64` are rejected instead of overflowing.
    pub fn is_valid(&self) -> bool {
        // Check basic constraints
        if self.original_size == 0 {
            return false;
        }
        if self.bits != 4 && self.bits != 8 {
            return false;
        }
        if self.rank == 0 || self.rank > 1024 {
            return false;
        }
        if self.scale_factors.is_empty() {
            return false;
        }

        // Without dimensions there is nothing to cross-check the size against.
        if self.dimensions.is_empty() {
            return true;
        }

        // Checked arithmetic: dimensions may come from untrusted on-disk
        // data, so an overflowing product means "invalid", not a panic.
        let bytes_per_element = u64::from(self.dtype.bits()) / 8;
        let expected_size = self
            .dimensions
            .iter()
            .try_fold(bytes_per_element, |acc, &dim| acc.checked_mul(u64::from(dim)));

        expected_size == Some(self.original_size)
    }

    /// Serialize metadata to bytes.
    ///
    /// Layout (all integers little-endian): magic `"QLOR"`, version byte,
    /// `original_size: u64`, `bits: u8`, `rank: u16`, dtype tag `u8`,
    /// dimension count `u16` + `u32` dimensions, scale count `u32` + `f32`
    /// scales, zero-point flag `u8` (+ count `u32` + `i8` values when set),
    /// `original_checksum: u64`, `timestamp: u64`.
    ///
    /// `extra` is not written. Counts are narrowed with `as`, so more than
    /// `u16::MAX` dimensions or `u32::MAX` scale factors / zero points would
    /// produce a record whose counts do not match its contents.
    pub fn to_bytes(&self) -> Vec<u8> {
        let mut bytes = Vec::with_capacity(64 + self.scale_factors.len() * 4);

        // Header: magic + version
        bytes.extend_from_slice(&Self::MAGIC);
        bytes.push(Self::FORMAT_VERSION);

        // Core fields
        bytes.extend_from_slice(&self.original_size.to_le_bytes());
        bytes.push(self.bits);
        bytes.extend_from_slice(&self.rank.to_le_bytes());
        bytes.push(self.dtype as u8);

        // Dimensions
        bytes.extend_from_slice(&(self.dimensions.len() as u16).to_le_bytes());
        for &dim in &self.dimensions {
            bytes.extend_from_slice(&dim.to_le_bytes());
        }

        // Scale factors
        bytes.extend_from_slice(&(self.scale_factors.len() as u32).to_le_bytes());
        for &scale in &self.scale_factors {
            bytes.extend_from_slice(&scale.to_le_bytes());
        }

        // Zero points (optional)
        if let Some(ref zp) = self.zero_points {
            bytes.push(1);
            bytes.extend_from_slice(&(zp.len() as u32).to_le_bytes());
            bytes.extend(zp.iter().map(|&z| z as u8));
        } else {
            bytes.push(0);
        }

        // Checksum and timestamp
        bytes.extend_from_slice(&self.original_checksum.to_le_bytes());
        bytes.extend_from_slice(&self.timestamp.to_le_bytes());

        bytes
    }

    /// Deserialize metadata from bytes.
    ///
    /// Accepts the layout written by `to_bytes`; bytes after the timestamp
    /// are ignored. Returns `None` when the magic or version is wrong, the
    /// dtype tag is unknown, or the input is too short for the fields and
    /// element counts it declares. Malformed input never panics, and element
    /// counts are checked against the remaining input before any buffer is
    /// allocated.
    ///
    /// The returned metadata always has `extra == None`.
    pub fn from_bytes(bytes: &[u8]) -> Option<Self> {
        /// Split `n` bytes off the front of `input`, or `None` if fewer remain.
        fn take<'a>(input: &mut &'a [u8], n: usize) -> Option<&'a [u8]> {
            let current: &'a [u8] = *input;
            if n > current.len() {
                return None;
            }
            let (head, tail) = current.split_at(n);
            *input = tail;
            Some(head)
        }

        /// Read exactly `N` bytes off the front of `input` into an array.
        fn take_array<const N: usize>(input: &mut &[u8]) -> Option<[u8; N]> {
            let head = take(input, N)?;
            let mut out = [0u8; N];
            out.copy_from_slice(head);
            Some(out)
        }

        /// Read a single byte off the front of `input`.
        fn take_u8(input: &mut &[u8]) -> Option<u8> {
            take(input, 1).map(|b| b[0])
        }

        let mut rest = bytes;

        // Header: magic + version
        if take_array::<4>(&mut rest)? != Self::MAGIC {
            return None;
        }
        if take_u8(&mut rest)? != Self::FORMAT_VERSION {
            return None;
        }

        // Core fields
        let original_size = u64::from_le_bytes(take_array::<8>(&mut rest)?);
        let bits = take_u8(&mut rest)?;
        let rank = u16::from_le_bytes(take_array::<2>(&mut rest)?);
        let dtype = TensorDtype::from_u8(take_u8(&mut rest)?)?;

        // Dimensions: the payload length is validated before collecting.
        let dim_count = usize::from(u16::from_le_bytes(take_array::<2>(&mut rest)?));
        let dimensions: Vec<u32> = take(&mut rest, dim_count.checked_mul(4)?)?
            .chunks_exact(4)
            .map(|c| u32::from_le_bytes([c[0], c[1], c[2], c[3]]))
            .collect();

        // Scale factors
        let scale_count = u32::from_le_bytes(take_array::<4>(&mut rest)?) as usize;
        let scale_factors: Vec<f32> = take(&mut rest, scale_count.checked_mul(4)?)?
            .chunks_exact(4)
            .map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]]))
            .collect();

        // Zero points (present only when the flag byte is non-zero)
        let zero_points: Option<Vec<i8>> = if take_u8(&mut rest)? != 0 {
            let zp_count = u32::from_le_bytes(take_array::<4>(&mut rest)?) as usize;
            Some(take(&mut rest, zp_count)?.iter().map(|&b| b as i8).collect())
        } else {
            None
        };

        // Checksum and timestamp
        let original_checksum = u64::from_le_bytes(take_array::<8>(&mut rest)?);
        let timestamp = u64::from_le_bytes(take_array::<8>(&mut rest)?);

        Some(Self {
            original_size,
            bits,
            rank,
            dimensions,
            dtype,
            scale_factors,
            zero_points,
            original_checksum,
            timestamp,
            extra: None,
        })
    }
}

impl Default for QLoraMetadata {
    /// Metadata for zero bytes of data with 4-bit quantization and rank 16.
    ///
    /// The default value does not pass `is_valid` (its size is zero and it
    /// has no scale factors); it is a starting point to be filled in.
    fn default() -> Self {
        Self::new(0, 4, 16)
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// TENSOR DATA TYPES
// ═══════════════════════════════════════════════════════════════════════════════

/// Tensor data types supported by QLoRA.
///
/// Each variant carries an explicit `u8` discriminant; [`TensorDtype::from_u8`]
/// maps those values back to variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum TensorDtype {
    /// 32-bit floating point (IEEE 754).
    Float32 = 0,
    /// 16-bit floating point (IEEE 754 half precision).
    Float16 = 1,
    /// 16-bit brain floating point (truncated FP32).
    BFloat16 = 2,
    /// 64-bit floating point (IEEE 754 double precision).
    Float64 = 3,
    /// 8-bit floating point (E4M3 format).
    Float8E4M3 = 4,
    /// 8-bit floating point (E5M2 format).
    Float8E5M2 = 5,
}

impl TensorDtype {
    /// Width of one element of this type, in bits.
    pub const fn bits(&self) -> u8 {
        match *self {
            Self::Float64 => 64,
            Self::Float32 => 32,
            Self::Float16 | Self::BFloat16 => 16,
            Self::Float8E4M3 | Self::Float8E5M2 => 8,
        }
    }

    /// Width of one element of this type, in whole bytes.
    pub const fn bytes(&self) -> u8 {
        // Every supported width is a multiple of eight bits.
        self.bits() >> 3
    }

    /// Map a raw discriminant back to its variant.
    ///
    /// Returns `None` for any value that is not a declared discriminant.
    pub fn from_u8(value: u8) -> Option<Self> {
        // Indexed by discriminant, so the order must match the enum.
        const BY_TAG: [TensorDtype; 6] = [
            TensorDtype::Float32,
            TensorDtype::Float16,
            TensorDtype::BFloat16,
            TensorDtype::Float64,
            TensorDtype::Float8E4M3,
            TensorDtype::Float8E5M2,
        ];
        BY_TAG.get(usize::from(value)).copied()
    }

    /// Lowercase, human-readable name of the type.
    pub const fn name(&self) -> &'static str {
        match *self {
            Self::Float64 => "float64",
            Self::Float32 => "float32",
            Self::Float16 => "float16",
            Self::BFloat16 => "bfloat16",
            Self::Float8E4M3 => "float8_e4m3",
            Self::Float8E5M2 => "float8_e5m2",
        }
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// QLORA CONFIGURATION
// ═══════════════════════════════════════════════════════════════════════════════

/// QLoRA compression configuration.
///
/// Three presets are provided (`aggressive`, `balanced`, `quality`);
/// `Default` is the balanced preset. Use `is_valid` to check a hand-built
/// configuration.
#[derive(Debug, Clone)]
pub struct QLoraConfig {
    /// Quantization bit width (4 or 8).
    pub bits: u8,

    /// LoRA rank for low-rank factorization.
    pub rank: u16,

    /// Group size for quantization (elements per scale factor).
    pub group_size: u32,

    /// Use symmetric quantization (vs asymmetric with zero points).
    pub symmetric: bool,

    /// Minimum size in bytes for QLoRA compression.
    /// Smaller data uses standard compression.
    pub min_size: usize,

    /// Maximum size in bytes for QLoRA compression.
    /// Larger data is split into chunks.
    pub max_size: usize,

    /// Quality threshold for compression (0.0-1.0).
    /// Higher values require better reconstruction quality.
    pub quality_threshold: f32,
}

impl QLoraConfig {
    /// Lower size bound shared by every preset (4 KiB).
    const PRESET_MIN_SIZE: usize = 4096;

    /// Upper size bound shared by every preset (1 GiB).
    const PRESET_MAX_SIZE: usize = 1024 * 1024 * 1024;

    /// Build a preset that differs from the others only in its tuning knobs.
    fn preset(
        bits: u8,
        rank: u16,
        group_size: u32,
        symmetric: bool,
        quality_threshold: f32,
    ) -> Self {
        Self {
            bits,
            rank,
            group_size,
            symmetric,
            min_size: Self::PRESET_MIN_SIZE,
            max_size: Self::PRESET_MAX_SIZE,
            quality_threshold,
        }
    }

    /// Preset favouring compression ratio: 4-bit, rank 8, groups of 128,
    /// symmetric quantization, quality threshold 0.95.
    pub fn aggressive() -> Self {
        Self::preset(4, 8, 128, true, 0.95)
    }

    /// Preset favouring reconstruction quality: 8-bit, rank 64, groups of 32,
    /// asymmetric quantization, quality threshold 0.99.
    pub fn quality() -> Self {
        Self::preset(8, 64, 32, false, 0.99)
    }

    /// Middle-ground preset: 4-bit, rank 32, groups of 64, symmetric
    /// quantization, quality threshold 0.97.
    pub fn balanced() -> Self {
        Self::preset(4, 32, 64, true, 0.97)
    }

    /// Check that every field is inside its accepted range.
    ///
    /// Accepted: `bits` of 4 or 8, `rank` in `1..=1024`, `group_size` in
    /// `8..=256`, a non-zero `min_size` no larger than `max_size`, and a
    /// `quality_threshold` in `(0.0, 1.0]` (NaN is rejected).
    pub fn is_valid(&self) -> bool {
        let bits_ok = matches!(self.bits, 4 | 8);
        let rank_ok = (1..=1024).contains(&self.rank);
        let group_ok = (8..=256).contains(&self.group_size);
        let sizes_ok = self.min_size > 0 && self.max_size >= self.min_size;
        // Written as two comparisons so that NaN fails both.
        let quality_ok = self.quality_threshold > 0.0 && self.quality_threshold <= 1.0;

        bits_ok && rank_ok && group_ok && sizes_ok && quality_ok
    }
}

impl Default for QLoraConfig {
    /// The default configuration is the balanced preset.
    fn default() -> Self {
        QLoraConfig::balanced()
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// QLORA PROVIDER TRAIT
// ═══════════════════════════════════════════════════════════════════════════════

/// QLoRA compression provider trait.
///
/// This trait defines the interface for QLoRA compression that LunaOS's
/// libluna implements. LCPFS provides a no-op fallback for non-LunaOS systems.
///
/// # Implementation Guide for libluna
///
/// ```rust,ignore
/// use lcpfs::lcpfs_qlora::{QLoraProvider, QLoraMetadata, QLoraConfig, TensorDtype};
///
/// pub struct LiblunaQLoraProvider {
///     cuda_context: Option<CudaContext>,
///     default_config: QLoraConfig,
/// }
///
/// impl QLoraProvider for LiblunaQLoraProvider {
///     fn is_qlora_candidate(&self, data: &[u8]) -> bool {
///         // Check for tensor file magic bytes
///         if data.len() < 8 {
///             return false;
///         }
///
///         // SafeTensors magic
///         if &data[0..8] == b"safetens" {
///             return true;
///         }
///
///         // GGUF magic
///         if &data[0..4] == b"GGUF" {
///             return true;
///         }
///
///         // Check entropy - AI weights have specific patterns
///         let entropy = calculate_entropy(data);
///         if entropy > 7.9 && entropy < 8.0 {
///             // High entropy but not random - likely FP weights
///             return true;
///         }
///
///         false
///     }
///
///     fn compress(&self, data: &[u8], config: &QLoraConfig)
///         -> Option<(Vec<u8>, QLoraMetadata)>
///     {
///         // Parse tensor format
///         let tensor = parse_tensor(data)?;
///
///         // Quantize to target bit width
///         let (quantized, scales, zeros) = quantize(
///             &tensor,
///             config.bits,
///             config.group_size,
///             config.symmetric,
///         )?;
///
///         // Compute low-rank factorization
///         let (lora_a, lora_b) = svd_low_rank(&quantized, config.rank)?;
///
///         // Verify quality
///         let reconstructed = reconstruct(&lora_a, &lora_b, &scales, &zeros);
///         let cosine_sim = cosine_similarity(&tensor, &reconstructed);
///         if cosine_sim < config.quality_threshold {
///             return None; // Quality too low
///         }
///
///         // Pack compressed data
///         let compressed = pack_qlora(&lora_a, &lora_b, &quantized);
///
///         let metadata = QLoraMetadata {
///             original_size: data.len() as u64,
///             bits: config.bits,
///             rank: config.rank,
///             dimensions: tensor.shape.clone(),
///             dtype: tensor.dtype,
///             scale_factors: scales,
///             zero_points: if config.symmetric { None } else { Some(zeros) },
///             original_checksum: blake3_hash(data),
///             timestamp: get_unix_time(),
///             extra: None,
///         };
///
///         Some((compressed, metadata))
///     }
///
///     fn decompress(&self, compressed: &[u8], metadata: &QLoraMetadata)
///         -> Option<Vec<u8>>
///     {
///         // Unpack compressed data
///         let (lora_a, lora_b, quantized) = unpack_qlora(compressed)?;
///
///         // Reconstruct full tensor
///         let reconstructed = matmul(&lora_a, &lora_b);
///
///         // Dequantize
///         let dequantized = dequantize(
///             &reconstructed,
///             &metadata.scale_factors,
///             metadata.zero_points.as_ref(),
///             metadata.dtype,
///         )?;
///
///         // Verify checksum
///         if blake3_hash(&dequantized) != metadata.original_checksum {
///             // Checksum mismatch - data corrupted
///             return None;
///         }
///
///         Some(dequantized)
///     }
///
///     fn config(&self) -> &QLoraConfig {
///         &self.default_config
///     }
///
///     fn name(&self) -> &str {
///         "libluna-qlora"
///     }
///
///     fn supports_gpu(&self) -> bool {
///         self.cuda_context.is_some()
///     }
/// }
/// ```
pub trait QLoraProvider: Send + Sync {
    /// Check if data is a candidate for QLoRA compression.
    ///
    /// Returns `true` if the data appears to be AI model weights,
    /// embeddings, or other tensor data suitable for QLoRA.
    ///
    /// # Detection Heuristics
    ///
    /// Implementations should check for:
    /// - Known file format magic bytes (SafeTensors, GGUF, etc.)
    /// - Entropy patterns characteristic of floating-point weights
    /// - Alignment and structure of the data
    ///
    /// # Arguments
    /// * `data` - Raw data to analyze
    ///
    /// # Returns
    /// `true` if QLoRA compression should be attempted.
    fn is_qlora_candidate(&self, data: &[u8]) -> bool;

    /// Compress data using QLoRA.
    ///
    /// # Arguments
    /// * `data` - Raw tensor data to compress
    /// * `config` - Compression configuration; callers may pass the
    ///   provider's own `config()` or a custom one
    ///
    /// # Returns
    /// `Some((compressed_data, metadata))` on success, `None` if:
    /// - Data is not compressible with QLoRA
    /// - Quality threshold cannot be met
    /// - An error occurred during compression
    ///
    /// The returned metadata must be kept with the compressed data; it is
    /// the second argument to `decompress`.
    fn compress(&self, data: &[u8], config: &QLoraConfig) -> Option<(Vec<u8>, QLoraMetadata)>;

    /// Decompress QLoRA-compressed data.
    ///
    /// # Arguments
    /// * `compressed` - Compressed data from `compress()`
    /// * `metadata` - Metadata from `compress()`
    ///
    /// # Returns
    /// `Some(original_data)` on success, `None` if:
    /// - Data is corrupted
    /// - Metadata is invalid
    /// - Checksum verification fails
    ///
    /// NOTE(review): the metadata docs describe quantization as incurring
    /// "slight quality loss", so the returned bytes are presumably a
    /// reconstruction rather than a bit-exact copy of the input. Confirm how
    /// implementations are expected to verify `original_checksum` in that case.
    fn decompress(&self, compressed: &[u8], metadata: &QLoraMetadata) -> Option<Vec<u8>>;

    /// Get the default configuration.
    ///
    /// This is the configuration the `compress_qlora` convenience function
    /// passes to `compress`.
    fn config(&self) -> &QLoraConfig;

    /// Get the provider name for logging/debugging.
    fn name(&self) -> &str;

    /// Check if GPU acceleration is available.
    fn supports_gpu(&self) -> bool;

    /// Get compression statistics.
    ///
    /// The default implementation returns all-zero statistics; providers
    /// that track usage should override it.
    fn statistics(&self) -> QLoraStatistics {
        QLoraStatistics::default()
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// STATISTICS
// ═══════════════════════════════════════════════════════════════════════════════

/// QLoRA compression statistics.
///
/// Plain counters; the `Default` value has every counter at zero.
#[derive(Debug, Clone, Default)]
pub struct QLoraStatistics {
    /// Total bytes compressed.
    pub bytes_in: u64,
    /// Total compressed bytes produced.
    pub bytes_out: u64,
    /// Number of successful compressions.
    pub compress_count: u64,
    /// Number of successful decompressions.
    pub decompress_count: u64,
    /// Number of candidates detected.
    pub candidates_detected: u64,
    /// Number of candidates rejected (quality threshold).
    pub candidates_rejected: u64,
    /// Total compression time in microseconds.
    pub compress_time_us: u64,
    /// Total decompression time in microseconds.
    pub decompress_time_us: u64,
}

impl QLoraStatistics {
    /// Megabytes (10^6 bytes) per second for `bytes` processed in
    /// `elapsed_us` microseconds; `0.0` when no time has been recorded.
    fn megabytes_per_second(bytes: u64, elapsed_us: u64) -> f64 {
        if elapsed_us == 0 {
            return 0.0;
        }
        let megabytes = bytes as f64 / 1_000_000.0;
        let seconds = elapsed_us as f64 / 1_000_000.0;
        megabytes / seconds
    }

    /// Overall compression ratio: `bytes_in / bytes_out`.
    ///
    /// Returns `0.0` while `bytes_out` is zero.
    pub fn compression_ratio(&self) -> f64 {
        match self.bytes_out {
            0 => 0.0,
            produced => self.bytes_in as f64 / produced as f64,
        }
    }

    /// Compression throughput in MB/s, computed from `bytes_in` and
    /// `compress_time_us`. Returns `0.0` when no time has been recorded.
    pub fn compress_throughput_mbps(&self) -> f64 {
        Self::megabytes_per_second(self.bytes_in, self.compress_time_us)
    }

    /// Decompression throughput in MB/s, computed from `bytes_in` and
    /// `decompress_time_us`. Returns `0.0` when no time has been recorded.
    ///
    /// NOTE(review): this uses `bytes_in` (bytes fed to compression) as the
    /// byte count; there is no counter for bytes produced by decompression.
    /// Confirm this approximation is intended.
    pub fn decompress_throughput_mbps(&self) -> f64 {
        Self::megabytes_per_second(self.bytes_in, self.decompress_time_us)
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// NO-OP PROVIDER (FALLBACK)
// ═══════════════════════════════════════════════════════════════════════════════

/// No-op QLoRA provider for non-LunaOS systems.
///
/// Reports no candidates and declines every compression and decompression
/// request, so LCPFS can build and run without libluna and fall back to its
/// standard compression algorithms.
pub struct NoOpQLoraProvider {
    /// Configuration returned by `config()`; this provider never compresses.
    config: QLoraConfig,
}

impl NoOpQLoraProvider {
    /// Placeholder configuration reported by the fallback provider.
    ///
    /// Spelled out as a constant so that `new` can be a `const fn`.
    const FALLBACK_CONFIG: QLoraConfig = QLoraConfig {
        bits: 4,
        rank: 16,
        group_size: 64,
        symmetric: true,
        min_size: 4096,
        max_size: 1024 * 1024 * 1024,
        quality_threshold: 0.97,
    };

    /// Create a new no-op provider.
    pub const fn new() -> Self {
        Self {
            config: Self::FALLBACK_CONFIG,
        }
    }
}

impl Default for NoOpQLoraProvider {
    /// Same as [`NoOpQLoraProvider::new`].
    fn default() -> Self {
        NoOpQLoraProvider::new()
    }
}

impl QLoraProvider for NoOpQLoraProvider {
    /// Always `false`: the fallback never claims any data.
    fn is_qlora_candidate(&self, _: &[u8]) -> bool {
        false
    }

    /// Always `None`: the fallback never compresses.
    fn compress(&self, _: &[u8], _: &QLoraConfig) -> Option<(Vec<u8>, QLoraMetadata)> {
        None
    }

    /// Always `None`: the fallback cannot decompress QLoRA data.
    fn decompress(&self, _: &[u8], _: &QLoraMetadata) -> Option<Vec<u8>> {
        None
    }

    /// The placeholder configuration created in `new`.
    fn config(&self) -> &QLoraConfig {
        &self.config
    }

    /// Fixed provider name, `"no-op"`.
    fn name(&self) -> &str {
        "no-op"
    }

    /// Always `false`: no GPU is involved.
    fn supports_gpu(&self) -> bool {
        false
    }
}

/// Global no-op provider instance, returned by `get_qlora_provider` when
/// nothing has been registered.
static NOOP_PROVIDER: NoOpQLoraProvider = NoOpQLoraProvider::new();

// ═══════════════════════════════════════════════════════════════════════════════
// GLOBAL PROVIDER REGISTRATION
// ═══════════════════════════════════════════════════════════════════════════════

/// Global QLoRA provider storage.
///
/// Uses `spin::Once` for one-time initialization: the first
/// `call_once` stores the provider and later calls leave it unchanged.
/// Per the `spin` crate's documentation, a caller racing with an in-progress
/// initialization spins until it completes rather than blocking on an OS lock.
static QLORA_PROVIDER: spin::Once<&'static dyn QLoraProvider> = spin::Once::new();

/// Register the global QLoRA provider.
///
/// Intended to be called once during system initialization (e.g., from
/// libluna's init function). The first registration wins; any later call
/// leaves the already-registered provider in place and is silently ignored.
///
/// # Arguments
/// * `provider` - The provider to register (must have static lifetime)
///
/// # Example
///
/// ```rust,ignore
/// // In libluna initialization:
/// static PROVIDER: LiblunaQLoraProvider = LiblunaQLoraProvider::new();
///
/// pub fn init() {
///     lcpfs::lcpfs_qlora::register_qlora_provider(&PROVIDER);
/// }
/// ```
///
/// # Thread Safety
///
/// Safe to call from multiple threads; when calls race, exactly one
/// provider ends up registered.
pub fn register_qlora_provider(provider: &'static dyn QLoraProvider) {
    // `call_once` hands back a reference to the stored value; not needed here.
    let _stored = QLORA_PROVIDER.call_once(move || provider);
}

/// Get the active QLoRA provider.
///
/// Yields the provider passed to `register_qlora_provider`, or the built-in
/// no-op fallback when nothing has been registered yet.
///
/// # Example
///
/// ```rust,ignore
/// let provider = lcpfs::lcpfs_qlora::get_qlora_provider();
/// if provider.is_qlora_candidate(data) {
///     if let Some((compressed, meta)) = provider.compress(data, provider.config()) {
///         // Use compressed data
///     }
/// }
/// ```
pub fn get_qlora_provider() -> &'static dyn QLoraProvider {
    if let Some(&registered) = QLORA_PROVIDER.get() {
        return registered;
    }
    &NOOP_PROVIDER
}

/// Check if a QLoRA provider has been registered.
///
/// Returns `true` once `register_qlora_provider` has stored a provider and
/// `false` while only the built-in no-op fallback is in use. The registered
/// provider itself is not inspected.
pub fn is_qlora_available() -> bool {
    matches!(QLORA_PROVIDER.get(), Some(_))
}

// ═══════════════════════════════════════════════════════════════════════════════
// CONVENIENCE FUNCTIONS
// ═══════════════════════════════════════════════════════════════════════════════

/// Ask the global provider whether `data` is a QLoRA candidate.
///
/// With no provider registered this goes to the no-op fallback, which
/// always answers `false`.
pub fn is_qlora_candidate(data: &[u8]) -> bool {
    let provider = get_qlora_provider();
    provider.is_qlora_candidate(data)
}

/// Compress `data` with the global provider and that provider's own
/// default configuration.
///
/// Returns whatever the provider's `compress` returns; `None` means the
/// data was not compressed.
pub fn compress_qlora(data: &[u8]) -> Option<(Vec<u8>, QLoraMetadata)> {
    // Look the provider up once so config and compression come from the same one.
    let active = get_qlora_provider();
    let defaults = active.config();
    active.compress(data, defaults)
}

/// Compress `data` with the global provider using a caller-supplied
/// configuration instead of the provider's default.
///
/// Returns whatever the provider's `compress` returns; `None` means the
/// data was not compressed.
pub fn compress_qlora_with_config(
    data: &[u8],
    config: &QLoraConfig,
) -> Option<(Vec<u8>, QLoraMetadata)> {
    let provider = get_qlora_provider();
    provider.compress(data, config)
}

/// Decompress QLoRA data with the global provider.
///
/// `metadata` must be the metadata produced together with `compressed`.
/// Returns whatever the provider's `decompress` returns; the no-op fallback
/// always returns `None`.
pub fn decompress_qlora(compressed: &[u8], metadata: &QLoraMetadata) -> Option<Vec<u8>> {
    let provider = get_qlora_provider();
    provider.decompress(compressed, metadata)
}

// ═══════════════════════════════════════════════════════════════════════════════
// TENSOR FORMAT DETECTION
// ═══════════════════════════════════════════════════════════════════════════════

/// Known tensor file formats, as recognised from a file's leading bytes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TensorFormat {
    /// SafeTensors format (.safetensors).
    SafeTensors,
    /// GGUF format (.gguf).
    Gguf,
    /// GGML format (.ggml).
    Ggml,
    /// PyTorch pickle format (.pt, .pth).
    PyTorch,
    /// NumPy format (.npy).
    NumPy,
    /// Raw floating-point data. Never produced by `detect`.
    Raw,
    /// Unknown format.
    Unknown,
}

impl TensorFormat {
    /// Detect format from file header.
    ///
    /// Inputs shorter than 8 bytes are always `Unknown`. Otherwise the
    /// checks run in this order and the first match wins:
    ///
    /// 1. SafeTensors: needs at least 16 bytes, a little-endian `u64` header
    ///    length in `1..1_000_000`, and `{` as the ninth byte.
    /// 2. GGUF: starts with `GGUF`.
    /// 3. GGML: starts with `GGML` or `lmgg`.
    /// 4. NumPy: starts with `\x93NUMPY`.
    /// 5. PyTorch: starts with `PK` (i.e. any ZIP-style header).
    pub fn detect(data: &[u8]) -> Self {
        if data.len() < 8 {
            return Self::Unknown;
        }

        // SafeTensors: u64 JSON-header length followed by the JSON itself.
        if data.len() >= 16 {
            let length_prefix: [u8; 8] = data[..8].try_into().unwrap_or([0; 8]);
            let header_len = u64::from_le_bytes(length_prefix);
            let plausible_len = (1..1_000_000).contains(&header_len);
            if plausible_len && data[8] == b'{' {
                return Self::SafeTensors;
            }
        }

        // Plain magic-prefix formats; the length guard above makes every
        // prefix comparison below well-defined.
        if data.starts_with(b"GGUF") {
            Self::Gguf
        } else if data.starts_with(b"GGML") || data.starts_with(b"lmgg") {
            Self::Ggml
        } else if data.starts_with(b"\x93NUMPY") {
            Self::NumPy
        } else if data.starts_with(b"PK") {
            Self::PyTorch
        } else {
            Self::Unknown
        }
    }

    /// Conventional file extension (with leading dot) for this format;
    /// empty for `Unknown`.
    pub const fn extension(&self) -> &'static str {
        match *self {
            Self::SafeTensors => ".safetensors",
            Self::Gguf => ".gguf",
            Self::Ggml => ".ggml",
            Self::PyTorch => ".pt",
            Self::NumPy => ".npy",
            Self::Raw => ".bin",
            Self::Unknown => "",
        }
    }
}

// ═══════════════════════════════════════════════════════════════════════════════
// TESTS
// ═══════════════════════════════════════════════════════════════════════════════

#[cfg(test)]
mod tests {
    use super::*;
    use alloc::vec;

    /// Bit widths reported by `TensorDtype::bits` for the floating-point variants.
    #[test]
    fn test_tensor_dtype_bits() {
        assert_eq!(TensorDtype::Float32.bits(), 32);
        assert_eq!(TensorDtype::Float16.bits(), 16);
        assert_eq!(TensorDtype::BFloat16.bits(), 16);
        assert_eq!(TensorDtype::Float64.bits(), 64);
        assert_eq!(TensorDtype::Float8E4M3.bits(), 8);
    }

    /// `TensorDtype::from_u8` maps known tags and rejects an unassigned one.
    #[test]
    fn test_tensor_dtype_from_u8() {
        assert_eq!(TensorDtype::from_u8(0), Some(TensorDtype::Float32));
        assert_eq!(TensorDtype::from_u8(1), Some(TensorDtype::Float16));
        assert_eq!(TensorDtype::from_u8(255), None);
    }

    /// `QLoraMetadata::new` stores its three arguments in the matching fields.
    #[test]
    fn test_qlora_metadata_new() {
        let meta = QLoraMetadata::new(1024, 4, 16);
        assert_eq!(meta.original_size, 1024);
        assert_eq!(meta.bits, 4);
        assert_eq!(meta.rank, 16);
    }

    /// The expected ratio for FP32 -> 4-bit stays inside the (6, 8) window.
    #[test]
    fn test_qlora_metadata_expected_ratio() {
        let mut meta = QLoraMetadata::new(1024, 4, 16);
        meta.dtype = TensorDtype::Float32;
        let ratio = meta.expected_ratio();
        // 32-bit -> 4-bit = 8x, with LoRA overhead ~6.8x
        assert!(ratio > 6.0 && ratio < 8.0);
    }

    /// Round-trips metadata through `to_bytes` / `from_bytes`.
    #[test]
    fn test_qlora_metadata_serialization() {
        let mut meta = QLoraMetadata::new(1024, 4, 16);
        meta.dimensions = vec![32, 32];
        meta.scale_factors = vec![1.0, 0.5, 0.25];
        meta.original_checksum = 0xDEADBEEF;
        meta.timestamp = 1234567890;

        let bytes = meta.to_bytes();
        let recovered = QLoraMetadata::from_bytes(&bytes).unwrap();

        assert_eq!(recovered.original_size, meta.original_size);
        assert_eq!(recovered.bits, meta.bits);
        assert_eq!(recovered.rank, meta.rank);
        assert_eq!(recovered.dimensions, meta.dimensions);
        // Only the number of scale factors is compared here, not their values.
        assert_eq!(recovered.scale_factors.len(), meta.scale_factors.len());
        assert_eq!(recovered.original_checksum, meta.original_checksum);
        assert_eq!(recovered.timestamp, meta.timestamp);
    }

    /// Each configuration preset is valid; aggressive and quality pin bits/rank.
    #[test]
    fn test_qlora_config_presets() {
        let aggressive = QLoraConfig::aggressive();
        assert_eq!(aggressive.bits, 4);
        assert_eq!(aggressive.rank, 8);
        assert!(aggressive.is_valid());

        let quality = QLoraConfig::quality();
        assert_eq!(quality.bits, 8);
        assert_eq!(quality.rank, 64);
        assert!(quality.is_valid());

        let balanced = QLoraConfig::balanced();
        assert!(balanced.is_valid());
    }

    /// The default config is valid; 3 bits or a zero rank make it invalid.
    #[test]
    fn test_qlora_config_validation() {
        let mut config = QLoraConfig::default();
        assert!(config.is_valid());

        config.bits = 3; // Invalid
        assert!(!config.is_valid());

        config.bits = 4;
        config.rank = 0; // Invalid
        assert!(!config.is_valid());
    }

    /// The no-op provider declines every request and reports no GPU support.
    #[test]
    fn test_noop_provider() {
        let provider = NoOpQLoraProvider::new();
        assert!(!provider.is_qlora_candidate(&[1, 2, 3, 4]));
        assert!(
            provider
                .compress(&[1, 2, 3, 4], &QLoraConfig::default())
                .is_none()
        );
        assert!(
            provider
                .decompress(&[], &QLoraMetadata::default())
                .is_none()
        );
        assert_eq!(provider.name(), "no-op");
        assert!(!provider.supports_gpu());
    }

    /// With nothing registered, the global accessor yields the no-op provider.
    #[test]
    fn test_global_provider_fallback() {
        // Without registration, should return no-op
        let provider = get_qlora_provider();
        assert_eq!(provider.name(), "no-op");
        assert!(!is_qlora_candidate(&[1, 2, 3, 4]));
    }

    /// Exercises every branch of `TensorFormat::detect`, including SafeTensors.
    #[test]
    fn test_tensor_format_detection() {
        // SafeTensors: 8-byte little-endian header length, then JSON starting with '{'
        let mut safetensors_data = [0u8; 16];
        safetensors_data[0] = 2;
        safetensors_data[8] = b'{';
        safetensors_data[9] = b'}';
        assert_eq!(
            TensorFormat::detect(&safetensors_data),
            TensorFormat::SafeTensors
        );

        // SafeTensors probe needs at least 16 bytes of input
        assert_eq!(
            TensorFormat::detect(&safetensors_data[..10]),
            TensorFormat::Unknown
        );

        // A zero header length is rejected even when the JSON brace is present
        safetensors_data[0] = 0;
        assert_eq!(
            TensorFormat::detect(&safetensors_data),
            TensorFormat::Unknown
        );

        // GGUF
        let gguf_data = b"GGUF\x00\x00\x00\x00";
        assert_eq!(TensorFormat::detect(gguf_data), TensorFormat::Gguf);

        // GGML
        let ggml_data = b"GGML\x00\x00\x00\x00";
        assert_eq!(TensorFormat::detect(ggml_data), TensorFormat::Ggml);

        // GGML with the byte-reversed tag
        let ggml_reversed = b"lmgg\x00\x00\x00\x00";
        assert_eq!(TensorFormat::detect(ggml_reversed), TensorFormat::Ggml);

        // NumPy
        let numpy_data = [0x93, b'N', b'U', b'M', b'P', b'Y', 0, 0];
        assert_eq!(TensorFormat::detect(&numpy_data), TensorFormat::NumPy);

        // PyTorch (ZIP)
        let pytorch_data = b"PK\x03\x04\x00\x00\x00\x00";
        assert_eq!(TensorFormat::detect(pytorch_data), TensorFormat::PyTorch);

        // Unknown
        let unknown_data = b"UNKNOWN_";
        assert_eq!(TensorFormat::detect(unknown_data), TensorFormat::Unknown);

        // Too short
        assert_eq!(TensorFormat::detect(b"short"), TensorFormat::Unknown);

        // A valid magic is still rejected when fewer than 8 bytes are supplied
        assert_eq!(TensorFormat::detect(b"GGUF"), TensorFormat::Unknown);
    }

    /// Every `TensorFormat` variant maps to its documented extension.
    #[test]
    fn test_tensor_format_extension() {
        assert_eq!(TensorFormat::SafeTensors.extension(), ".safetensors");
        assert_eq!(TensorFormat::Gguf.extension(), ".gguf");
        assert_eq!(TensorFormat::Ggml.extension(), ".ggml");
        assert_eq!(TensorFormat::PyTorch.extension(), ".pt");
        assert_eq!(TensorFormat::NumPy.extension(), ".npy");
        assert_eq!(TensorFormat::Raw.extension(), ".bin");
        assert_eq!(TensorFormat::Unknown.extension(), "");
    }

    /// Derived statistics (ratio and throughput) computed from fixed counters.
    #[test]
    fn test_statistics_calculations() {
        let stats = QLoraStatistics {
            bytes_in: 1_000_000,
            bytes_out: 100_000,
            compress_count: 10,
            decompress_count: 5,
            candidates_detected: 15,
            candidates_rejected: 5,
            compress_time_us: 1_000_000, // 1 second
            decompress_time_us: 500_000, // 0.5 seconds
        };

        assert!((stats.compression_ratio() - 10.0).abs() < 0.01);
        assert!((stats.compress_throughput_mbps() - 1.0).abs() < 0.01);
        assert!((stats.decompress_throughput_mbps() - 2.0).abs() < 0.01);
    }

    /// `QLoraMetadata::is_valid` rejects bad bits, zero rank and empty scales.
    #[test]
    fn test_metadata_validity() {
        let mut meta = QLoraMetadata::new(1024, 4, 16);
        meta.scale_factors = vec![1.0];
        assert!(meta.is_valid());

        // Invalid bits
        meta.bits = 3;
        assert!(!meta.is_valid());
        meta.bits = 4;

        // Invalid rank
        meta.rank = 0;
        assert!(!meta.is_valid());
        meta.rank = 16;

        // Empty scale factors
        meta.scale_factors.clear();
        assert!(!meta.is_valid());
    }

    /// The free-function wrappers decline everything under the no-op provider.
    #[test]
    fn test_convenience_functions() {
        // These use the global no-op provider
        assert!(!is_qlora_candidate(&[1, 2, 3, 4]));
        assert!(compress_qlora(&[1, 2, 3, 4]).is_none());
        assert!(compress_qlora_with_config(&[1, 2, 3, 4], &QLoraConfig::aggressive()).is_none());
        assert!(decompress_qlora(&[], &QLoraMetadata::default()).is_none());
    }
}