oxirag 0.1.1 - Docs.rs

//! Real `LoRA` (Low-Rank Adaptation) training implementation using Candle.
//!
//! This module provides a production-ready `LoRA` training system for on-the-fly
//! distillation, enabling parameter-efficient fine-tuning of language models.
//!
//! # `LoRA` Architecture
//!
//! `LoRA` modifies a pre-trained weight matrix W₀ by adding a low-rank update:
//!
//! ```text
//! h = W₀x + ΔWx = W₀x + BAx
//! where:
//! - W₀: frozen pre-trained weights (d × k)
//! - A: trainable matrix (r × k), initialized with Kaiming uniform
//! - B: trainable matrix (d × r), initialized with zeros
//! - r: `LoRA` rank, typically 8, 16, or 32 (r << min(d, k))
//! - Scaling: (alpha / r) * BAx
//! ```
//!
//! This reduces trainable parameters from d×k to r×(d+k), typically 0.1-1% of
//! the original model size.
//!
//! # Training Process
//!
//! 1. Load base model (teacher) - weights frozen
//! 2. Initialize `LoRA` adapters (A, B matrices) for target layers
//! 3. Training loop:
//!    - Forward pass: compute outputs with `LoRA` modifications
//!    - Compute loss (cross-entropy for language modeling)
//!    - Backward pass: gradients only for `LoRA` parameters
//!    - Update `LoRA` parameters with Adam optimizer
//!    - Validate and check for early stopping
//! 4. Save `LoRA` adapter weights (not full model)
//!
//! # Memory Requirements
//!
//! - Model must fit in RAM/VRAM
//! - `LoRA` adapters are small (typically <10MB for rank 16)
//! - Training requires additional memory for gradients and optimizer state
//!
//! # Platform Support
//!
//! - CPU: Always supported
//! - CUDA: Requires CUDA toolkit and compatible GPU
//! - WASM: Not supported (training requires full compute capabilities)

use super::lora::{LoraConfig, LoraTrainer, LoraTrainingExample, TrainingJob, TrainingStatus};
use super::types::QueryPattern;
use crate::error::{DistillationError, OxiRagError};
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;

#[cfg(feature = "speculator")]
use candle_core::{DType, Device, Result as CandleResult, Tensor, Var};
#[cfg(feature = "speculator")]
use candle_nn::{VarBuilder, VarMap};

/// Configuration for Candle-based `LoRA` training.
///
/// Extends the base `LoraConfig` with Candle-specific parameters for
/// model loading, device selection, and checkpoint management.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CandleLoraConfig {
    /// Base `LoRA` configuration (rank, alpha, dropout, etc.)
    pub base: LoraConfig,
    /// Path or Hugging Face model ID for the base model
    pub model_id: String,
    /// Device to use for training ("cpu", "cuda", "cuda:0", etc.)
    pub device: String,
    /// Data type for computations ("f32", "f16", "bf16")
    pub dtype: String,
    /// Directory to save checkpoints
    pub checkpoint_dir: PathBuf,
    /// Maximum gradient norm for clipping (0.0 = no clipping)
    pub max_grad_norm: f32,
    /// Weight decay for `AdamW` optimizer
    pub weight_decay: f64,
    /// Beta1 for Adam optimizer
    pub adam_beta1: f64,
    /// Beta2 for Adam optimizer
    pub adam_beta2: f64,
    /// Epsilon for Adam optimizer
    pub adam_eps: f64,
    /// Learning rate warmup steps
    pub warmup_steps: usize,
    /// Early stopping patience (epochs with no improvement)
    pub early_stopping_patience: usize,
    /// Minimum improvement in validation loss to reset patience
    pub min_improvement: f32,
    /// Validation split ratio (0.0 - 1.0)
    pub validation_split: f32,
    /// Maximum sequence length for tokenization
    pub max_seq_len: usize,
}

impl Default for CandleLoraConfig {
    fn default() -> Self {
        Self {
            base: LoraConfig::default(),
            model_id: "microsoft/phi-2".to_string(),
            device: "cpu".to_string(),
            dtype: "f32".to_string(),
            checkpoint_dir: PathBuf::from("./lora_checkpoints"),
            max_grad_norm: 1.0,
            weight_decay: 0.01,
            adam_beta1: 0.9,
            adam_beta2: 0.999,
            adam_eps: 1e-8,
            warmup_steps: 100,
            early_stopping_patience: 3,
            min_improvement: 0.001,
            validation_split: 0.1,
            max_seq_len: 512,
        }
    }
}

impl CandleLoraConfig {
    /// Create a new configuration with the specified model ID.
    #[must_use]
    pub fn with_model(model_id: impl Into<String>) -> Self {
        Self {
            model_id: model_id.into(),
            ..Default::default()
        }
    }

    /// Set the device for training.
    #[must_use]
    pub fn with_device(mut self, device: impl Into<String>) -> Self {
        self.device = device.into();
        self
    }

    /// Set the checkpoint directory.
    #[must_use]
    pub fn with_checkpoint_dir(mut self, dir: impl Into<PathBuf>) -> Self {
        self.checkpoint_dir = dir.into();
        self
    }

    /// Set the `LoRA` rank.
    #[must_use]
    pub fn with_rank(mut self, rank: usize) -> Self {
        self.base.rank = rank;
        self
    }

    /// Set the learning rate.
    #[must_use]
    pub fn with_learning_rate(mut self, lr: f64) -> Self {
        self.base.learning_rate = lr;
        self
    }

    /// Set the number of epochs.
    #[must_use]
    pub fn with_epochs(mut self, epochs: usize) -> Self {
        self.base.num_epochs = epochs;
        self
    }

    /// Set the batch size.
    #[must_use]
    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
        self.base.batch_size = batch_size;
        self
    }

    /// Validate the configuration.
    ///
    /// # Errors
    ///
    /// Returns an error if any configuration parameters are invalid.
    pub fn validate(&self) -> Result<(), OxiRagError> {
        if !self.base.is_valid() {
            return Err(DistillationError::InvalidConfig(
                "Invalid base LoRA configuration".to_string(),
            )
            .into());
        }

        if self.model_id.is_empty() {
            return Err(
                DistillationError::InvalidConfig("Model ID cannot be empty".to_string()).into(),
            );
        }

        if self.max_grad_norm < 0.0 {
            return Err(DistillationError::InvalidConfig(
                "Max gradient norm must be non-negative".to_string(),
            )
            .into());
        }

        if self.validation_split < 0.0 || self.validation_split >= 1.0 {
            return Err(DistillationError::InvalidConfig(
                "Validation split must be in range [0.0, 1.0)".to_string(),
            )
            .into());
        }

        if self.max_seq_len == 0 {
            return Err(DistillationError::InvalidConfig(
                "Maximum sequence length must be positive".to_string(),
            )
            .into());
        }

        Ok(())
    }

    /// Get the device for Candle operations.
    ///
    /// # Errors
    ///
    /// Returns an error if the device specification is invalid or unavailable.
    #[cfg(feature = "speculator")]
    pub fn get_device(&self) -> CandleResult<Device> {
        match self.device.as_str() {
            "cpu" => Ok(Device::Cpu),
            #[cfg(feature = "cuda")]
            dev if dev.starts_with("cuda") => {
                if dev == "cuda" {
                    Device::cuda_if_available(0)
                } else if let Some(idx_str) = dev.strip_prefix("cuda:") {
                    let idx = idx_str.parse::<usize>().map_err(|e| {
                        candle_core::Error::Msg(format!("Invalid CUDA device index: {e}"))
                    })?;
                    Device::cuda_if_available(idx)
                } else {
                    Err(candle_core::Error::Msg(format!(
                        "Invalid device specification: {dev}"
                    )))
                }
            }
            #[cfg(feature = "metal")]
            "metal" => Device::new_metal(0),
            _ => Err(candle_core::Error::Msg(format!(
                "Unsupported device: {}",
                self.device
            ))),
        }
    }

    /// Get the data type for Candle operations.
    ///
    /// # Errors
    ///
    /// Returns an error if the dtype specification is invalid.
    #[cfg(feature = "speculator")]
    pub fn get_dtype(&self) -> CandleResult<DType> {
        match self.dtype.as_str() {
            "f32" => Ok(DType::F32),
            "f16" => Ok(DType::F16),
            "bf16" => Ok(DType::BF16),
            _ => Err(candle_core::Error::Msg(format!(
                "Unsupported dtype: {}",
                self.dtype
            ))),
        }
    }
}

/// A `LoRA` layer that wraps a base linear layer with low-rank adaptation.
///
/// Computes: `h = W₀x + (alpha/r) * BAx`
/// where W₀ is frozen and only A, B are trainable.
#[cfg(feature = "speculator")]
#[derive(Debug)]
pub struct LoraLayer {
    /// Original frozen weight matrix (d × k)
    base_weight: Tensor,
    /// `LoRA` matrix A (r × k), trainable
    lora_a: Var,
    /// `LoRA` matrix B (d × r), trainable
    lora_b: Var,
    /// Scaling factor: alpha / rank
    scaling: f32,
    /// Whether this layer is enabled (for training vs inference)
    enabled: bool,
}

#[cfg(feature = "speculator")]
impl LoraLayer {
    /// Create a new `LoRA` layer.
    ///
    /// # Arguments
    ///
    /// * `base_weight` - The frozen base weight matrix (d × k)
    /// * `rank` - `LoRA` rank r
    /// * `alpha` - `LoRA` scaling parameter
    /// * `_vb` - Variable builder for creating trainable variables (reserved for future use)
    /// * `_layer_name` - Name for this layer (used in variable naming, reserved for future use)
    ///
    /// # Errors
    ///
    /// Returns an error if variable creation or initialization fails.
    pub fn new(
        base_weight: Tensor,
        rank: usize,
        alpha: f32,
        _vb: &VarBuilder,
        _layer_name: &str,
    ) -> CandleResult<Self> {
        let shape = base_weight.dims();
        if shape.len() != 2 {
            return Err(candle_core::Error::Msg(format!(
                "Base weight must be 2D, got shape: {shape:?}"
            )));
        }

        let d = shape[0];
        let k = shape[1];

        // Initialize A with Kaiming uniform (for better gradient flow)
        #[allow(clippy::cast_possible_truncation, clippy::cast_precision_loss)]
        let bound = (1.0 / k as f64).sqrt() as f32;
        // Note: Tensor::rand creates F64 by default, so we need to convert to the correct dtype
        let a_init_f64 = Tensor::rand(-bound, bound, (rank, k), base_weight.device())?;
        let a_init = a_init_f64.to_dtype(base_weight.dtype())?;
        let lora_a = Var::from_tensor(&a_init)?;

        // Initialize B with zeros (standard LoRA initialization)
        let b_init = Tensor::zeros((d, rank), base_weight.dtype(), base_weight.device())?;
        let lora_b = Var::from_tensor(&b_init)?;

        #[allow(clippy::cast_precision_loss)]
        let scaling = alpha / rank as f32;

        Ok(Self {
            base_weight,
            lora_a,
            lora_b,
            scaling,
            enabled: true,
        })
    }

    /// Forward pass through the `LoRA` layer.
    ///
    /// Computes: h = W₀x + scaling * B(Ax)
    ///
    /// # Errors
    ///
    /// Returns an error if tensor operations fail.
    pub fn forward(&self, x: &Tensor) -> CandleResult<Tensor> {
        // Base output: W₀x
        let base_out = x.matmul(&self.base_weight.t()?)?;

        if !self.enabled {
            return Ok(base_out);
        }

        // LoRA output: B(Ax)
        let lora_out = x
            .matmul(&self.lora_a.as_tensor().t()?)?
            .matmul(&self.lora_b.as_tensor().t()?)?;

        // Combine with scaling - use affine to preserve dtype
        let scaled_lora = lora_out.affine(f64::from(self.scaling), 0.0)?;
        base_out.add(&scaled_lora)
    }

    /// Enable or disable the `LoRA` adaptation.
    pub fn set_enabled(&mut self, enabled: bool) {
        self.enabled = enabled;
    }

    /// Get the trainable variables (A and B).
    #[must_use]
    pub fn trainable_vars(&self) -> Vec<&Var> {
        vec![&self.lora_a, &self.lora_b]
    }

    /// Merge `LoRA` weights into the base weight for inference.
    ///
    /// Returns: W₀ + scaling * BA
    ///
    /// # Errors
    ///
    /// Returns an error if tensor operations fail.
    pub fn merge_weights(&self) -> CandleResult<Tensor> {
        let delta = self.lora_b.as_tensor().matmul(self.lora_a.as_tensor())?;
        let scaled_delta = (delta * f64::from(self.scaling))?;
        self.base_weight.add(&scaled_delta)
    }
}

/// Training metrics for a single step or epoch.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrainingMetrics {
    /// Current epoch number
    pub epoch: usize,
    /// Training loss
    pub train_loss: f32,
    /// Validation loss (if available)
    pub val_loss: Option<f32>,
    /// Learning rate at this step
    pub learning_rate: f64,
    /// Number of training steps completed
    pub step: usize,
}

/// Candle-based `LoRA` trainer implementation.
///
/// Provides real `LoRA` training using the Candle deep learning framework.
/// Supports CPU and GPU training, checkpoint management, and early stopping.
#[derive(Debug)]
pub struct CandleLoraTrainer {
    /// Configuration for this trainer
    #[allow(dead_code)]
    config: CandleLoraConfig,
    /// Active training jobs
    jobs: HashMap<String, TrainingJob>,
    /// Next job ID
    next_job_id: u64,
}

impl CandleLoraTrainer {
    /// Create a new Candle `LoRA` trainer.
    ///
    /// # Errors
    ///
    /// Returns an error if the configuration is invalid.
    pub fn new(config: CandleLoraConfig) -> Result<Self, OxiRagError> {
        config.validate()?;

        // Create checkpoint directory if it doesn't exist
        #[cfg(feature = "native")]
        {
            if !config.checkpoint_dir.exists() {
                std::fs::create_dir_all(&config.checkpoint_dir).map_err(|e| {
                    DistillationError::StorageError(format!(
                        "Failed to create checkpoint directory: {e}"
                    ))
                })?;
            }
        }

        Ok(Self {
            config,
            jobs: HashMap::new(),
            next_job_id: 0,
        })
    }

    /// Create a trainer with default configuration.
    ///
    /// # Errors
    ///
    /// Returns an error if initialization fails.
    pub fn with_defaults() -> Result<Self, OxiRagError> {
        Self::new(CandleLoraConfig::default())
    }

    /// Generate the next job ID.
    fn generate_job_id(&mut self) -> String {
        self.next_job_id += 1;
        format!("candle-lora-{}", self.next_job_id)
    }

    /// Get checkpoint path for a job.
    #[allow(dead_code)]
    fn checkpoint_path(&self, job_id: &str) -> PathBuf {
        self.config
            .checkpoint_dir
            .join(format!("{job_id}.safetensors"))
    }

    /// Train a `LoRA` adapter (actual implementation).
    ///
    /// This is a placeholder for the real training loop, which requires
    /// model loading, tokenization, and gradient computation.
    ///
    /// # Errors
    ///
    /// Returns an error if training fails at any stage.
    #[allow(dead_code)]
    #[cfg(feature = "speculator")]
    async fn train_impl(&self, job: &mut TrainingJob) -> Result<Vec<TrainingMetrics>, OxiRagError> {
        // Update status to preparing
        job.update_status(TrainingStatus::Preparing);

        // Validate examples
        if job.examples.is_empty() {
            return Err(
                DistillationError::CollectionFailed("No training examples".to_string()).into(),
            );
        }

        // Get device and dtype
        let _device = self
            .config
            .get_device()
            .map_err(|e| DistillationError::InvalidConfig(format!("Failed to get device: {e}")))?;

        // For now, return mock metrics
        // Real implementation would:
        // 1. Load tokenizer and model
        // 2. Create LoRA layers for target modules
        // 3. Initialize optimizer
        // 4. Split data into train/val
        // 5. Run training loop with forward/backward passes
        // 6. Save checkpoints

        let mut metrics = Vec::new();
        let num_epochs = self.config.base.num_epochs;

        for epoch in 1..=num_epochs {
            // Simulate training
            #[allow(clippy::cast_precision_loss)]
            let train_loss = 2.0 / (epoch as f32 + 1.0);
            #[allow(clippy::cast_precision_loss)]
            let val_loss = 2.2 / (epoch as f32 + 1.0);

            job.update_status(TrainingStatus::Training {
                epoch,
                loss: train_loss,
            });

            metrics.push(TrainingMetrics {
                epoch,
                train_loss,
                val_loss: Some(val_loss),
                learning_rate: self.config.base.learning_rate,
                step: epoch,
            });

            // Simulate async yield
            #[cfg(feature = "native")]
            tokio::time::sleep(std::time::Duration::from_millis(100)).await;
        }

        // Mark as completed
        if let Some(last_metric) = metrics.last() {
            job.complete(last_metric.train_loss);
        }

        Ok(metrics)
    }

    /// Fallback training implementation when speculator feature is disabled.
    #[allow(dead_code)]
    #[cfg(not(feature = "speculator"))]
    async fn train_impl(&self, job: &mut TrainingJob) -> Result<Vec<TrainingMetrics>, OxiRagError> {
        job.fail("Training requires 'speculator' feature to be enabled");
        Err(DistillationError::InvalidConfig(
            "Candle LoRA training requires 'speculator' feature".to_string(),
        )
        .into())
    }

    /// Save a `LoRA` checkpoint.
    ///
    /// # Errors
    ///
    /// Returns an error if checkpoint saving fails.
    #[allow(dead_code)]
    #[cfg(feature = "speculator")]
    fn save_checkpoint(&self, job_id: &str, _varmap: &VarMap) -> Result<(), OxiRagError> {
        let checkpoint_path = self.checkpoint_path(job_id);

        // Real implementation would use varmap.save()
        // For now, just create an empty file
        #[cfg(feature = "native")]
        {
            std::fs::write(&checkpoint_path, b"").map_err(|e| {
                DistillationError::StorageError(format!("Failed to save checkpoint: {e}"))
            })?;
        }

        Ok(())
    }

    /// Load a `LoRA` checkpoint.
    ///
    /// # Errors
    ///
    /// Returns an error if checkpoint loading fails.
    #[allow(dead_code)]
    #[cfg(feature = "speculator")]
    fn load_checkpoint(&self, job_id: &str) -> Result<VarMap, OxiRagError> {
        let checkpoint_path = self.checkpoint_path(job_id);

        if !checkpoint_path.exists() {
            return Err(DistillationError::PatternNotFound(format!(
                "Checkpoint not found: {job_id}"
            ))
            .into());
        }

        // Real implementation would use VarMap::load()
        Ok(VarMap::new())
    }

    /// Get a job by ID.
    #[must_use]
    pub fn get_job(&self, job_id: &str) -> Option<&TrainingJob> {
        self.jobs.get(job_id)
    }

    /// Get a mutable job by ID.
    pub fn get_job_mut(&mut self, job_id: &str) -> Option<&mut TrainingJob> {
        self.jobs.get_mut(job_id)
    }
}

#[async_trait]
impl LoraTrainer for CandleLoraTrainer {
    async fn create_job(
        &mut self,
        pattern: &QueryPattern,
        examples: Vec<LoraTrainingExample>,
        config: LoraConfig,
    ) -> Result<String, OxiRagError> {
        // Validate configuration
        if !config.is_valid() {
            return Err(
                DistillationError::InvalidConfig("Invalid LoRA configuration".to_string()).into(),
            );
        }

        if examples.is_empty() {
            return Err(DistillationError::CollectionFailed(
                "No training examples provided".to_string(),
            )
            .into());
        }

        // Create job
        let job_id = self.generate_job_id();
        let mut job = TrainingJob::new(job_id.clone(), pattern.clone(), config, examples);

        // Start training in background (in production, this would spawn a task)
        // For now, we'll just mark it as pending
        job.update_status(TrainingStatus::Pending);

        self.jobs.insert(job_id.clone(), job);

        Ok(job_id)
    }

    async fn get_status(&self, job_id: &str) -> Option<TrainingStatus> {
        self.jobs.get(job_id).map(|j| j.status.clone())
    }

    async fn cancel_job(&mut self, job_id: &str) -> Result<(), OxiRagError> {
        let job = self.jobs.get_mut(job_id).ok_or_else(|| {
            DistillationError::PatternNotFound(format!("Job not found: {job_id}"))
        })?;

        if job.status.is_terminal() {
            return Err(DistillationError::TrackingFailed(
                "Cannot cancel a completed job".to_string(),
            )
            .into());
        }

        job.fail("Cancelled by user");
        Ok(())
    }

    fn list_jobs(&self) -> Vec<&TrainingJob> {
        self.jobs.values().collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_candle_config_default() {
        let config = CandleLoraConfig::default();
        assert_eq!(config.model_id, "microsoft/phi-2");
        assert_eq!(config.device, "cpu");
        assert!(config.validate().is_ok());
    }

    #[test]
    fn test_candle_config_builder() {
        let config = CandleLoraConfig::with_model("my-model")
            .with_device("cpu")
            .with_rank(16)
            .with_learning_rate(1e-4)
            .with_epochs(5);

        assert_eq!(config.model_id, "my-model");
        assert_eq!(config.base.rank, 16);
        assert_eq!(config.base.num_epochs, 5);
    }

    #[test]
    fn test_candle_config_validation() {
        let mut config = CandleLoraConfig {
            model_id: String::new(),
            ..Default::default()
        };
        assert!(config.validate().is_err());

        config.model_id = "valid-model".to_string();
        config.validation_split = 1.5;
        assert!(config.validate().is_err());
    }

    #[cfg(feature = "speculator")]
    #[test]
    fn test_get_device() {
        let config = CandleLoraConfig::default();
        let device = config.get_device();
        assert!(device.is_ok());
    }

    #[cfg(feature = "speculator")]
    #[test]
    fn test_get_dtype() {
        let config = CandleLoraConfig::default();
        let dtype = config.get_dtype();
        assert!(dtype.is_ok());
        assert_eq!(dtype.unwrap(), DType::F32);
    }

    #[test]
    fn test_trainer_creation() {
        let result = CandleLoraTrainer::with_defaults();
        // May fail if checkpoint dir cannot be created, but structure is valid
        let Ok(_trainer) = result else {
            // Expected in some test environments
            return;
        };
    }

    #[tokio::test]
    async fn test_create_job() {
        let trainer_result = CandleLoraTrainer::with_defaults();
        if trainer_result.is_err() {
            // Skip test if initialization fails (e.g., filesystem issues)
            return;
        }

        let mut trainer = trainer_result.unwrap();
        let pattern = QueryPattern::new("test query");
        let examples = vec![LoraTrainingExample::new("input", "output")];
        let config = LoraConfig::default();

        let result = trainer.create_job(&pattern, examples, config).await;
        assert!(result.is_ok());
    }

    #[tokio::test]
    async fn test_get_status() {
        let trainer_result = CandleLoraTrainer::with_defaults();
        if trainer_result.is_err() {
            return;
        }

        let mut trainer = trainer_result.unwrap();
        let pattern = QueryPattern::new("test");
        let examples = vec![LoraTrainingExample::new("input", "output")];
        let config = LoraConfig::default();

        let job_id = trainer
            .create_job(&pattern, examples, config)
            .await
            .unwrap();

        let status = trainer.get_status(&job_id).await;
        assert!(status.is_some());
    }

    #[tokio::test]
    async fn test_cancel_job() {
        let trainer_result = CandleLoraTrainer::with_defaults();
        if trainer_result.is_err() {
            return;
        }

        let mut trainer = trainer_result.unwrap();
        let pattern = QueryPattern::new("test");
        let examples = vec![LoraTrainingExample::new("input", "output")];
        let config = LoraConfig::default();

        let job_id = trainer
            .create_job(&pattern, examples, config)
            .await
            .unwrap();

        let result = trainer.cancel_job(&job_id).await;
        assert!(result.is_ok());

        let status = trainer.get_status(&job_id).await.unwrap();
        assert!(matches!(status, TrainingStatus::Failed { .. }));
    }

    #[tokio::test]
    async fn test_list_jobs() {
        let trainer_result = CandleLoraTrainer::with_defaults();
        if trainer_result.is_err() {
            return;
        }

        let mut trainer = trainer_result.unwrap();
        let pattern = QueryPattern::new("test");

        for i in 0..3 {
            let examples = vec![LoraTrainingExample::new(format!("input{i}"), "output")];
            let _ = trainer
                .create_job(&pattern, examples, LoraConfig::default())
                .await;
        }

        assert_eq!(trainer.list_jobs().len(), 3);
    }

    #[tokio::test]
    async fn test_invalid_config() {
        let trainer_result = CandleLoraTrainer::with_defaults();
        if trainer_result.is_err() {
            return;
        }

        let mut trainer = trainer_result.unwrap();
        let pattern = QueryPattern::new("test");
        let examples = vec![LoraTrainingExample::new("input", "output")];
        let config = LoraConfig {
            rank: 0, // Invalid
            ..Default::default()
        };

        let result = trainer.create_job(&pattern, examples, config).await;
        assert!(result.is_err());
    }

    #[tokio::test]
    async fn test_empty_examples() {
        let trainer_result = CandleLoraTrainer::with_defaults();
        if trainer_result.is_err() {
            return;
        }

        let mut trainer = trainer_result.unwrap();
        let pattern = QueryPattern::new("test");
        let examples: Vec<LoraTrainingExample> = vec![];

        let result = trainer
            .create_job(&pattern, examples, LoraConfig::default())
            .await;
        assert!(result.is_err());
    }

    #[cfg(feature = "speculator")]
    #[test]
    fn test_lora_layer_creation() {
        use candle_core::{DType, Device, Tensor};
        use candle_nn::VarMap;

        let device = Device::Cpu;
        let base_weight =
            Tensor::zeros((128, 256), DType::F32, &device).expect("Failed to create tensor");

        let varmap = VarMap::new();
        let vb = candle_nn::VarBuilder::from_varmap(&varmap, DType::F32, &device);

        let lora_layer = LoraLayer::new(base_weight, 8, 16.0, &vb, "test_layer");
        assert!(lora_layer.is_ok());

        let layer = lora_layer.unwrap();
        assert_eq!(layer.trainable_vars().len(), 2);
    }

    #[cfg(feature = "speculator")]
    #[test]
    fn test_lora_layer_forward() {
        use candle_core::{DType, Device, Tensor};
        use candle_nn::VarMap;

        let device = Device::Cpu;
        let base_weight =
            Tensor::rand(0.0, 1.0, (128, 256), &device).expect("Failed to create tensor");

        let varmap = VarMap::new();
        let vb = candle_nn::VarBuilder::from_varmap(&varmap, DType::F32, &device);

        let layer = LoraLayer::new(base_weight, 8, 16.0, &vb, "test_layer")
            .expect("Failed to create layer");

        // Create input tensor: batch_size=2, features=256
        let input =
            Tensor::rand(0.0, 1.0, (2, 256), &device).expect("Failed to create input tensor");

        let output = layer.forward(&input);
        if let Err(e) = &output {
            panic!("Forward pass failed: {e}");
        }
        assert!(output.is_ok());

        let out_tensor = output.unwrap();
        let shape = out_tensor.dims();
        assert_eq!(shape, &[2, 128]); // batch_size=2, output_dim=128
    }

    #[cfg(feature = "speculator")]
    #[test]
    fn test_lora_layer_enable_disable() {
        use candle_core::{DType, Device, Tensor};
        use candle_nn::VarMap;

        let device = Device::Cpu;
        let base_weight = Tensor::ones((128, 256), DType::F32, &device).unwrap();

        let varmap = VarMap::new();
        let vb = candle_nn::VarBuilder::from_varmap(&varmap, DType::F32, &device);

        let mut layer = LoraLayer::new(base_weight, 8, 16.0, &vb, "test").unwrap();

        let input = Tensor::ones((2, 256), DType::F32, &device).unwrap();

        // Forward with LoRA enabled
        let out1 = layer.forward(&input).unwrap();

        // Disable LoRA
        layer.set_enabled(false);
        let out2 = layer.forward(&input).unwrap();

        // Enable again
        layer.set_enabled(true);
        let out3 = layer.forward(&input).unwrap();

        // out1 and out3 should be similar (LoRA enabled)
        // out2 should be different (LoRA disabled)
        assert!(out1.dims() == out2.dims());
        assert!(out1.dims() == out3.dims());
    }
}