// ruvllm 2.2.0 - Docs.rs

#![allow(
    clippy::all,
    unused_imports,
    unused_variables,
    dead_code,
    unused_mut,
    unused_assignments,
    non_camel_case_types,
    clippy::approx_constant,
    unexpected_cfgs,
    unused_must_use,
    unused_parens
)]
//! Integration tests for LLM backends
//!
//! Tests the LLM backend infrastructure including model loading,
//! text generation, streaming, and embeddings extraction.

use ruvllm::{
    backends::{
        create_backend, DType, DeviceType, GenerateParams, LlmBackend, ModelArchitecture,
        ModelConfig, ModelInfo, Quantization, SpecialTokens, TokenStream, Tokenizer,
    },
    error::Result,
};
use std::sync::Arc;

/// Mock backend for testing without requiring actual model files
///
/// The `LlmBackend` implementation in this file never touches the filesystem:
/// "loading" only records metadata and flips a flag, and the inference methods
/// return canned values once that flag is set.
#[derive(Debug)]
struct MockBackend {
    /// Metadata recorded by `load_model`; `None` before the first load and
    /// again after `unload_model`.
    model_info: Option<ModelInfo>,
    /// Whether a model is currently considered loaded. The generation and
    /// embedding methods return a backend error while this is `false`.
    loaded: bool,
}

impl MockBackend {
    fn new() -> Self {
        Self {
            model_info: None,
            loaded: false,
        }
    }
}

impl LlmBackend for MockBackend {
    fn load_model(&mut self, model_id: &str, config: ModelConfig) -> Result<()> {
        self.model_info = Some(ModelInfo {
            name: model_id.to_string(),
            architecture: config.architecture,
            num_parameters: 100_000,
            vocab_size: 32000,
            hidden_size: 768,
            num_layers: 12,
            max_context_length: config.max_sequence_length,
            quantization: config.quantization,
            memory_usage: 1024 * 1024 * 100, // 100MB
        });
        self.loaded = true;
        Ok(())
    }

    fn generate(&self, prompt: &str, _params: GenerateParams) -> Result<String> {
        if !self.loaded {
            return Err(ruvllm::RuvLLMError::Backend("Model not loaded".to_string()));
        }
        Ok(format!("Response to: {}", prompt))
    }

    fn generate_stream(
        &self,
        _prompt: &str,
        _params: GenerateParams,
    ) -> Result<Box<dyn Iterator<Item = Result<ruvllm::backends::GeneratedToken>> + Send + '_>>
    {
        if !self.loaded {
            return Err(ruvllm::RuvLLMError::Backend("Model not loaded".to_string()));
        }

        let tokens = vec![
            ruvllm::backends::GeneratedToken {
                id: 1,
                text: "Hello".to_string(),
                logprob: Some(-0.5),
                is_special: false,
            },
            ruvllm::backends::GeneratedToken {
                id: 2,
                text: " world".to_string(),
                logprob: Some(-0.3),
                is_special: false,
            },
            ruvllm::backends::GeneratedToken {
                id: 3,
                text: "!".to_string(),
                logprob: Some(-0.1),
                is_special: false,
            },
        ];

        Ok(Box::new(tokens.into_iter().map(Ok)))
    }

    fn generate_stream_v2(&self, _prompt: &str, _params: GenerateParams) -> Result<TokenStream> {
        if !self.loaded {
            return Err(ruvllm::RuvLLMError::Backend("Model not loaded".to_string()));
        }
        // Return a mock stream using channel
        let (tx, stream) = TokenStream::channel();
        // Drop tx immediately since we don't need to send anything for this mock
        drop(tx);
        Ok(stream)
    }

    fn get_embeddings(&self, _text: &str) -> Result<Vec<f32>> {
        if !self.loaded {
            return Err(ruvllm::RuvLLMError::Backend("Model not loaded".to_string()));
        }
        // Return a mock embedding
        Ok(vec![0.1; 768])
    }

    fn tokenizer(&self) -> Option<&dyn Tokenizer> {
        None
    }

    fn is_model_loaded(&self) -> bool {
        self.loaded
    }

    fn model_info(&self) -> Option<ModelInfo> {
        self.model_info.clone()
    }

    fn unload_model(&mut self) {
        self.loaded = false;
        self.model_info = None;
    }
}

/// A fresh mock reports nothing loaded; loading flips both indicators.
#[test]
fn test_mock_backend_load_model() {
    let mut mock = MockBackend::new();

    // Before any load: no model, no metadata.
    assert!(!mock.is_model_loaded());
    assert!(mock.model_info().is_none());

    // Loading with the default configuration must succeed.
    let outcome = mock.load_model("test-model", ModelConfig::default());
    assert!(outcome.is_ok());

    // After the load both the flag and the metadata are present.
    assert!(mock.is_model_loaded());
    assert!(mock.model_info().is_some());
}

/// Generation on a loaded mock returns non-empty text that echoes the prompt.
#[test]
fn test_backend_generate_basic() {
    let mut mock = MockBackend::new();
    mock.load_model("test-model", ModelConfig::default())
        .unwrap();

    // Every sampling field is spelled out rather than taken from `Default`.
    let sampling = GenerateParams {
        max_tokens: 100,
        temperature: 0.7,
        top_p: 0.9,
        top_k: 40,
        repetition_penalty: 1.1,
        frequency_penalty: 0.0,
        presence_penalty: 0.0,
        stop_sequences: vec![],
        seed: Some(42),
    };

    let prompt = "Hello, how are you?";
    let reply = mock.generate(prompt, sampling);
    assert!(reply.is_ok());

    let text = reply.unwrap();
    assert!(!text.is_empty());
    assert!(text.contains("Hello"));
}

/// Generating without a prior `load_model` call must be rejected.
#[test]
fn test_backend_generate_requires_loaded_model() {
    let unloaded = MockBackend::new();

    let outcome = unloaded.generate("Test prompt", GenerateParams::default());

    assert!(outcome.is_err());
}

/// The iterator-based stream yields the mock's three canned tokens, starting
/// with a regular `Hello` token.
#[test]
fn test_backend_streaming() {
    let mut mock = MockBackend::new();
    mock.load_model("test-model", ModelConfig::default())
        .unwrap();

    let collected: Vec<_> = mock
        .generate_stream("Hello", GenerateParams::default())
        .unwrap()
        .collect();
    assert_eq!(collected.len(), 3);

    let head = collected[0].as_ref().unwrap();
    assert_eq!(head.text, "Hello");
    assert_eq!(head.id, 1);
    assert!(!head.is_special);
}

/// Embeddings from a loaded mock have 768 finite components.
#[test]
fn test_backend_embeddings() {
    let mut mock = MockBackend::new();
    mock.load_model("test-model", ModelConfig::default())
        .unwrap();

    let vector = mock.get_embeddings("Test text for embedding").unwrap();

    assert_eq!(vector.len(), 768);
    assert!(vector.iter().copied().all(f32::is_finite));
}

/// Metadata reported after loading mirrors the name and the configuration
/// values passed to `load_model`.
#[test]
fn test_backend_model_info() {
    let llama_config = ModelConfig {
        architecture: ModelArchitecture::Llama,
        max_sequence_length: 4096,
        quantization: Some(Quantization::Q4K),
        ..Default::default()
    };

    let mut mock = MockBackend::new();
    mock.load_model("llama-test", llama_config).unwrap();

    let reported = mock.model_info().unwrap();
    assert_eq!(reported.name, "llama-test");
    assert_eq!(reported.max_context_length, 4096);
    assert!(matches!(reported.architecture, ModelArchitecture::Llama));
    assert!(matches!(reported.quantization, Some(Quantization::Q4K)));
}

/// Unloading clears the loaded flag and metadata, and generation fails again.
#[test]
fn test_backend_unload() {
    let mut mock = MockBackend::new();
    mock.load_model("test-model", ModelConfig::default())
        .unwrap();
    assert!(mock.is_model_loaded());

    mock.unload_model();

    // State is back to "nothing loaded".
    assert!(!mock.is_model_loaded());
    assert!(mock.model_info().is_none());

    // Inference must be rejected once the model is gone.
    let after_unload = mock.generate("Test", GenerateParams::default());
    assert!(after_unload.is_err());
}

/// A fully spelled-out `ModelConfig` keeps the values it was built with.
#[test]
fn test_model_config() {
    let mistral = ModelConfig {
        architecture: ModelArchitecture::Mistral,
        device: DeviceType::Cpu,
        dtype: DType::F32,
        quantization: Some(Quantization::Q4K),
        use_flash_attention: true,
        max_sequence_length: 4096,
        num_kv_heads: Some(8),
        hidden_size: Some(4096),
        num_layers: Some(32),
        vocab_size: Some(32000),
        rope_theta: Some(10000.0),
        sliding_window: None,
    };

    // Enum-valued fields.
    assert!(matches!(mistral.device, DeviceType::Cpu));
    assert!(matches!(mistral.dtype, DType::F32));
    assert!(matches!(mistral.quantization, Some(Quantization::Q4K)));

    // Plain-valued fields.
    assert!(mistral.use_flash_attention);
    assert_eq!(mistral.max_sequence_length, 4096);
}

/// Default sampling parameters fall inside basic sanity bounds.
#[test]
fn test_generate_params_default() {
    let defaults = GenerateParams::default();

    // Token budget and top-k must be positive.
    assert!(defaults.max_tokens > 0);
    // Temperature must be strictly positive.
    assert!(defaults.temperature > 0.0);
    // Nucleus threshold cannot exceed 1.0.
    assert!(defaults.top_p <= 1.0);
    assert!(defaults.top_k > 0);
}

/// Each `with_*` builder method stores exactly the value it was given.
#[test]
fn test_generate_params_builder() {
    let built = GenerateParams::default()
        .with_max_tokens(512)
        .with_temperature(0.5)
        .with_top_p(0.95)
        .with_top_k(50)
        .with_repetition_penalty(1.2)
        .with_seed(42);

    // Integer-valued settings.
    assert_eq!(built.max_tokens, 512);
    // Floating-point settings are compared exactly: they are stored, not computed.
    assert_eq!(built.temperature, 0.5);
    assert_eq!(built.top_p, 0.95);
    assert_eq!(built.top_k, 50);
    assert_eq!(built.repetition_penalty, 1.2);
    // The seed setter wraps its argument in `Some`.
    assert_eq!(built.seed, Some(42));
}

/// Checks GGUF classification and per-weight storage size of `Quantization`.
#[test]
fn test_quantization_variants() {
    // The integer-quantized formats are GGUF formats...
    for gguf_format in [Quantization::Q4, Quantization::Q8, Quantization::Q4K] {
        assert!(gguf_format.is_gguf());
    }
    // ...while plain half precision is not.
    assert!(!Quantization::F16.is_gguf());

    // Check bytes per weight
    let expected_sizes = [
        (Quantization::None, 4.0),
        (Quantization::F16, 2.0),
        (Quantization::Q8, 1.0),
        (Quantization::Q4K, 0.5),
    ];
    for (format, bytes) in expected_sizes {
        assert_eq!(format.bytes_per_weight(), bytes);
    }
}

/// The `DeviceType` variants can be constructed and matched, and the CUDA
/// variant carries its device index.
#[test]
fn test_device_type_variants() {
    assert!(matches!(DeviceType::Cpu, DeviceType::Cpu));
    assert!(matches!(DeviceType::Metal, DeviceType::Metal));

    let gpu = DeviceType::Cuda(0);
    match gpu {
        DeviceType::Cuda(index) => assert_eq!(index, 0),
        _ => {}
    }
}

/// Each architecture maps to its expected configuration name.
#[test]
fn test_model_architecture_variants() {
    let expected_names = [
        (ModelArchitecture::Llama, "llama"),
        (ModelArchitecture::Mistral, "mistral"),
        (ModelArchitecture::Phi, "phi"),
        // Qwen is the one architecture whose config name carries a version suffix.
        (ModelArchitecture::Qwen, "qwen2"),
        (ModelArchitecture::Gemma, "gemma"),
    ];

    for (architecture, name) in expected_names {
        assert_eq!(architecture.config_name(), name);
    }
}

/// The three `DType` variants used by the backends exist and match themselves.
#[test]
fn test_dtype_variants() {
    let single = DType::F32;
    assert!(matches!(single, DType::F32));

    let half = DType::F16;
    assert!(matches!(half, DType::F16));

    let brain_half = DType::Bf16;
    assert!(matches!(brain_half, DType::Bf16));
}

/// `SpecialTokens` stores the four token ids it was constructed with.
#[test]
fn test_special_tokens() {
    let special = SpecialTokens {
        bos_token_id: Some(1),
        eos_token_id: Some(2),
        pad_token_id: Some(0),
        unk_token_id: Some(3),
    };

    // Compared in (bos, eos, pad, unk) order.
    assert_eq!(
        (
            special.bos_token_id,
            special.eos_token_id,
            special.pad_token_id,
            special.unk_token_id,
        ),
        (Some(1), Some(2), Some(0), Some(3))
    );
}

/// The factory returns a backend; without the `candle` feature that backend
/// starts out with no model loaded.
#[test]
fn test_create_backend() {
    // This creates a NoopBackend when candle feature is not enabled
    let default_backend = create_backend();

    // Only checked in non-candle builds: the backend reports no loaded model.
    #[cfg(not(feature = "candle"))]
    {
        assert!(!default_backend.is_model_loaded());
    }
}

// Candle backend tests (only run when the feature is enabled)
#[cfg(feature = "candle")]
mod candle_tests {
    use super::*;
    use ruvllm::backends::CandleBackend;

    /// Constructing the Candle backend should succeed.
    #[test]
    #[ignore] // Requires model download
    fn test_candle_backend_creation() {
        assert!(CandleBackend::new().is_ok());
    }

    /// Builds a backend and a CPU Phi configuration; the actual load call is
    /// left disabled because it needs real model files.
    #[test]
    #[ignore] // Requires model download
    fn test_candle_backend_load_model() {
        let mut backend = CandleBackend::new().unwrap();
        let config = ModelConfig {
            device: DeviceType::Cpu,
            architecture: ModelArchitecture::Phi,
            ..Default::default()
        };

        // This would require an actual model file
        // let result = backend.load_model("microsoft/phi-2", config);
        // assert!(result.is_ok());
    }
}

// ========== V2 Feature Tests: Memory Pool Integration ==========

mod memory_pool_tests {
    use ruvllm::memory_pool::{
        BufferPool, BufferSize, InferenceArena, MemoryManager, MemoryManagerConfig,
        ScratchSpaceManager,
    };

    /// Repeatedly acquires a 64 KB buffer from a pre-warmed pool and checks the
    /// resulting pool statistics.
    #[test]
    fn test_memory_pool_integration() {
        let pool = BufferPool::new();

        // Pre-warm the pool
        pool.prewarm_all(4).expect("prewarm failed");

        // Simulate multiple generation steps
        for _step in 0..10 {
            // Acquire buffers for KV cache
            let kv_buffer = pool.acquire(BufferSize::KB64).expect("acquire failed");
            assert_eq!(kv_buffer.capacity(), 65536);

            // Simulate processing
            assert!(!kv_buffer.as_slice::<f32>().is_empty());

            // Buffer returns to pool when dropped
        }

        // Check pool statistics
        let stats = pool.stats();
        let lookups = stats.hits + stats.misses;
        assert!(lookups > 0, "Pool should have been used");

        // Hit rate should be decent after warm-up
        if lookups >= 10 {
            assert!(
                stats.hit_rate > 0.5,
                "Pool hit rate should be decent: {:.2}",
                stats.hit_rate
            );
        }
    }

    /// Runs 100 simulated decode steps using the arena, the buffer pool and
    /// the scratch space of one `MemoryManager`.
    #[test]
    fn test_streaming_with_pool() {
        let manager = MemoryManager::new().expect("manager creation failed");

        // Simulate streaming generation
        for token_idx in 0..100 {
            let step_value = token_idx as f32;

            // Reset arena at start of each step
            manager.reset_step();

            // Allocate temporary buffers from arena
            let activations: &mut [f32] = manager.arena.alloc(1024).expect("arena alloc failed");
            activations[0] = step_value;

            let logits: &mut [f32] = manager.arena.alloc(32000).expect("arena alloc for logits");
            logits[0] = step_value * 0.1;

            // Acquire KV cache buffer from pool
            let kv_buf = manager
                .pool
                .acquire(BufferSize::KB16)
                .expect("acquire failed");
            assert!(kv_buf.capacity() >= 16384);

            // Use scratch space for intermediate computations
            let mut scratch = manager.scratch.get_scratch().expect("get_scratch failed");
            if let Some(temp) = scratch.get::<f32>(256) {
                temp.fill(1.0);
                assert_eq!(temp.len(), 256);
            }

            // Verify arena usage grows
            assert!(manager.arena.used() > 0);
        }

        // Verify final statistics
        let totals = manager.stats();
        assert!(totals.pool.hits + totals.pool.misses > 0);
        assert!(totals.arena.high_water_mark > 0);
    }

    /// Allocates three buffers from an arena, resets it, and repeats 50 times.
    #[test]
    fn test_arena_allocation_cycle() {
        const ARENA_BYTES: usize = 4 * 1024 * 1024; // 4MB

        let arena = InferenceArena::new(ARENA_BYTES).expect("arena creation failed");

        for cycle in 0..50 {
            let marker = cycle as f32;

            // Allocate various buffer sizes
            let large: &mut [f32] = arena.alloc(4096).expect("alloc 4096");
            let larger: &mut [f32] = arena.alloc(8192).expect("alloc 8192");
            let small: &mut [f32] = arena.alloc(1024).expect("alloc 1024");

            // Write to buffers
            large[0] = marker;
            larger[0] = marker * 2.0;
            small[0] = marker * 3.0;

            // Verify allocations
            assert_eq!(arena.allocation_count(), 3);
            assert!(arena.used() > 0);

            // Reset for next cycle
            arena.reset();
            assert_eq!(arena.used(), 0);
            assert_eq!(arena.allocation_count(), 0);
        }

        // High water mark should be set
        assert!(arena.high_water_mark() > 0);
    }

    /// Acquiring and dropping the same size class 20 times should be served
    /// from the pool after the first allocation.
    #[test]
    fn test_buffer_pool_reuse() {
        let pool = BufferPool::with_capacity(8);

        // Acquire and release same size multiple times
        for _ in 0..20 {
            let reused = pool.acquire(BufferSize::KB4).expect("acquire failed");
            assert_eq!(reused.capacity(), 4096);
            // Buffer returns to pool on drop
        }

        // After first allocation, subsequent ones should hit the pool
        let hits = pool.stats().hits;
        assert!(hits >= 19, "Expected at least 19 hits, got {}", hits);
    }

    /// Four threads each fill scratch memory with their own id and verify that
    /// they only ever read their own value back.
    #[test]
    fn test_scratch_space_isolation() {
        use std::sync::Arc;
        use std::thread;

        let shared = Arc::new(ScratchSpaceManager::new(8192, 8).expect("manager creation failed"));

        // Spawn every worker before joining any of them.
        let mut workers = Vec::with_capacity(4);
        for thread_id in 0..4 {
            let manager = Arc::clone(&shared);
            workers.push(thread::spawn(move || {
                for _ in 0..10 {
                    let mut scratch = manager.get_scratch().expect("get_scratch failed");

                    // Each thread writes its ID
                    if let Some(buf) = scratch.get::<u32>(100) {
                        buf.fill(thread_id);
                        // Verify no cross-thread contamination
                        assert!(buf.iter().all(|&v| v == thread_id));
                    }

                    scratch.reset();
                }
            }));
        }

        for worker in workers {
            worker.join().expect("Thread panicked");
        }

        // Verify 4 threads were tracked
        assert_eq!(shared.active_threads(), 4);
    }

    /// A manager configured via `MemoryManagerConfig::for_model` can hold one
    /// step's activations and logits and is empty again after `reset_step`.
    #[test]
    fn test_memory_manager_for_model() {
        // Configure for a small LLM (e.g., Phi-2)
        let phi_like = MemoryManagerConfig::for_model(
            2560,  // hidden_dim
            51200, // vocab_size
            1,     // batch_size
        );

        let manager = MemoryManager::with_config(phi_like).expect("manager creation failed");

        // Verify adequate capacity
        assert!(manager.arena.capacity() > 2560 * 4 * 4); // At least hidden_dim * 4 * sizeof(f32)

        // Simulate inference
        let activations: &mut [f32] = manager.arena.alloc(2560).expect("alloc activations");
        let logits: &mut [f32] = manager.arena.alloc(51200).expect("alloc logits");

        assert_eq!(activations.len(), 2560);
        assert_eq!(logits.len(), 51200);

        // Reset for next step
        manager.reset_step();
        assert_eq!(manager.arena.used(), 0);
    }

    /// `acquire_for_size` picks the expected size class for a byte count, and
    /// yields no buffer for an oversized request.
    ///
    /// The per-size checks only run when a buffer is actually returned; a
    /// failed or empty acquisition skips them.
    #[test]
    fn test_buffer_size_selection() {
        let pool = BufferPool::new();

        // Test automatic size class selection
        if let Ok(Some(buf)) = pool.acquire_for_size(500) {
            assert!(buf.capacity() >= 500);
            assert_eq!(buf.size_class(), BufferSize::KB1);
        }

        if let Ok(Some(buf)) = pool.acquire_for_size(3000) {
            assert!(buf.capacity() >= 3000);
            assert_eq!(buf.size_class(), BufferSize::KB4);
        }

        if let Ok(Some(buf)) = pool.acquire_for_size(100000) {
            assert!(buf.capacity() >= 100000);
            assert_eq!(buf.size_class(), BufferSize::KB256);
        }

        // Size too large should return None
        let found_oversized = matches!(pool.acquire_for_size(500000), Ok(Some(_)));
        assert!(!found_oversized, "Should not find buffer for 500KB");
    }
}