ruvllm 2.1.0

LLM serving runtime with Ruvector integration - Paged attention, KV cache, and SONA learning
#![allow(
    clippy::all,
    unused_imports,
    unused_variables,
    dead_code,
    unused_mut,
    unused_assignments,
    non_camel_case_types,
    clippy::approx_constant,
    unexpected_cfgs,
    unused_must_use,
    unused_parens
)]
//! Benchmark token generation speed on real GGUF models
//!
//! This benchmark measures:
//! - Time to first token (TTFT)
//! - Tokens per second (throughput)
//! - Latency distribution (p50, p95, p99)
//! - Memory usage
//!
//! ## Usage
//!
//! ```bash
//! # Benchmark a specific model
//! cargo run -p ruvllm --example benchmark_model --release -- --model ./test_models/tinyllama.gguf
//!
//! # With custom parameters
//! cargo run -p ruvllm --example benchmark_model --release -- \
//!     --model ./model.gguf \
//!     --warmup 5 \
//!     --iterations 20 \
//!     --max-tokens 100
//!
//! # JSON output for CI/automation
//! cargo run -p ruvllm --example benchmark_model --release -- \
//!     --model ./model.gguf --json
//! ```
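//!
//! Real inference requires building with the `candle` feature (see the
//! cfg-gated code below); without it the benchmark falls back to simulated
//! timings. The Candle backend also expects a `tokenizer.json` next to the
//! GGUF file.
//!
//! ```bash
//! # Real inference (assumes `candle` is a feature of this crate, as the
//! # #[cfg(feature = "candle")] blocks below suggest)
//! cargo run -p ruvllm --features candle --example benchmark_model --release -- \
//!     --model ./model.gguf
//! ```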
//!
//! ## Output Example
//!
//! ```text
//! RuvLLM Model Benchmark
//! ======================
//! Model: ./test_models/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
//! Model Size: 669.34 MB
//!
//! Configuration:
//!   Warmup iterations: 5
//!   Benchmark iterations: 20
//!   Max tokens per generation: 50
//!   Temperature: 0.7
//!
//! Running warmup...
//!   Warmup 1/5: 32.4 tok/s
//!   Warmup 2/5: 35.2 tok/s
//!   ...
//!
//! Running benchmark...
//!   Iteration 1/20: 34.8 tok/s, TTFT: 45.2ms
//!   Iteration 2/20: 35.1 tok/s, TTFT: 44.8ms
//!   ...
//!
//! Results:
//!   Throughput (tok/s):
//!     Mean:   35.2
//!     Median: 35.1
//!     Std:    1.2
//!     Min:    33.5
//!     Max:    37.8
//!
//!   Latency (ms):
//!     TTFT Mean:   45.0
//!     TTFT Median: 44.8
//!     P50:  28.5
//!     P95:  32.1
//!     P99:  35.8
//!
//!   Memory:
//!     Peak RSS: 1.2 GB
//! ```
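//!
//! With `--json`, a single JSON object carrying the same statistics is printed
//! instead (see `BenchmarkResults::print_json`); the values below are
//! illustrative:
//!
//! ```json
//! {
//!   "model_path": "./model.gguf",
//!   "model_size_bytes": 701863936,
//!   "config": { "warmup_iterations": 5, "benchmark_iterations": 20, "max_tokens": 50 },
//!   "throughput": { "mean": 35.20, "median": 35.10, "std": 1.20, "min": 33.50, "max": 37.80 },
//!   "latency_ms": { "ttft_mean": 45.00, "ttft_median": 44.80, "p50": 28.50, "p95": 32.10, "p99": 35.80 },
//!   "memory_bytes": 1288490188
//! }
//! ```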

use std::env;
use std::fs;
use std::path::PathBuf;
use std::time::Duration;

/// Benchmark configuration
#[derive(Debug, Clone)]
struct BenchmarkConfig {
    /// Path to the GGUF model file
    model_path: PathBuf,
    /// Number of warmup iterations (not counted in results)
    warmup_iterations: usize,
    /// Number of benchmark iterations
    benchmark_iterations: usize,
    /// Maximum tokens to generate per iteration
    max_tokens: usize,
    /// Test prompts to use (reserved for future use with actual model loading)
    #[allow(dead_code)]
    prompts: Vec<String>,
    /// Output results as JSON
    json_output: bool,
    /// Temperature for generation
    temperature: f32,
    /// Verbose output
    verbose: bool,
}

impl Default for BenchmarkConfig {
    fn default() -> Self {
        Self {
            model_path: PathBuf::new(),
            warmup_iterations: 5,
            benchmark_iterations: 20,
            max_tokens: 50,
            prompts: vec![
                "The quick brown fox".to_string(),
                "Once upon a time".to_string(),
                "In the beginning".to_string(),
                "Hello, I am".to_string(),
                "The capital of France is".to_string(),
            ],
            json_output: false,
            temperature: 0.7,
            verbose: false,
        }
    }
}

/// Results from a single generation
#[derive(Debug, Clone)]
struct GenerationResult {
    tokens_generated: usize,
    total_duration: Duration,
    time_to_first_token: Duration,
    token_latencies: Vec<Duration>,
}

impl GenerationResult {
    fn tokens_per_second(&self) -> f64 {
        if self.total_duration.as_secs_f64() > 0.0 {
            self.tokens_generated as f64 / self.total_duration.as_secs_f64()
        } else {
            0.0
        }
    }
}

/// Aggregated benchmark results
#[derive(Debug)]
struct BenchmarkResults {
    model_path: String,
    model_size_bytes: u64,
    warmup_iterations: usize,
    benchmark_iterations: usize,
    max_tokens: usize,

    // Throughput statistics
    throughput_mean: f64,
    throughput_median: f64,
    throughput_std: f64,
    throughput_min: f64,
    throughput_max: f64,

    // Latency statistics (in milliseconds)
    ttft_mean: f64,
    ttft_median: f64,
    latency_p50: f64,
    latency_p95: f64,
    latency_p99: f64,

    // Memory (if available)
    peak_memory_bytes: Option<u64>,

    // Individual results (reserved for detailed analysis)
    #[allow(dead_code)]
    results: Vec<GenerationResult>,
}

impl BenchmarkResults {
    fn from_results(
        config: &BenchmarkConfig,
        model_size_bytes: u64,
        results: Vec<GenerationResult>,
    ) -> Self {
        let throughputs: Vec<f64> = results.iter().map(|r| r.tokens_per_second()).collect();
        let ttfts: Vec<f64> = results
            .iter()
            .map(|r| r.time_to_first_token.as_secs_f64() * 1000.0)
            .collect();

        // Collect all token latencies
        let mut all_latencies: Vec<f64> = results
            .iter()
            .flat_map(|r| r.token_latencies.iter().map(|d| d.as_secs_f64() * 1000.0))
            .collect();
        all_latencies.sort_by(|a, b| a.partial_cmp(b).unwrap());

        Self {
            model_path: config.model_path.display().to_string(),
            model_size_bytes,
            warmup_iterations: config.warmup_iterations,
            benchmark_iterations: config.benchmark_iterations,
            max_tokens: config.max_tokens,

            throughput_mean: mean(&throughputs),
            throughput_median: median(&throughputs),
            throughput_std: std_dev(&throughputs),
            throughput_min: throughputs.iter().cloned().fold(f64::INFINITY, f64::min),
            throughput_max: throughputs
                .iter()
                .cloned()
                .fold(f64::NEG_INFINITY, f64::max),

            ttft_mean: mean(&ttfts),
            ttft_median: median(&ttfts),
            latency_p50: percentile(&all_latencies, 50),
            latency_p95: percentile(&all_latencies, 95),
            latency_p99: percentile(&all_latencies, 99),

            peak_memory_bytes: get_peak_memory(),
            results,
        }
    }

    fn print_text(&self) {
        println!("\nResults:");
        println!("========");
        println!();
        println!("Throughput (tok/s):");
        println!("  Mean:   {:.1}", self.throughput_mean);
        println!("  Median: {:.1}", self.throughput_median);
        println!("  Std:    {:.1}", self.throughput_std);
        println!("  Min:    {:.1}", self.throughput_min);
        println!("  Max:    {:.1}", self.throughput_max);
        println!();
        println!("Latency (ms):");
        println!("  TTFT Mean:   {:.1}", self.ttft_mean);
        println!("  TTFT Median: {:.1}", self.ttft_median);
        println!("  P50:  {:.1}", self.latency_p50);
        println!("  P95:  {:.1}", self.latency_p95);
        println!("  P99:  {:.1}", self.latency_p99);

        if let Some(mem) = self.peak_memory_bytes {
            println!();
            println!("Memory:");
            println!("  Peak RSS: {}", format_bytes(mem));
        }
    }

    fn print_json(&self) {
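        // NOTE: the JSON is assembled by hand, and model_path is interpolated
        // without escaping; a path containing quotes or backslashes would
        // produce invalid JSON.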
        let json = format!(
            r#"{{
  "model_path": "{}",
  "model_size_bytes": {},
  "config": {{
    "warmup_iterations": {},
    "benchmark_iterations": {},
    "max_tokens": {}
  }},
  "throughput": {{
    "mean": {:.2},
    "median": {:.2},
    "std": {:.2},
    "min": {:.2},
    "max": {:.2}
  }},
  "latency_ms": {{
    "ttft_mean": {:.2},
    "ttft_median": {:.2},
    "p50": {:.2},
    "p95": {:.2},
    "p99": {:.2}
  }},
  "memory_bytes": {}
}}"#,
            self.model_path,
            self.model_size_bytes,
            self.warmup_iterations,
            self.benchmark_iterations,
            self.max_tokens,
            self.throughput_mean,
            self.throughput_median,
            self.throughput_std,
            self.throughput_min,
            self.throughput_max,
            self.ttft_mean,
            self.ttft_median,
            self.latency_p50,
            self.latency_p95,
            self.latency_p99,
            self.peak_memory_bytes
                .map(|m| m.to_string())
                .unwrap_or_else(|| "null".to_string()),
        );
        println!("{}", json);
    }
}

fn main() {
    let config = parse_args();

    // Validate model path
    if !config.model_path.exists() {
        eprintln!(
            "Error: Model file not found: {}",
            config.model_path.display()
        );
        eprintln!();
        eprintln!("Download a test model with:");
        eprintln!("  cargo run -p ruvllm --example download_test_model -- --model tinyllama");
        std::process::exit(1);
    }

    // Get model size
    let model_size = fs::metadata(&config.model_path)
        .map(|m| m.len())
        .unwrap_or(0);

    if !config.json_output {
        println!("RuvLLM Model Benchmark");
        println!("======================");
        println!();
        println!("Model: {}", config.model_path.display());
        println!("Model Size: {}", format_bytes(model_size));
        println!();
        println!("Configuration:");
        println!("  Warmup iterations: {}", config.warmup_iterations);
        println!("  Benchmark iterations: {}", config.benchmark_iterations);
        println!("  Max tokens per generation: {}", config.max_tokens);
        println!("  Temperature: {}", config.temperature);
        println!();
    }

    // Run benchmark
    let results = run_benchmark(&config, model_size);

    // Output results
    if config.json_output {
        results.print_json();
    } else {
        results.print_text();
    }
}

/// Minimal hand-rolled argument parser: unknown flags are ignored and
/// malformed numeric values silently fall back to their defaults.
fn parse_args() -> BenchmarkConfig {
    let args: Vec<String> = env::args().collect();
    let mut config = BenchmarkConfig::default();

    if args.len() < 2 || args.contains(&"--help".to_string()) || args.contains(&"-h".to_string()) {
        print_help();
        std::process::exit(0);
    }

    let mut i = 1;
    while i < args.len() {
        match args[i].as_str() {
            "--model" | "-m" => {
                i += 1;
                if i < args.len() {
                    config.model_path = PathBuf::from(&args[i]);
                }
            }
            "--warmup" | "-w" => {
                i += 1;
                if i < args.len() {
                    config.warmup_iterations = args[i].parse().unwrap_or(5);
                }
            }
            "--iterations" | "-i" => {
                i += 1;
                if i < args.len() {
                    config.benchmark_iterations = args[i].parse().unwrap_or(20);
                }
            }
            "--max-tokens" | "-t" => {
                i += 1;
                if i < args.len() {
                    config.max_tokens = args[i].parse().unwrap_or(50);
                }
            }
            "--temperature" => {
                i += 1;
                if i < args.len() {
                    config.temperature = args[i].parse().unwrap_or(0.7);
                }
            }
            "--json" | "-j" => {
                config.json_output = true;
            }
            "--verbose" | "-v" => {
                config.verbose = true;
            }
            arg if !arg.starts_with('-') && config.model_path.as_os_str().is_empty() => {
                config.model_path = PathBuf::from(arg);
            }
            _ => {}
        }
        i += 1;
    }

    config
}

fn print_help() {
    println!("RuvLLM Model Benchmark");
    println!();
    println!("USAGE:");
    println!("    cargo run -p ruvllm --example benchmark_model --release -- [OPTIONS] <MODEL>");
    println!();
    println!("ARGUMENTS:");
    println!("    <MODEL>    Path to GGUF model file");
    println!();
    println!("OPTIONS:");
    println!("    -m, --model <PATH>          Path to GGUF model file");
    println!("    -w, --warmup <N>            Number of warmup iterations (default: 5)");
    println!("    -i, --iterations <N>        Number of benchmark iterations (default: 20)");
    println!("    -t, --max-tokens <N>        Max tokens per generation (default: 50)");
    println!("        --temperature <TEMP>    Temperature for sampling (default: 0.7)");
    println!("    -j, --json                  Output results as JSON");
    println!("    -v, --verbose               Verbose output");
    println!("    -h, --help                  Print help information");
    println!();
    println!("EXAMPLES:");
    println!("    # Basic benchmark");
    println!("    cargo run -p ruvllm --example benchmark_model --release -- ./model.gguf");
    println!();
    println!("    # Custom configuration");
    println!("    cargo run -p ruvllm --example benchmark_model --release -- \\");
    println!("        --model ./model.gguf --warmup 10 --iterations 50 --max-tokens 100");
    println!();
    println!("    # JSON output for automation");
    println!("    cargo run -p ruvllm --example benchmark_model --release -- \\");
    println!("        --model ./model.gguf --json > results.json");
}

/// Run the benchmark, preferring real inference when the `candle` feature is
/// enabled and falling back to simulated results otherwise.
fn run_benchmark(config: &BenchmarkConfig, model_size: u64) -> BenchmarkResults {
    // Try to use real model inference with candle backend
    #[cfg(feature = "candle")]
    {
        match run_real_benchmark(config, model_size) {
            Ok(results) => return results,
            Err(e) => {
                if !config.json_output {
                    println!("Warning: Failed to run real benchmark: {}", e);
                    println!("Falling back to simulated results.");
                    println!();
                }
            }
        }
    }

    // Fallback to simulated results
    run_simulated_benchmark(config, model_size)
}

#[cfg(feature = "candle")]
fn run_real_benchmark(
    config: &BenchmarkConfig,
    model_size: u64,
) -> Result<BenchmarkResults, String> {
    use ruvllm::{CandleBackend, GenerateParams, LlmBackend, ModelConfig};
    use std::time::Instant;

    if !config.json_output {
        println!("Loading model with Candle backend (Metal acceleration)...");
    }

    // Create backend and load model
    let mut backend =
        CandleBackend::new().map_err(|e| format!("Failed to create backend: {}", e))?;

    let model_config = ModelConfig::default();
    backend
        .load_gguf(&config.model_path, &model_config)
        .map_err(|e| format!("Failed to load GGUF model: {}", e))?;

    // Load tokenizer from same directory as model
    if let Some(parent) = config.model_path.parent() {
        let tokenizer_path = parent.join("tokenizer.json");
        if tokenizer_path.exists() {
            if !config.json_output {
                println!("Loading tokenizer from: {:?}", tokenizer_path);
            }
            backend
                .load_tokenizer(&tokenizer_path)
                .map_err(|e| format!("Failed to load tokenizer: {}", e))?;
        } else {
            return Err(format!(
                "Tokenizer not found at {:?}. Download it from HuggingFace.",
                tokenizer_path
            ));
        }
    }

    if !config.json_output {
        println!("Model loaded successfully!");
        println!();
    }

    let prompts = vec![
        "Explain quantum computing in simple terms.",
        "Write a haiku about programming.",
        "What is the meaning of life?",
        "Describe the process of photosynthesis.",
        "Tell me a short story about a robot.",
    ];

    let params = GenerateParams {
        max_tokens: config.max_tokens,
        temperature: config.temperature,
        top_p: 0.9,
        top_k: 40,
        ..Default::default()
    };

    let mut all_results = Vec::new();

    // Warmup phase
    if !config.json_output {
        println!(
            "Running warmup ({} iterations)...",
            config.warmup_iterations
        );
    }

    for i in 0..config.warmup_iterations {
        let prompt = &prompts[i % prompts.len()];
        let start = Instant::now();

        match backend.generate(prompt, params.clone()) {
            Ok(output) => {
                let total_duration = start.elapsed();
                // generate() is blocking, so the true time-to-first-token is
                // not observable here; record the full generation time as an
                // upper bound.
                let time_to_first_token = total_duration;
                // Approximate the token count by whitespace words, since the
                // backend returns decoded text rather than token IDs.
                let tokens_generated = output.split_whitespace().count().max(1);

                let result = GenerationResult {
                    tokens_generated,
                    total_duration,
                    time_to_first_token,
                    // Spread the total time evenly across tokens; per-token
                    // latencies are unavailable from a blocking generate().
                    token_latencies: vec![
                        total_duration / tokens_generated as u32;
                        tokens_generated
                    ],
                };

                if !config.json_output {
                    println!(
                        "  Warmup {}/{}: {:.1} tok/s",
                        i + 1,
                        config.warmup_iterations,
                        result.tokens_per_second()
                    );
                }
            }
            Err(e) => {
                if !config.json_output {
                    println!(
                        "  Warmup {}/{}: Error - {}",
                        i + 1,
                        config.warmup_iterations,
                        e
                    );
                }
            }
        }
    }

    // Benchmark phase
    if !config.json_output {
        println!();
        println!(
            "Running benchmark ({} iterations)...",
            config.benchmark_iterations
        );
    }

    for i in 0..config.benchmark_iterations {
        let prompt = &prompts[i % prompts.len()];
        let start = Instant::now();

        match backend.generate(prompt, params.clone()) {
            Ok(output) => {
                let total_duration = start.elapsed();
                // As in the warmup loop: generate() is blocking, so TTFT is
                // approximated by the full generation time.
                let time_to_first_token = total_duration;
                let tokens_generated = output.split_whitespace().count().max(1);

                let result = GenerationResult {
                    tokens_generated,
                    total_duration,
                    time_to_first_token,
                    token_latencies: vec![
                        total_duration / tokens_generated as u32;
                        tokens_generated
                    ],
                };

                if !config.json_output && (config.verbose || i % 5 == 0) {
                    println!(
                        "  Iteration {}/{}: {:.1} tok/s, TTFT: {:.1}ms",
                        i + 1,
                        config.benchmark_iterations,
                        result.tokens_per_second(),
                        result.time_to_first_token.as_secs_f64() * 1000.0
                    );
                }
                all_results.push(result);
            }
            Err(e) => {
                if !config.json_output {
                    println!(
                        "  Iteration {}/{}: Error - {}",
                        i + 1,
                        config.benchmark_iterations,
                        e
                    );
                }
            }
        }
    }

    if all_results.is_empty() {
        return Err("No successful generations".to_string());
    }

    // Print SONA learning stats
    if !config.json_output {
        if let Some(stats) = backend.sona_stats() {
            println!();
            println!("SONA Self-Learning Stats:");
            println!("  Total trajectories: {}", stats.total_trajectories);
            println!("  Instant updates: {}", stats.instant_updates);
            println!("  Background updates: {}", stats.background_updates);
            println!("  Patterns learned: {}", stats.patterns_learned);
        }
    }

    Ok(BenchmarkResults::from_results(
        config,
        model_size,
        all_results,
    ))
}

fn run_simulated_benchmark(config: &BenchmarkConfig, model_size: u64) -> BenchmarkResults {
    if !config.json_output {
        println!("Note: Running with simulated results (candle feature not enabled or model load failed).");
        println!();
    }

    let mut all_results = Vec::new();

    // Warmup phase
    if !config.json_output {
        println!(
            "Running warmup ({} iterations)...",
            config.warmup_iterations
        );
    }

    for i in 0..config.warmup_iterations {
        let result = simulate_generation(config);
        if !config.json_output {
            println!(
                "  Warmup {}/{}: {:.1} tok/s",
                i + 1,
                config.warmup_iterations,
                result.tokens_per_second()
            );
        }
    }

    // Benchmark phase
    if !config.json_output {
        println!();
        println!(
            "Running benchmark ({} iterations)...",
            config.benchmark_iterations
        );
    }

    for i in 0..config.benchmark_iterations {
        let result = simulate_generation(config);
        if !config.json_output && (config.verbose || i % 5 == 0) {
            println!(
                "  Iteration {}/{}: {:.1} tok/s, TTFT: {:.1}ms",
                i + 1,
                config.benchmark_iterations,
                result.tokens_per_second(),
                result.time_to_first_token.as_secs_f64() * 1000.0
            );
        }
        all_results.push(result);
    }

    BenchmarkResults::from_results(config, model_size, all_results)
}

/// Simulate a generation for demonstration purposes
fn simulate_generation(config: &BenchmarkConfig) -> GenerationResult {
    use rand::Rng;
    let mut rng = rand::thread_rng();

    // Simulate realistic timing characteristics
    // These would be replaced with actual measurements in a real implementation
    let base_speed = 30.0 + rng.gen::<f64>() * 10.0; // 30-40 tok/s
    let tokens = config.max_tokens.min(rng.gen_range(30..60));
    let total_secs = tokens as f64 / base_speed;

    let ttft_ms = 40.0 + rng.gen::<f64>() * 20.0; // 40-60ms TTFT
    let ttft = Duration::from_secs_f64(ttft_ms / 1000.0);

    let mut latencies = Vec::with_capacity(tokens);
    for _ in 0..tokens {
        let latency_ms = 25.0 + rng.gen::<f64>() * 10.0; // 25-35ms per token
        latencies.push(Duration::from_secs_f64(latency_ms / 1000.0));
    }

    GenerationResult {
        tokens_generated: tokens,
        total_duration: Duration::from_secs_f64(total_secs),
        time_to_first_token: ttft,
        token_latencies: latencies,
    }
}

// ============================================================================
// Statistics Helpers
// ============================================================================

fn mean(values: &[f64]) -> f64 {
    if values.is_empty() {
        return 0.0;
    }
    values.iter().sum::<f64>() / values.len() as f64
}

fn median(values: &[f64]) -> f64 {
    if values.is_empty() {
        return 0.0;
    }
    let mut sorted = values.to_vec();
    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let mid = sorted.len() / 2;
    if sorted.len() % 2 == 0 {
        (sorted[mid - 1] + sorted[mid]) / 2.0
    } else {
        sorted[mid]
    }
}

/// Sample standard deviation (Bessel-corrected, n - 1 denominator).
fn std_dev(values: &[f64]) -> f64 {
    if values.len() < 2 {
        return 0.0;
    }
    let m = mean(values);
    let variance = values.iter().map(|x| (x - m).powi(2)).sum::<f64>() / (values.len() - 1) as f64;
    variance.sqrt()
}

/// Simple index-based percentile (`p * len / 100`, clamped to the last
/// element); `sorted_values` must already be sorted ascending.
fn percentile(sorted_values: &[f64], p: usize) -> f64 {
    if sorted_values.is_empty() {
        return 0.0;
    }
    let idx = (p * sorted_values.len() / 100).min(sorted_values.len() - 1);
    sorted_values[idx]
}

fn format_bytes(bytes: u64) -> String {
    const KB: u64 = 1024;
    const MB: u64 = KB * 1024;
    const GB: u64 = MB * 1024;

    if bytes >= GB {
        format!("{:.2} GB", bytes as f64 / GB as f64)
    } else if bytes >= MB {
        format!("{:.2} MB", bytes as f64 / MB as f64)
    } else if bytes >= KB {
        format!("{:.2} KB", bytes as f64 / KB as f64)
    } else {
        format!("{} B", bytes)
    }
}

/// Best-effort peak memory usage (platform-specific). On Linux this reads
/// `VmPeak` from /proc/self/status; on macOS, `ps` only reports the *current*
/// RSS, which is used here as an approximation of the peak.
fn get_peak_memory() -> Option<u64> {
    #[cfg(target_os = "macos")]
    {
        use std::process::Command;
        let pid = std::process::id();
        let output = Command::new("ps")
            .args(["-o", "rss=", "-p", &pid.to_string()])
            .output()
            .ok()?;

        let rss_kb: u64 = String::from_utf8_lossy(&output.stdout)
            .trim()
            .parse()
            .ok()?;

        Some(rss_kb * 1024) // Convert KB to bytes
    }

    #[cfg(target_os = "linux")]
    {
        use std::fs;
        let status = fs::read_to_string("/proc/self/status").ok()?;
        for line in status.lines() {
            if line.starts_with("VmPeak:") {
                let parts: Vec<&str> = line.split_whitespace().collect();
                if parts.len() >= 2 {
                    let kb: u64 = parts[1].parse().ok()?;
                    return Some(kb * 1024);
                }
            }
        }
        None
    }

    #[cfg(not(any(target_os = "macos", target_os = "linux")))]
    {
        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_statistics() {
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        assert_eq!(mean(&values), 3.0);
        assert_eq!(median(&values), 3.0);
        assert!((std_dev(&values) - 1.5811).abs() < 0.001);
    }

    #[test]
    fn test_percentile() {
        let values: Vec<f64> = (0..100).map(|i| i as f64).collect();
        assert_eq!(percentile(&values, 50), 50.0);
        assert_eq!(percentile(&values, 95), 95.0);
        assert_eq!(percentile(&values, 99), 99.0);
    }

    #[test]
    fn test_format_bytes() {
        assert_eq!(format_bytes(500), "500 B");
        assert_eq!(format_bytes(1536), "1.50 KB");
        assert_eq!(format_bytes(1_572_864), "1.50 MB");
        assert_eq!(format_bytes(1_610_612_736), "1.50 GB");
    }
}