fn run_bench_warmup<F: FnMut()>(config: &BenchConfig, count: usize, mut f: F) {
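// Warmup runs the same closure as the timed loop so that one-time costs
// (lazy allocations, cache population) are paid before measurement starts.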
if !config.quiet {
eprintln!("{}", "Running warmup...".yellow());
}
for i in 0..count {
f();
if !config.quiet {
eprint!(" Warmup {}/{}\r", i + 1, count);
std::io::Write::flush(&mut std::io::stderr()).ok();
}
}
if !config.quiet {
eprintln!(" Warmup complete ");
eprintln!();
}
}
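/// Emit a single in-place progress line (carriage return, no trailing
/// newline) for one timed iteration; suppressed in quiet mode.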
fn print_bench_progress(config: &BenchConfig, i: usize, tokens: usize, time: Duration) {
if !config.quiet {
eprint!(
" Iteration {}/{}: {} tokens in {:.2}s\r",
i + 1,
config.iterations,
tokens,
time.as_secs_f32()
);
std::io::Write::flush(&mut std::io::stderr()).ok();
}
}
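/// Print the benchmark summary: pass/fail throughput against the
/// 10 tok/s threshold, timing statistics, and a letter grade.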
fn print_results(result: &BenchResult) {
output::section("Results");
println!();
let throughput_str = format!("{:.1} tok/s", result.tokens_per_second);
if result.passed {
println!(
"{} {} {}",
"Throughput:".white().bold(),
throughput_str.green().bold(),
"(PASS: >= 10 tok/s)".green()
);
} else {
println!(
"{} {} {}",
"Throughput:".white().bold(),
throughput_str.red().bold(),
"(FAIL: < 10 tok/s)".red()
);
}
println!();
output::kv("Total tokens", result.total_tokens);
output::kv(
"Total time",
format!("{:.2}s", result.total_time.as_secs_f32()),
);
output::kv(
"Time to first token",
format!("{:.0}ms", result.time_to_first_token.as_secs_f64() * 1000.0),
);
println!();
output::kv(
"Mean iteration time",
format!("{:.2}s", result.mean_time.as_secs_f32()),
);
output::kv(
"Median iteration time",
format!("{:.2}s", result.median_time.as_secs_f32()),
);
output::kv(
"Std deviation",
format!("{:.0}ms", result.std_dev.as_secs_f64() * 1000.0),
);
println!();
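// Letter grades are fixed tok/s bands; the C boundary matches the
// 10 tok/s pass threshold reported above.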
let grade = if result.tokens_per_second >= 100.0 {
"A+ (Excellent)".green()
} else if result.tokens_per_second >= 50.0 {
"A (Very Good)".green()
} else if result.tokens_per_second >= 20.0 {
"B (Good)".blue()
} else if result.tokens_per_second >= 10.0 {
"C (Acceptable)".yellow()
} else {
"F (Below Threshold)".red()
};
output::kv("Performance Grade", grade);
}
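/// Top-level dispatcher: detects the model format from the file header
/// and routes to the GGUF, APR, or SafeTensors benchmark path. CUDA is
/// used when the feature is compiled in and a device is detected.
///
/// A minimal sketch of a call site (assuming `BenchConfig` implements
/// `Default`; construction is not shown in this module):
///
/// ```ignore
/// let config = BenchConfig::default();
/// let result = run_realizar_benchmark(Path::new("model.gguf"), &config)?;
/// print_results(&result);
/// ```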
#[cfg(feature = "inference")]
fn run_realizar_benchmark(path: &Path, config: &BenchConfig) -> Result<BenchResult> {
use realizar::format::{detect_format, ModelFormat};
let header_bytes = std::fs::read(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to read model: {e}")))?;
let format = detect_format(&header_bytes[..8.min(header_bytes.len())])
.map_err(|e| CliError::ValidationFailed(format!("Failed to detect format: {e}")))?;
if !config.quiet {
eprintln!("{} {}", "Format:".cyan().bold(), format.to_string().green());
}
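// Probe CUDA at runtime. With the `cuda` feature compiled out this
// collapses to a constant (false, 0) and the CPU path is taken.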
#[cfg(feature = "cuda")]
let (cuda_available, cuda_devices) = {
use realizar::cuda::CudaExecutor;
(CudaExecutor::is_available(), CudaExecutor::num_devices())
};
#[cfg(not(feature = "cuda"))]
let (cuda_available, cuda_devices) = (false, 0usize);
if !config.quiet {
if cuda_available && cuda_devices > 0 {
eprintln!(
"{} {} GPU(s) detected",
"CUDA:".cyan().bold(),
cuda_devices.to_string().green()
);
} else {
eprintln!(
"{} {}",
"CUDA:".cyan().bold(),
"Not available (CPU mode)".yellow()
);
}
}
let use_cuda = cuda_available && cuda_devices > 0;
let tracer = TracerImpl::new_local();
match format {
ModelFormat::Gguf => run_gguf_benchmark(path, config, use_cuda, &tracer),
ModelFormat::Apr => run_apr_benchmark(path, config, use_cuda, &tracer),
ModelFormat::SafeTensors => run_safetensors_benchmark(path, config, use_cuda, &tracer),
}
}
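/// Benchmark a GGUF model: parse the file, tokenize the prompt (with a
/// fixed fallback sequence if the model carries no tokenizer), then run
/// the CUDA path when available, falling back to CPU on init failure.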
#[cfg(feature = "inference")]
fn run_gguf_benchmark(
path: &Path,
config: &BenchConfig,
use_cuda: bool,
tracer: &TracerImpl,
) -> Result<BenchResult> {
use realizar::gguf::{GGUFModel, QuantizedGenerateConfig};
if !config.quiet {
eprintln!("{}", "Loading GGUF model...".yellow());
}
let start = Instant::now();
let model_bytes = std::fs::read(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to read model: {e}")))?;
let gguf = GGUFModel::from_bytes(&model_bytes)
.map_err(|e| CliError::ValidationFailed(format!("Failed to parse GGUF: {e}")))?;
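// Fallback prompt when the GGUF has no usable tokenizer: BOS plus a
// short hardcoded token sequence assuming a Qwen2-style vocabulary.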
let bos = aprender::demo::SpecialTokens::qwen2().bos_id;
let prompt_tokens: Vec<u32> = gguf
.encode(&config.prompt)
.unwrap_or_else(|| vec![bos, 9707, 11, 358, 1079, 264, 11761, 18328, 13, 9842]);
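// Greedy decoding (temperature 0, top_k 1) keeps runs deterministic so
// iteration times are comparable; generation is capped at 128 tokens.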
let gen_config = QuantizedGenerateConfig {
max_tokens: config.max_tokens.min(128),
temperature: 0.0,
top_k: 1,
..Default::default()
};
#[cfg(feature = "cuda")]
if use_cuda {
match run_cuda_benchmark(
&gguf,
&model_bytes,
&prompt_tokens,
&gen_config,
config,
start,
path,
tracer,
) {
Ok(result) => return Ok(result),
Err(e) => {
if !config.quiet {
eprintln!(
"{}",
format!("CUDA init failed, falling back to CPU: {e}").yellow()
);
}
let cpu_start = Instant::now();
return run_cpu_benchmark(&prompt_tokens, &gen_config, config, cpu_start, path, tracer);
}
}
}
run_cpu_benchmark(&prompt_tokens, &gen_config, config, start, path, tracer)
}
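/// Resolve prompt tokens for an APR model, in order of preference: a
/// sibling `tokenizer.json`, a sibling vocab file with naive
/// whitespace-token lookup, or a fixed placeholder sequence.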
#[cfg(feature = "inference")]
fn resolve_apr_prompt_tokens(path: &Path, prompt: &str) -> Vec<u32> {
use realizar::apr::AprV2Model;
if let Some(tokenizer) = realizar::safetensors::find_sibling_file(path, "tokenizer.json")
.and_then(|tp| AprV2Model::load_tokenizer_from_path(&tp))
{
tokenizer.encode(prompt)
} else if let Some((vocab, _, _)) = AprV2Model::load_tokenizer_from_sibling(path) {
let token_to_id: std::collections::HashMap<String, u32> = vocab
.iter()
.enumerate()
.map(|(i, t)| (t.clone(), i as u32))
.collect();
prompt
.split_whitespace()
.filter_map(|w| token_to_id.get(w).copied())
.collect()
} else {
vec![1, 2, 3, 4, 5]
}
}
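/// When every iteration produced zero new tokens, fall back to counting
/// prompt tokens (forward-pass throughput) so the result is non-zero;
/// the explanatory note is suppressed in quiet mode.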
#[cfg(feature = "inference")]
fn handle_zero_generation_fallback(
generation_failed: bool,
total_tokens: usize,
iterations: usize,
prompt_len: usize,
quiet: bool,
) -> usize {
if generation_failed && total_tokens == 0 {
if !quiet {
eprintln!(
"{}",
"Note: Generation produced 0 new tokens, reporting forward-pass throughput.".yellow()
);
}
iterations * prompt_len
} else {
total_tokens
}
}
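/// Attempt the GPU benchmark; `Ok(None)` signals the caller to fall
/// back to CPU when the GPU produced no tokens.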
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn try_cuda_benchmark(
path: &Path,
config: &BenchConfig,
tracer: &TracerImpl,
) -> Result<Option<BenchResult>> {
let result = run_apr_cuda_benchmark(path, config, tracer)?;
if result.total_tokens > 0 {
return Ok(Some(result));
}
if !config.quiet {
eprintln!(
"{}",
"GPU generated 0 tokens, falling back to CPU...".yellow()
);
}
Ok(None)
}
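/// Benchmark an APR model: try the CUDA path first when available, then
/// load the transformer on CPU, warm up, and run timed iterations.
///
/// A sketch of the CPU flow (using the realizar calls below; `gen_config`
/// is assumed to be in scope):
///
/// ```ignore
/// let transformer = AprTransformer::from_apr_file(path)?;
/// let tokens = resolve_apr_prompt_tokens(path, &config.prompt);
/// let output = transformer.generate_with_cache(&tokens, &gen_config)?;
/// ```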
#[cfg(feature = "inference")]
fn run_apr_benchmark(
path: &Path,
config: &BenchConfig,
use_cuda: bool,
tracer: &TracerImpl,
) -> Result<BenchResult> {
use realizar::apr_transformer::{AprTransformer, GenerateConfig};
#[cfg(feature = "cuda")]
if use_cuda {
if let Some(result) = try_cuda_benchmark(path, config, tracer)? {
return Ok(result);
}
}
if !config.quiet {
eprintln!("{}", "Loading APR model (CPU)...".yellow());
}
let start = Instant::now();
let transformer = AprTransformer::from_apr_file(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to load APR: {e}")))?;
let load_time = start.elapsed();
if !config.quiet {
eprintln!(
"{} in {:.2}s",
"Model ready".green(),
load_time.as_secs_f32()
);
eprintln!();
}
let prompt_tokens = resolve_apr_prompt_tokens(path, &config.prompt);
let gen_config = GenerateConfig {
max_tokens: config.max_tokens.min(32),
temperature: 0.0,
top_p: 1.0,
top_k: 0,
repetition_penalty: 1.0,
trace: false,
stop_tokens: vec![],
};
run_bench_warmup(config, config.warmup, || {
let _ = transformer.generate_with_cache(&prompt_tokens, &gen_config);
});
let (iteration_times, total_tokens, first_token_time) =
run_apr_measurement(&transformer, &prompt_tokens, &gen_config, config, tracer)?;
calculate_benchmark_stats(iteration_times, total_tokens, first_token_time, config)
}
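/// Timed measurement loop for the APR CPU path. Each iteration runs
/// under the tracer; generation errors are counted and the output falls
/// back to the prompt (zero new tokens) so timing data is still recorded.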
#[cfg(feature = "inference")]
fn run_apr_measurement(
transformer: &realizar::apr_transformer::AprTransformer,
prompt_tokens: &[u32],
gen_config: &realizar::apr_transformer::GenerateConfig,
config: &BenchConfig,
tracer: &TracerImpl,
) -> Result<(Vec<Duration>, usize, Duration)> {
if !config.quiet {
eprintln!("{}", "Running benchmark...".yellow());
}
let mut iteration_times = Vec::with_capacity(config.iterations);
let mut total_tokens = 0usize;
let mut first_token_time = Duration::ZERO;
let mut generation_failed = false;
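// Per-iteration tracing budget: 100 ms (100_000 us) per requested token.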
let budget_us = config.max_tokens as u64 * 100_000;
let mut generate_errors = 0usize;
for i in 0..config.iterations {
let traced = tracer.trace("bench_apr_iter", budget_us, || {
transformer.generate_with_cache(prompt_tokens, gen_config)
});
let output = match traced.result {
Ok(tokens) => tokens,
Err(e) => {
if generate_errors == 0 {
eprintln!(
"{}",
format!("Warning: generate_with_cache() failed on iteration {i}: {e}").yellow()
);
}
generate_errors += 1;
prompt_tokens.to_vec()
}
};
let tokens_generated = output.len().saturating_sub(prompt_tokens.len());
let iter_time = Duration::from_micros(traced.duration_us);
iteration_times.push(iter_time);
total_tokens += tokens_generated;
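// Approximate time-to-first-token as the mean per-token time of the
// first iteration; a true TTFT would require per-token timestamps.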
if i == 0 {
first_token_time =
Duration::from_secs_f64(iter_time.as_secs_f64() / tokens_generated.max(1) as f64);
if tokens_generated == 0 {
generation_failed = true;
}
}
print_bench_progress(config, i, tokens_generated, iter_time);
}
if !config.quiet {
eprintln!();
}
total_tokens = handle_zero_generation_fallback(
generation_failed,
total_tokens,
config.iterations,
prompt_tokens.len(),
config.quiet,
);
if !config.quiet {
eprintln!();
}
Ok((iteration_times, total_tokens, first_token_time))
}
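/// GPU benchmark for APR models: memory-map the file, build a quantized
/// model with fused Q4K kernels on CUDA device 0, and time GPU-resident
/// generation.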
#[cfg_attr(coverage_nightly, coverage(off))]
#[cfg(all(feature = "inference", feature = "cuda"))]
fn run_apr_cuda_benchmark(
path: &Path,
config: &BenchConfig,
tracer: &TracerImpl,
) -> Result<BenchResult> {
use realizar::apr::{AprV2Model, MappedAprModel};
use realizar::gguf::{OwnedQuantizedModel, OwnedQuantizedModelCuda, QuantizedGenerateConfig};
if !config.quiet {
eprintln!("{}", "Loading APR model (GPU, fused Q4K kernels)...".yellow());
}
let start = Instant::now();
let mapped = MappedAprModel::from_path(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to map APR: {e}")))?;
let tensor_count = mapped.tensors.len();
let prompt_tokens: Vec<u32> = {
let cpu_model = AprV2Model::load(path)
.map_err(|e| CliError::ValidationFailed(format!("Failed to load APR for tokenizer: {e}")))?;
if let Some(tokenizer) = cpu_model.load_embedded_bpe_tokenizer() {
tokenizer.encode(&config.prompt)
} else {
resolve_apr_prompt_tokens(path, &config.prompt)
}
};
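// Convert the mapped APR tensors into the owned quantized model, then
// move it onto CUDA device 0.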
let model = OwnedQuantizedModel::from_apr(&mapped)
.map_err(|e| CliError::ValidationFailed(format!("Failed to create quantized model from APR: {e}")))?;
let mut cuda_model = OwnedQuantizedModelCuda::new(model, 0)
.map_err(|e| CliError::ValidationFailed(format!("Failed to init CUDA: {e}")))?;
let gen_config = QuantizedGenerateConfig {
max_tokens: config.max_tokens.min(128),
temperature: 0.0,
top_k: 1,
..Default::default()
};
let load_time = start.elapsed();
if !config.quiet {
eprintln!(
"{} in {:.2}s ({} tensors, GPU device 0, fused Q4K kernels)",
"Model ready".green(),
load_time.as_secs_f32(),
tensor_count
);
eprintln!();
}
run_bench_warmup(config, config.warmup, || {
let _ = cuda_model.generate_gpu_resident(&prompt_tokens, &gen_config);
});
if !config.quiet {
eprintln!("{}", "Running benchmark (GPU)...".yellow());
}
let mut iteration_times = Vec::with_capacity(config.iterations);
let mut total_tokens = 0usize;
let mut first_token_time = Duration::ZERO;
let budget_us = config.max_tokens as u64 * 100_000;
for i in 0..config.iterations {
let traced = tracer.trace("bench_apr_gpu_iter", budget_us, || {
cuda_model
.generate_gpu_resident(&prompt_tokens, &gen_config)
.unwrap_or_default()
});
let output = traced.result;
let tokens_generated = output.len().saturating_sub(prompt_tokens.len());
let iter_time = Duration::from_micros(traced.duration_us);
iteration_times.push(iter_time);
total_tokens += tokens_generated;
if i == 0 {
first_token_time =
Duration::from_secs_f64(iter_time.as_secs_f64() / tokens_generated.max(1) as f64);
}
print_bench_progress(config, i, tokens_generated, iter_time);
}
if !config.quiet {
eprintln!();
eprintln!();
}
calculate_benchmark_stats(iteration_times, total_tokens, first_token_time, config)
}