use cortex_rust::model::BitLlamaConfig;
use std::fs;
use std::path::Path;
use std::time::Instant;
fn main() -> anyhow::Result<()> {
println!("═══════════════════════════════════════════════════════════════════");
println!(" Llama2-13B 1.58-bit Model Test (GPU FORCED: n_gpu_layers=10)");
println!("═══════════════════════════════════════════════════════════════════\n");
let model_path = Path::new("benchmark/llama2-13b-converted");
if !model_path.exists() {
eprintln!("❌ Model not found at {:?}", model_path);
eprintln!(" Please convert Llama2-13B first using bit_converter");
std::process::exit(1);
}
print_memory_usage("Before loading");
println!("📋 Loading config...");
let config_path = model_path.join("config.json");
let config_str = fs::read_to_string(&config_path)?;
let mut config: BitLlamaConfig = serde_json::from_str(&config_str)?;
config.n_gpu_layers = Some(0);
println!("⚡ CPU-only mode (n_gpu_layers = 0)");
println!(" Hidden dim: {}", config.hidden_dim);
println!(" Layers: {}", config.num_layers);
println!(" Heads: {}", config.n_heads);
println!(" Vocab size: {}", config.vocab_size);
println!(" GPU Layers: {:?}", config.n_gpu_layers);
println!();
println!("📦 Loading model weights with GPU offload...");
let load_start = Instant::now();
let model_file = model_path.join("model.safetensors");
let tokenizer_file = model_path.join("tokenizer.json");
let mut llama = cortex_rust::Llama::load_direct(&model_file, &tokenizer_file, config)?;
let load_time = load_start.elapsed();
println!("✅ Model loaded in {:.2}s", load_time.as_secs_f64());
print_memory_usage("After loading");
println!();
let prompt = "Hello, my name is";
let gen_tokens = 3;
println!("🔤 Prompt: \"{}\"", prompt);
println!(" Generating {} tokens...", gen_tokens);
let gen_start = Instant::now();
let output = llama.generate(prompt, gen_tokens)?;
let gen_time = gen_start.elapsed();
let tokens_per_sec = gen_tokens as f64 / gen_time.as_secs_f64();
println!();
println!("═══════════════════════════════════════════════════════════════════");
println!("📊 Results:");
println!("═══════════════════════════════════════════════════════════════════");
println!(" Output: \"{}\"", output);
println!(" Generation time: {:.2}ms", gen_time.as_millis());
println!(" Throughput: {:.2} tokens/sec", tokens_per_sec);
print_memory_usage("After generation");
println!("═══════════════════════════════════════════════════════════════════");
Ok(())
}
/// Prints a labelled memory checkpoint marker to stdout.
///
/// No memory is measured here; the function only emits `label` followed by
/// a hint telling the user to inspect GPU memory with `nvidia-smi`.
fn print_memory_usage(label: &str) {
    let hint = " (Run 'nvidia-smi' in another terminal for GPU memory details)";
    // One call emits both lines: the checkpoint header, then the hint.
    println!("💾 Memory checkpoint [{}]\n{}", label, hint);
}