Skip to main content

llm_manager/backend/
benchmark.rs

1use std::path::PathBuf;
2use std::time::{Duration, Instant};
3
4use tokio::sync::{mpsc, watch};
5
6use crate::backend::server::{spawn_server, SpawnServerRequest};
7use crate::models::{
8    BenchTuneConfig, BenchTuneMetrics, BenchTuneMode, BenchTuneParamValue, BenchTuneResult,
9    BenchTuneStatus, DiscoveredModel, ModelSettings, ServerMode,
10};
11
// Benchmark tuning constants.

/// Maximum number of health-check polls before a spawned server is given up on.
const HEALTH_CHECK_ITERATIONS: u32 = 120;
/// Pause between two consecutive health-check polls, in milliseconds.
const HEALTH_CHECK_INTERVAL_MS: u64 = 500;
/// A "still waiting" log line is emitted every this many health-check polls.
const HEALTH_CHECK_LOG_INTERVAL: u32 = 10;
/// Timeout of the shared HTTP client used for inference requests, in seconds.
const REQUEST_TIMEOUT_SECS: u64 = 2 * 60;
17
18struct BenchAccumulator {
19    params: BenchTuneParamValue,
20    total_prompt_tokens: u64,
21    total_generation_tokens: u64,
22    total_prompt_time: Duration,
23    total_generation_time: Duration,
24    total_time: Duration,
25    first_token_times: Vec<u128>,
26    outputs: Vec<String>,
27    per_iteration_metrics: Vec<BenchTuneMetrics>,
28    base_settings: Option<ModelSettings>,
29}
30
31fn build_bench_result(acc: BenchAccumulator) -> BenchTuneResult {
32    let BenchAccumulator {
33        params,
34        total_prompt_tokens,
35        total_generation_tokens,
36        total_prompt_time,
37        total_generation_time,
38        total_time,
39        first_token_times,
40        outputs,
41        per_iteration_metrics,
42        base_settings,
43    } = acc;
44    let prompt_tps = if total_prompt_time.as_secs_f64() > 0.0 {
45        (total_prompt_tokens as f64) / total_prompt_time.as_secs_f64()
46    } else {
47        0.0
48    };
49
50    let generation_tps = if total_generation_time.as_secs_f64() > 0.0 {
51        (total_generation_tokens as f64) / total_generation_time.as_secs_f64()
52    } else {
53        0.0
54    };
55
56    let combined_tps = if total_time.as_secs_f64() > 0.0 {
57        ((total_prompt_tokens + total_generation_tokens) as f64) / total_time.as_secs_f64()
58    } else {
59        0.0
60    };
61
62    let avg_latency_per_token = if total_generation_tokens > 0 {
63        total_generation_time.as_millis() as f64 / (total_generation_tokens as f64)
64    } else {
65        0.0
66    };
67
68    let avg_first_token_time = if !first_token_times.is_empty() {
69        first_token_times.iter().sum::<u128>() as f64 / first_token_times.len() as f64
70    } else {
71        0.0
72    };
73
74    BenchTuneResult {
75        params,
76        metrics: BenchTuneMetrics {
77            prompt_tps,
78            generation_tps,
79            combined_tps,
80            latency_per_token: avg_latency_per_token,
81            first_token_time: avg_first_token_time,
82        },
83        outputs,
84        per_iteration_metrics,
85        base_settings,
86    }
87}
88
89pub struct BenchTuneRequest<'a> {
90    pub main_config: &'a crate::config::Config,
91    pub config: &'a BenchTuneConfig,
92    pub model: &'a DiscoveredModel,
93    pub settings: &'a ModelSettings,
94    pub progress_tx: mpsc::Sender<BenchTuneStatus>,
95    pub log_tx: mpsc::Sender<String>,
96    pub cancel_rx: &'a mut watch::Receiver<bool>,
97}
98
99/// Run a benchmark tuning test with multiple parameter combinations
100pub async fn run_bench_tune(
101    req: BenchTuneRequest<'_>,
102) -> Result<Vec<BenchTuneResult>, Box<dyn std::error::Error + Send + Sync>> {
103    let BenchTuneRequest {
104        main_config,
105        config,
106        model,
107        settings,
108        progress_tx,
109        log_tx,
110        cancel_rx,
111    } = req;
112    let start_time = Instant::now();
113    let total_tests = config.get_total_tests_count();
114
115    // Warn on large runs
116    if total_tests > 500 {
117        let _ = log_tx
118            .send(format!(
119                "WARNING: Benchmark will run {} combinations. This may take a long time.",
120                total_tests
121            ))
122            .await;
123    }
124
125    // Generate all parameter combinations
126    let combinations = config.generate_combinations();
127
128    // Results storage
129    let mut results = Vec::new();
130    let mut failed_tests: Vec<(usize, String)> = Vec::new();
131
132    // Apply chat_template_kwargs from config to settings
133    let mut settings = settings.clone();
134    if let Some(kwargs) = &config.chat_template_kwargs {
135        settings.chat_template_kwargs = Some(kwargs.clone());
136    }
137
138    // Create a shared HTTP client for all inference requests
139    let client = reqwest::Client::builder()
140        .timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
141        .build()?;
142
143    // If runtime-only mode, send params in request body (no server restarts)
144    if config.bench_mode == BenchTuneMode::RuntimeOnly {
145        // Spawn a single server for all runtime-only iterations
146        let (exit_tx, _exit_rx) = tokio::sync::mpsc::channel(1);
147        let (server_handle, _command) = spawn_server(SpawnServerRequest {
148            config: main_config,
149            model: Some(model),
150            settings: &settings,
151            log_tx: log_tx.clone(),
152            progress_tx: None,
153            server_mode: ServerMode::Normal,
154            router_max_models: 1,
155            exit_tx,
156        })
157        .await?;
158
159        let host = if server_handle.host == "0.0.0.0" {
160            "127.0.0.1"
161        } else {
162            &server_handle.host
163        };
164
165        // Wait for server to be ready
166        for i in 0..HEALTH_CHECK_ITERATIONS {
167            if *cancel_rx.borrow() {
168                let _ = crate::backend::server::kill_server(server_handle).await;
169                let elapsed = start_time.elapsed();
170                progress_tx
171                    .send(BenchTuneStatus::Cancelled {
172                        total_tests,
173                        successful_tests: results.len(),
174                        failed_tests: failed_tests.len(),
175                        elapsed,
176                    })
177                    .await?;
178                return Ok(results);
179            }
180            if crate::backend::server::check_health(host, server_handle.port).await {
181                break;
182            }
183            if i % HEALTH_CHECK_LOG_INTERVAL == 0 && i > 0 {
184                let _ = log_tx
185                    .send(format!(
186                        "  ... still waiting ({:.0}s)...",
187                        i as f32 * (HEALTH_CHECK_INTERVAL_MS as f32 / 1000.0)
188                    ))
189                    .await;
190            }
191            tokio::time::sleep(Duration::from_millis(HEALTH_CHECK_INTERVAL_MS)).await;
192        }
193
194        let server_port = server_handle.port;
195        let server_host = host.to_string();
196
197        for (idx, combination) in combinations.iter().enumerate() {
198            // Check cancellation before each test
199            if *cancel_rx.borrow() {
200                let _ = crate::backend::server::kill_server(server_handle).await;
201                let elapsed = start_time.elapsed();
202                progress_tx
203                    .send(BenchTuneStatus::Cancelled {
204                        total_tests,
205                        successful_tests: results.len(),
206                        failed_tests: failed_tests.len(),
207                        elapsed,
208                    })
209                    .await?;
210                return Ok(results);
211            }
212
213            let progress = (idx as f32 / total_tests as f32) * 100.0;
214            progress_tx
215                .send(BenchTuneStatus::Running {
216                    current: idx + 1,
217                    total: total_tests,
218                    progress,
219                    current_params: combination.clone(),
220                })
221                .await?;
222
223            let result = run_bench_tune_runtime_only(RuntimeOnlyCtx {
224                params: combination,
225                settings: &settings,
226                num_iterations: config.num_iterations,
227                prompt: config.prompt.clone(),
228                server_host: &server_host,
229                server_port,
230                log_tx: log_tx.clone(),
231                config,
232                client: &client,
233            })
234            .await;
235
236            match result {
237                Ok(test_result) => results.push(test_result),
238                Err(e) => {
239                    failed_tests.push((idx + 1, e.to_string()));
240                    let _ = log_tx
241                        .send(format!(
242                            "Benchmark test {}/{} failed: {}",
243                            idx + 1,
244                            total_tests,
245                            e
246                        ))
247                        .await;
248                }
249            }
250        }
251
252        let _ = crate::backend::server::kill_server(server_handle).await;
253    } else {
254        // Full mode: spawn a new server for each parameter combination
255        for (idx, combination) in combinations.iter().enumerate() {
256            // Check cancellation before each test
257            if *cancel_rx.borrow() {
258                let elapsed = start_time.elapsed();
259                progress_tx
260                    .send(BenchTuneStatus::Cancelled {
261                        total_tests,
262                        successful_tests: results.len(),
263                        failed_tests: failed_tests.len(),
264                        elapsed,
265                    })
266                    .await?;
267                return Ok(results);
268            }
269
270            let progress = (idx as f32 / total_tests as f32) * 100.0;
271            progress_tx
272                .send(BenchTuneStatus::Running {
273                    current: idx + 1,
274                    total: total_tests,
275                    progress,
276                    current_params: combination.clone(),
277                })
278                .await?;
279
280            let result = run_bench_tune_single_test(SingleTestCtx {
281                main_config,
282                params: combination,
283                model,
284                base_settings: &settings,
285                num_iterations: config.num_iterations,
286                prompt: config.prompt.clone(),
287                log_tx: log_tx.clone(),
288                config,
289                client: &client,
290            })
291            .await;
292
293            match result {
294                Ok(test_result) => results.push(test_result),
295                Err(e) => {
296                    failed_tests.push((idx + 1, e.to_string()));
297                    let _ = log_tx
298                        .send(format!(
299                            "Benchmark test {}/{} failed: {}",
300                            idx + 1,
301                            total_tests,
302                            e
303                        ))
304                        .await;
305                }
306            }
307        }
308    }
309
310    // Sort results by combined_tps (descending)
311    results.sort_by(|a, b| {
312        b.metrics
313            .combined_tps
314            .partial_cmp(&a.metrics.combined_tps)
315            .unwrap_or(std::cmp::Ordering::Equal)
316    });
317
318    let elapsed = start_time.elapsed();
319    let successful_tests = results.len();
320    let failed_count = failed_tests.len();
321
322    // Final progress update - distinguish between full success and partial success
323    if failed_count > 0 {
324        progress_tx
325            .send(BenchTuneStatus::PartiallyCompleted {
326                total_tests,
327                successful_tests,
328                failed_tests: failed_count,
329                elapsed,
330            })
331            .await?;
332    } else {
333        progress_tx
334            .send(BenchTuneStatus::Completed {
335                total_tests,
336                successful_tests,
337                elapsed,
338            })
339            .await?;
340    }
341
342    Ok(results)
343}
344
345/// Run inference iterations and accumulate metrics into a BenchTuneResult.
346struct IterationLoopCtx<'a> {
347    prompt: &'a str,
348    host: &'a str,
349    port: u16,
350    params: &'a BenchTuneParamValue,
351    num_iterations: u32,
352    config: &'a BenchTuneConfig,
353    client: &'a reqwest::Client,
354    log_tx: mpsc::Sender<String>,
355    log_prefix: &'a str,
356}
357
358/// Shared by both runtime-only and full benchmark modes.
359async fn run_iteration_loop(
360    ctx: IterationLoopCtx<'_>,
361) -> Result<BenchTuneResult, Box<dyn std::error::Error + Send + Sync>> {
362    let IterationLoopCtx {
363        prompt,
364        host,
365        port,
366        params,
367        num_iterations,
368        config,
369        client,
370        log_tx,
371        log_prefix,
372    } = ctx;
373    let mut total_prompt_tokens = 0u64;
374    let mut total_generation_tokens = 0u64;
375    let mut total_prompt_time = Duration::ZERO;
376    let mut total_generation_time = Duration::ZERO;
377    let mut total_time = Duration::ZERO;
378    let mut first_token_times = Vec::new();
379    let mut outputs = Vec::new();
380    let mut per_iteration_metrics = Vec::new();
381
382    let _ = log_tx
383        .send(format!(
384            "Running {} inference iterations {}...",
385            num_iterations, log_prefix
386        ))
387        .await;
388
389    for i in 0..num_iterations {
390        let result = send_inference_request(prompt, host, port, params, config, client).await;
391
392        match result {
393            Ok(res) => {
394                total_prompt_tokens += res.prompt_tokens;
395                total_generation_tokens += res.generation_tokens;
396                total_prompt_time += res.prompt_time;
397                total_generation_time += res.generation_time;
398                total_time += res.total_time;
399                first_token_times.push(res.first_token_time);
400                outputs.push(res.content.clone());
401
402                let iter_prompt_tps = if res.prompt_time.as_secs_f64() > 0.0 {
403                    res.prompt_tokens as f64 / res.prompt_time.as_secs_f64()
404                } else {
405                    0.0
406                };
407                let iter_gen_tps = if res.generation_time.as_secs_f64() > 0.0 {
408                    res.generation_tokens as f64 / res.generation_time.as_secs_f64()
409                } else {
410                    0.0
411                };
412                let iter_latency = if res.generation_tokens > 0 {
413                    res.generation_time.as_millis() as f64 / res.generation_tokens as f64
414                } else {
415                    0.0
416                };
417
418                per_iteration_metrics.push(BenchTuneMetrics {
419                    prompt_tps: iter_prompt_tps,
420                    generation_tps: iter_gen_tps,
421                    combined_tps: 0.0,
422                    latency_per_token: iter_latency,
423                    first_token_time: res.first_token_time as f64,
424                });
425
426                if num_iterations > 1 {
427                    let _ = log_tx
428                        .send(format!(
429                            "  Iteration {}/{}: {:.2} gen t/s",
430                            i + 1,
431                            num_iterations,
432                            iter_gen_tps
433                        ))
434                        .await;
435                }
436
437                let _ = log_tx
438                    .send(format!(
439                        "--- Generated Output (Iter {}) ---\n{}\n----------------------------------",
440                        i + 1,
441                        res.content
442                    ))
443                    .await;
444            }
445            Err(e) => {
446                let _ = log_tx
447                    .send(format!(
448                        "  Iteration {}/{} FAILED: {}",
449                        i + 1,
450                        num_iterations,
451                        e
452                    ))
453                    .await;
454                if i == 0 {
455                    return Err(format!("Inference failed: {}", e).into());
456                }
457            }
458        }
459    }
460
461    Ok(build_bench_result(BenchAccumulator {
462        params: params.clone(),
463        total_prompt_tokens,
464        total_generation_tokens,
465        total_prompt_time,
466        total_generation_time,
467        total_time,
468        first_token_times,
469        outputs,
470        per_iteration_metrics,
471        base_settings: None,
472    }))
473}
474
475struct RuntimeOnlyCtx<'a> {
476    params: &'a BenchTuneParamValue,
477    settings: &'a ModelSettings,
478    num_iterations: u32,
479    prompt: String,
480    server_host: &'a str,
481    server_port: u16,
482    log_tx: mpsc::Sender<String>,
483    config: &'a BenchTuneConfig,
484    client: &'a reqwest::Client,
485}
486
487/// Run benchmark in runtime-only mode: sends params in /completion request body, no server restarts
488async fn run_bench_tune_runtime_only(
489    ctx: RuntimeOnlyCtx<'_>,
490) -> Result<BenchTuneResult, Box<dyn std::error::Error + Send + Sync>> {
491    let RuntimeOnlyCtx {
492        params,
493        settings,
494        num_iterations,
495        prompt,
496        server_host,
497        server_port,
498        log_tx,
499        config,
500        client,
501    } = ctx;
502    run_iteration_loop(IterationLoopCtx {
503        prompt: &prompt,
504        host: server_host,
505        port: server_port,
506        params,
507        num_iterations,
508        config,
509        client,
510        log_tx,
511        log_prefix: "(runtime-only mode)",
512    })
513    .await
514    .map(|mut r| {
515        r.base_settings = Some(settings.clone());
516        r
517    })
518}
519
520struct SingleTestCtx<'a> {
521    main_config: &'a crate::config::Config,
522    params: &'a BenchTuneParamValue,
523    model: &'a DiscoveredModel,
524    base_settings: &'a ModelSettings,
525    num_iterations: u32,
526    prompt: String,
527    log_tx: mpsc::Sender<String>,
528    config: &'a BenchTuneConfig,
529    client: &'a reqwest::Client,
530}
531
532/// Run a single benchmark tuning test with specific parameters
533async fn run_bench_tune_single_test(
534    ctx: SingleTestCtx<'_>,
535) -> Result<BenchTuneResult, Box<dyn std::error::Error + Send + Sync>> {
536    let SingleTestCtx {
537        main_config,
538        params,
539        model,
540        base_settings,
541        num_iterations,
542        prompt,
543        log_tx,
544        config,
545        client,
546    } = ctx;
547    // Create settings with test parameters
548    let mut settings = base_settings.clone();
549
550    // Apply test parameters
551    if let Some(temperature) = params.temperature {
552        settings.temperature = temperature as f32;
553    }
554    if let Some(top_p) = params.top_p {
555        settings.top_p = top_p as f32;
556    }
557    if let Some(top_k) = params.top_k {
558        settings.top_k = top_k as i32;
559    }
560    if let Some(repeat_penalty) = params.repeat_penalty {
561        settings.repeat_penalty = repeat_penalty as f32;
562    }
563    if let Some(flash_attn) = params.flash_attn {
564        settings.flash_attn = flash_attn;
565    }
566    if let Some(threads) = params.threads {
567        settings.threads = threads;
568        settings.threads_batch = threads; // Usually keep them equal for benchmarks
569    }
570    if let Some(batch_size) = params.batch_size {
571        settings.batch_size = batch_size;
572        settings.ubatch_size = batch_size;
573    }
574    if let Some(expert_count) = params.expert_count {
575        settings.expert_count = expert_count;
576    }
577
578    // Spawn server with test parameters
579    let (exit_tx, _exit_rx) = tokio::sync::mpsc::channel(1);
580    let (server_handle, _command) = spawn_server(SpawnServerRequest {
581        config: main_config,
582        model: Some(model),
583        settings: &settings,
584        log_tx: log_tx.clone(),
585        progress_tx: None,
586        server_mode: ServerMode::Normal,
587        router_max_models: 1,
588        exit_tx,
589    })
590    .await?;
591    // Wait for server to be ready
592    let mut ready = false;
593    let host = if server_handle.host == "0.0.0.0" {
594        "127.0.0.1"
595    } else {
596        &server_handle.host
597    };
598
599    let _ = log_tx
600        .send(format!(
601            "Waiting for server on {}:{}...",
602            host, server_handle.port
603        ))
604        .await;
605
606    for i in 0..HEALTH_CHECK_ITERATIONS {
607        if crate::backend::server::check_health(host, server_handle.port).await {
608            ready = true;
609            break;
610        }
611        if i % HEALTH_CHECK_LOG_INTERVAL == 0 && i > 0 {
612            let _ = log_tx
613                .send(format!(
614                    "  ... still waiting ({:.0}s)...",
615                    i as f32 * (HEALTH_CHECK_INTERVAL_MS as f32 / 1000.0)
616                ))
617                .await;
618        }
619        tokio::time::sleep(Duration::from_millis(HEALTH_CHECK_INTERVAL_MS)).await;
620    }
621
622    if !ready {
623        let _ = log_tx
624            .send("Error: Server health check timed out".to_string())
625            .await;
626        let _ = crate::backend::server::kill_server(server_handle).await;
627        return Err("Server failed to become healthy".into());
628    }
629
630    let result = run_iteration_loop(IterationLoopCtx {
631        prompt: &prompt,
632        host,
633        port: server_handle.port,
634        params,
635        num_iterations,
636        config,
637        client,
638        log_tx,
639        log_prefix: "",
640    })
641    .await;
642
643    let _ = crate::backend::server::kill_server(server_handle).await;
644    tokio::time::sleep(Duration::from_secs(1)).await;
645
646    result.map(|mut r| {
647        r.base_settings = Some(base_settings.clone());
648        r
649    })
650}
651
652/// Send an inference request and measure response time
653async fn send_inference_request(
654    prompt: &str,
655    host: &str,
656    port: u16,
657    params: &BenchTuneParamValue,
658    config: &BenchTuneConfig,
659    client: &reqwest::Client,
660) -> Result<InferenceResult, Box<dyn std::error::Error + Send + Sync>> {
661    // Build request body with benchmark params
662    let mut body = serde_json::json!({
663        "prompt": prompt,
664        "n_predict": config.n_predict,
665        "stream": false
666    });
667
668    if let Some(temperature) = params.temperature {
669        body["temperature"] = serde_json::json!(temperature);
670    }
671    if let Some(top_p) = params.top_p {
672        body["top_p"] = serde_json::json!(top_p);
673    }
674    if let Some(top_k) = params.top_k {
675        body["top_k"] = serde_json::json!(top_k);
676    }
677    if let Some(repeat_penalty) = params.repeat_penalty {
678        body["repeat_penalty"] = serde_json::json!(repeat_penalty);
679    }
680
681    let url = format!("http://{}:{}/completion", host, port);
682    let start = Instant::now();
683    let resp = client.post(url).json(&body).send().await?;
684
685    if !resp.status().is_success() {
686        let status = resp.status();
687        let body = resp.text().await.unwrap_or_else(|_| "no body".to_string());
688        return Err(format!("Server returned error {}: {}", status, body).into());
689    }
690
691    let total_time = start.elapsed();
692    let json: serde_json::Value = resp.json().await?;
693
694    // Robust timings parsing
695    let prompt_tokens = json["tokens_evaluated"]
696        .as_u64()
697        .or_else(|| json["prompt_n"].as_u64())
698        .unwrap_or(0);
699
700    let generation_tokens = json["tokens_predicted"]
701        .as_u64()
702        .or_else(|| json["predicted_n"].as_u64())
703        .unwrap_or(0);
704
705    let timings = &json["timings"];
706    let prompt_time_ms = timings["prompt_ms"]
707        .as_f64()
708        .or_else(|| timings["prompt_eval_ms"].as_f64())
709        .unwrap_or(0.0);
710
711    let generation_time_ms = timings["predicted_ms"]
712        .as_f64()
713        .or_else(|| timings["eval_ms"].as_f64())
714        .unwrap_or(0.0);
715
716    Ok(InferenceResult {
717        prompt_tokens,
718        generation_tokens,
719        prompt_time: Duration::from_millis(prompt_time_ms as u64),
720        generation_time: Duration::from_millis(generation_time_ms as u64),
721        total_time,
722        first_token_time: prompt_time_ms as u128,
723        content: json["content"].as_str().unwrap_or("").to_string(),
724    })
725}
726
727/// Save benchmark results to disk in Markdown format
728pub async fn save_results(
729    results: &[BenchTuneResult],
730    output_dir: &PathBuf,
731    config: &BenchTuneConfig,
732) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
733    // Create output directory if it doesn't exist
734    std::fs::create_dir_all(output_dir)?;
735
736    // Generate timestamp for the filename
737    let timestamp = chrono::Local::now().format("%Y%m%d_%H%M%S");
738    let filename = format!("benchmark_{}.md", timestamp);
739    let filepath = output_dir.join(filename);
740
741    let mut md = String::new();
742    md.push_str("# LLM Benchmark Results\n\n");
743    md.push_str(&format!(
744        "Generated on: {}\n\n",
745        chrono::Local::now().format("%Y-%m-%d %H:%M:%S")
746    ));
747
748    md.push_str("| Temp | Top-P | Top-K | RepPen | FA | Threads | Batch | Exp | Spec | Draft | Prompt t/s | Gen t/s | Latency (ms) | First Tok (ms) |\n");
749    md.push_str("|------|-------|-------|--------|----|---------|-------|-----|------|-------|------------|---------|--------------|----------------|\n");
750
751    for r in results {
752        let temp = r
753            .params
754            .temperature
755            .map(|v| format!("{:.2}", v))
756            .unwrap_or_else(|| "-".to_string());
757        let top_p = r
758            .params
759            .top_p
760            .map(|v| format!("{:.2}", v))
761            .unwrap_or_else(|| "-".to_string());
762        let top_k = r
763            .params
764            .top_k
765            .map(|v| v.to_string())
766            .unwrap_or_else(|| "-".to_string());
767        let rep_pen = r
768            .params
769            .repeat_penalty
770            .map(|v| format!("{:.2}", v))
771            .unwrap_or_else(|| "-".to_string());
772        let fa = r
773            .params
774            .flash_attn
775            .map(|v| if v { "ON" } else { "OFF" })
776            .unwrap_or("-");
777        let threads = r
778            .params
779            .threads
780            .map(|v| v.to_string())
781            .unwrap_or_else(|| "-".to_string());
782        let batch = r
783            .params
784            .batch_size
785            .map(|v| v.to_string())
786            .unwrap_or_else(|| "-".to_string());
787        let exp = r
788            .params
789            .expert_count
790            .map(|v| v.to_string())
791            .unwrap_or_else(|| "-".to_string());
792
793        let spec = r
794            .params
795            .spec_type
796            .as_ref()
797            .map(|s| {
798                if s.is_empty() {
799                    "-".to_string()
800                } else {
801                    s.clone()
802                }
803            })
804            .unwrap_or_else(|| "-".to_string());
805        let draft = r
806            .params
807            .draft_tokens
808            .map(|v| v.to_string())
809            .unwrap_or_else(|| "-".to_string());
810
811        md.push_str(&format!(
812            "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {:.2} | {:.2} | {:.2} | {:.2} |\n",
813            temp,
814            top_p,
815            top_k,
816            rep_pen,
817            fa,
818            threads,
819            batch,
820            exp,
821            spec,
822            draft,
823            r.metrics.prompt_tps,
824            r.metrics.generation_tps,
825            r.metrics.latency_per_token,
826            r.metrics.first_token_time
827        ));
828    }
829
830    tokio::fs::write(&filepath, md).await?;
831
832    // Save full results as JSON with outputs
833    let json_filename = format!("benchmark_{}.json", timestamp);
834    let json_filepath = output_dir.join(&json_filename);
835    let json_content = serde_json::to_string_pretty(&results)?;
836    tokio::fs::write(&json_filepath, json_content).await?;
837
838    // Also save full results as YAML with outputs
839    let yaml_filename = format!("benchmark_{}.yaml", timestamp);
840    let yaml_filepath = output_dir.join(&yaml_filename);
841    let yaml_content = serde_yaml::to_string(&results)?;
842    tokio::fs::write(&yaml_filepath, yaml_content).await?;
843
844    // Generate HTML report
845    let html_filename = format!("benchmark_{}.html", timestamp);
846    let html_filepath = output_dir.join(&html_filename);
847    let html_content = generate_html_report(results, config);
848    tokio::fs::write(&html_filepath, html_content).await?;
849
850    Ok(())
851}
852
853fn generate_html_report(results: &[BenchTuneResult], config: &BenchTuneConfig) -> String {
854    use chrono::Local;
855
856    let total_tests = results.len();
857    let timestamp = Local::now().format("%Y-%m-%d %H:%M:%S").to_string();
858
859    // Extract model metadata from first result's base_settings
860    let model_info = results.first().and_then(|r| {
861        r.base_settings.as_ref().map(|s| {
862            let model_name = if config.model_path.file_name().is_some() {
863                config
864                    .model_path
865                    .file_name()
866                    .unwrap()
867                    .to_string_lossy()
868                    .to_string()
869            } else {
870                config.model_path.display().to_string()
871            };
872            let file_size_mb = results
873                .first()
874                .and_then(|r| {
875                    r.base_settings.as_ref().map(|_s| {
876                        // We don't have file_size in settings, use a placeholder
877                        0u64
878                    })
879                })
880                .unwrap_or(0);
881            (model_name, file_size_mb, s.clone())
882        })
883    });
884
885    // Resolve benchmark params against base settings (fill in None with base values)
886    struct ResolvedParams {
887        temperature: f64,
888        top_p: f64,
889        top_k: i64,
890        repeat_penalty: f64,
891        flash_attn: bool,
892        threads: u32,
893        batch_size: u32,
894        expert_count: i32,
895        spec_type: String,
896        draft_tokens: u32,
897    }
898
899    fn resolve_params(
900        params: &BenchTuneParamValue,
901        base: &crate::models::ModelSettings,
902    ) -> ResolvedParams {
903        ResolvedParams {
904            temperature: params.temperature.unwrap_or(base.temperature as f64),
905            top_p: params.top_p.unwrap_or(base.top_p as f64),
906            top_k: params.top_k.unwrap_or(base.top_k as i64),
907            repeat_penalty: params.repeat_penalty.unwrap_or(base.repeat_penalty as f64),
908            flash_attn: params.flash_attn.unwrap_or(base.flash_attn),
909            threads: params.threads.unwrap_or(base.threads),
910            batch_size: params.batch_size.unwrap_or(base.batch_size),
911            expert_count: params.expert_count.unwrap_or(base.expert_count),
912            spec_type: params
913                .spec_type
914                .clone()
915                .unwrap_or_else(|| base.spec_type.clone()),
916            draft_tokens: params.draft_tokens.unwrap_or(base.draft_tokens),
917        }
918    }
919
920    // Statistics helpers
921    fn mean(vals: &[f64]) -> f64 {
922        if vals.is_empty() {
923            return 0.0;
924        }
925        vals.iter().sum::<f64>() / vals.len() as f64
926    }
927    fn median(vals: &mut [f64]) -> f64 {
928        if vals.is_empty() {
929            return 0.0;
930        }
931        vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
932        let mid = vals.len() / 2;
933        if vals.len().is_multiple_of(2) {
934            (vals[mid - 1] + vals[mid]) / 2.0
935        } else {
936            vals[mid]
937        }
938    }
939    fn std_dev(vals: &[f64], avg: f64) -> f64 {
940        if vals.len() <= 1 {
941            return 0.0;
942        }
943        let variance =
944            vals.iter().map(|v| (v - avg).powi(2)).sum::<f64>() / (vals.len() - 1) as f64;
945        variance.sqrt()
946    }
947    fn min_val(vals: &[f64]) -> f64 {
948        vals.iter().cloned().fold(f64::INFINITY, f64::min)
949    }
950    fn max_val(vals: &[f64]) -> f64 {
951        vals.iter().cloned().fold(f64::NEG_INFINITY, f64::max)
952    }
953
954    let gen_tps: Vec<f64> = results.iter().map(|r| r.metrics.generation_tps).collect();
955    let mut prompt_tps: Vec<f64> = results.iter().map(|r| r.metrics.prompt_tps).collect();
956    let latency: Vec<f64> = results
957        .iter()
958        .map(|r| r.metrics.latency_per_token)
959        .collect();
960    let mut first_token: Vec<f64> = results.iter().map(|r| r.metrics.first_token_time).collect();
961
962    let mut gen_tps_sorted = gen_tps.clone();
963    let mut latency_sorted = latency.clone();
964
965    let avg_gen_tps = mean(&gen_tps);
966    let avg_prompt_tps = mean(&prompt_tps);
967    let avg_latency = mean(&latency);
968    let avg_first_token = mean(&first_token);
969    let _avg_combined_tps = mean(
970        &results
971            .iter()
972            .map(|r| r.metrics.combined_tps)
973            .collect::<Vec<f64>>(),
974    );
975
976    let gen_std = std_dev(&gen_tps, avg_gen_tps);
977    let prompt_std = std_dev(&prompt_tps, avg_prompt_tps);
978    let lat_std = std_dev(&latency, avg_latency);
979    let ft_std = std_dev(&first_token, avg_first_token);
980
981    let best_idx = results
982        .iter()
983        .enumerate()
984        .max_by(|a, b| {
985            a.1.metrics
986                .generation_tps
987                .partial_cmp(&b.1.metrics.generation_tps)
988                .unwrap_or(std::cmp::Ordering::Equal)
989        })
990        .map(|(i, _)| i);
991    let best_gen_tps = if !gen_tps.is_empty() {
992        max_val(&gen_tps)
993    } else {
994        0.0
995    };
996    let best_prompt_tps = if !prompt_tps.is_empty() {
997        max_val(&prompt_tps)
998    } else {
999        0.0
1000    };
1001    let best_latency = if !latency.is_empty() {
1002        min_val(&latency)
1003    } else {
1004        0.0
1005    };
1006    let best_first_token = if !first_token.is_empty() {
1007        min_val(&first_token)
1008    } else {
1009        0.0
1010    };
1011    let min_gen_tps = min_val(&gen_tps);
1012    let min_prompt_tps = min_val(&prompt_tps);
1013    let min_latency = min_val(&latency);
1014    let min_first_token = min_val(&first_token);
1015
1016    // Per-parameter impact analysis
1017    let param_names = [("temperature", "Temperature"),
1018        ("top_p", "Top-P"),
1019        ("top_k", "Top-K"),
1020        ("repeat_penalty", "Repeat Penalty"),
1021        ("flash_attn", "Flash Attention"),
1022        ("threads", "Threads"),
1023        ("batch_size", "Batch Size"),
1024        ("expert_count", "Experts")];
1025
1026    let impact_data: Vec<(String, String, f64)> = param_names
1027        .iter()
1028        .filter_map(|(key, label)| {
1029            let values: Vec<f64> = results
1030                .iter()
1031                .filter_map(|r| {
1032                    let base = r.base_settings.as_ref()?;
1033                    let rp = resolve_params(&r.params, base);
1034                    Some(match *key {
1035                        "temperature" => rp.temperature,
1036                        "top_p" => rp.top_p,
1037                        "top_k" => rp.top_k as f64,
1038                        "repeat_penalty" => rp.repeat_penalty,
1039                        "flash_attn" => {
1040                            if rp.flash_attn {
1041                                1.0
1042                            } else {
1043                                0.0
1044                            }
1045                        }
1046                        "threads" => rp.threads as f64,
1047                        "batch_size" => rp.batch_size as f64,
1048                        "expert_count" => rp.expert_count as f64,
1049                        _ => return None,
1050                    })
1051                })
1052                .collect();
1053
1054            // Group by parameter value and compute mean gen_tps per group
1055            let mut groups: std::collections::HashMap<String, Vec<f64>> =
1056                std::collections::HashMap::new();
1057            for (r, v) in results.iter().zip(values.iter()) {
1058                let key_str = if *key == "flash_attn" {
1059                    if *v > 0.5 {
1060                        "ON".to_string()
1061                    } else {
1062                        "OFF".to_string()
1063                    }
1064                } else {
1065                    format!("{:.2}", v)
1066                };
1067                groups
1068                    .entry(key_str)
1069                    .or_default()
1070                    .push(r.metrics.generation_tps);
1071            }
1072
1073            if groups.len() <= 1 {
1074                return None;
1075            } // Parameter doesn't vary
1076
1077            let group_means: Vec<f64> = groups.values().map(|vals| mean(vals)).collect();
1078            let spread = max_val(&group_means) - min_val(&group_means);
1079            Some((label.to_string(), format!("{:.1}", spread), spread))
1080        })
1081        .collect();
1082
1083    // Sort by impact (spread) descending
1084    let mut impact_sorted = impact_data.clone();
1085    impact_sorted.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
1086
1087    // Consistency indicator (coefficient of variation from per-iteration metrics)
1088    let consistency_data: Vec<f64> = results
1089        .iter()
1090        .map(|r| {
1091            if r.per_iteration_metrics.len() <= 1 {
1092                return 1.0; // No consistency data = neutral
1093            }
1094            let iter_gen_tps: Vec<f64> = r
1095                .per_iteration_metrics
1096                .iter()
1097                .map(|m| m.generation_tps)
1098                .collect();
1099            let iter_mean = mean(&iter_gen_tps);
1100            if iter_mean == 0.0 {
1101                return 1.0;
1102            }
1103            let iter_std = std_dev(&iter_gen_tps, iter_mean);
1104            let cv = iter_std / iter_mean; // Coefficient of variation
1105            // Map CV to 0-1 score (lower CV = more consistent = higher score)
1106            // CV of 0% = 1.0 (perfect), CV of 20%+ = 0.0 (poor)
1107            (1.0 - (cv * 5.0)).clamp(0.0, 1.0)
1108        })
1109        .collect();
1110
1111    // Top N for charts
1112    let top_n = std::cmp::min(20, total_tests);
1113    let top_indices: Vec<(usize, usize)> = (0..total_tests)
1114        .map(|i| (i, results[i].metrics.generation_tps))
1115        .enumerate()
1116        .take(top_n)
1117        .map(|(rank, (idx, _))| (rank + 1, idx))
1118        .collect();
1119
1120    let top_labels: Vec<String> = top_indices
1121        .iter()
1122        .map(|(_rank, idx)| {
1123            let base = results[*idx].base_settings.as_ref().unwrap();
1124            let rp = resolve_params(&results[*idx].params, base);
1125            format!("T={:.2} TP={:.2}", rp.temperature, rp.top_p)
1126        })
1127        .collect();
1128    let top_gen_tps: Vec<f64> = top_indices
1129        .iter()
1130        .map(|(_, idx)| results[*idx].metrics.generation_tps)
1131        .collect();
1132
1133    // Scatter data with labels
1134    let scatter_gen_tps: Vec<f64> = results.iter().map(|r| r.metrics.generation_tps).collect();
1135    let scatter_latency: Vec<f64> = results
1136        .iter()
1137        .map(|r| r.metrics.latency_per_token)
1138        .collect();
1139    let scatter_first_token: Vec<f64> =
1140        results.iter().map(|r| r.metrics.first_token_time).collect();
1141
1142    let param_headers: Vec<String> = vec![
1143        "Temp".to_string(),
1144        "Top-P".to_string(),
1145        "Top-K".to_string(),
1146        "RepPen".to_string(),
1147        "FA".to_string(),
1148        "Threads".to_string(),
1149        "Batch".to_string(),
1150        "Exp".to_string(),
1151        "Spec".to_string(),
1152        "Draft".to_string(),
1153    ];
1154    let param_vals: Vec<Vec<String>> = results
1155        .iter()
1156        .map(|r| {
1157            let base = r.base_settings.as_ref().unwrap();
1158            let rp = resolve_params(&r.params, base);
1159            vec![
1160                format!("{:.2}", rp.temperature),
1161                format!("{:.2}", rp.top_p),
1162                rp.top_k.to_string(),
1163                format!("{:.2}", rp.repeat_penalty),
1164                if rp.flash_attn {
1165                    "ON".to_string()
1166                } else {
1167                    "OFF".to_string()
1168                },
1169                rp.threads.to_string(),
1170                rp.batch_size.to_string(),
1171                rp.expert_count.to_string(),
1172                if rp.spec_type.is_empty() {
1173                    "-".to_string()
1174                } else {
1175                    rp.spec_type.clone()
1176                },
1177                rp.draft_tokens.to_string(),
1178            ]
1179        })
1180        .collect();
1181
1182    // Build metrics JSON with consistency data
1183    let metrics_data: Vec<serde_json::Value> = results
1184        .iter()
1185        .enumerate()
1186        .map(|(i, r)| {
1187            let base = r.base_settings.as_ref().unwrap();
1188            let rp = resolve_params(&r.params, base);
1189            serde_json::json!({
1190                "idx": i,
1191                "temp": rp.temperature,
1192                "top_p": rp.top_p,
1193                "top_k": rp.top_k,
1194                "repeat_penalty": rp.repeat_penalty,
1195                "flash_attn": rp.flash_attn,
1196                "threads": rp.threads,
1197                "batch_size": rp.batch_size,
1198                "expert_count": rp.expert_count,
1199                "spec_type": rp.spec_type,
1200                "draft_tokens": rp.draft_tokens,
1201                "prompt_tps": r.metrics.prompt_tps,
1202                "generation_tps": r.metrics.generation_tps,
1203                "combined_tps": r.metrics.combined_tps,
1204                "latency_per_token": r.metrics.latency_per_token,
1205                "first_token_time": r.metrics.first_token_time,
1206                "consistency": consistency_data[i],
1207                "outputs": r.outputs,
1208                "per_iteration_metrics": r.per_iteration_metrics.iter().map(|m| {
1209                    serde_json::json!({
1210                        "prompt_tps": m.prompt_tps,
1211                        "generation_tps": m.generation_tps,
1212                        "combined_tps": m.combined_tps,
1213                        "latency_per_token": m.latency_per_token,
1214                        "first_token_time": m.first_token_time,
1215                    })
1216                }).collect::<Vec<_>>(),
1217            })
1218        })
1219        .collect();
1220
1221    // Scatter data with labels for tooltips
1222    let scatter_data_json = serde_json::to_string(
1223        &scatter_gen_tps
1224            .iter()
1225            .zip(scatter_latency.iter())
1226            .zip(scatter_first_token.iter())
1227            .map(|((g, l), f)| {
1228                let mut s = String::from("{x:");
1229                s.push_str(&format!("{:.2}", g));
1230                s.push_str(",y:");
1231                s.push_str(&format!("{:.2}", l));
1232                s.push_str(",ft:");
1233                s.push_str(&format!("{:.2}", f));
1234                s.push('}');
1235                s
1236            })
1237            .collect::<Vec<_>>(),
1238    )
1239    .unwrap();
1240    let scatter_data2_json = serde_json::to_string(
1241        &scatter_gen_tps
1242            .iter()
1243            .zip(scatter_first_token.iter())
1244            .map(|(g, f)| {
1245                let mut s = String::from("{x:");
1246                s.push_str(&format!("{:.2}", g));
1247                s.push_str(",y:");
1248                s.push_str(&format!("{:.2}", f));
1249                s.push_str(",lat:");
1250                s.push_str(&format!("{:.2}", min_val(&latency)));
1251                s.push('}');
1252                s
1253            })
1254            .collect::<Vec<_>>(),
1255    )
1256    .unwrap();
1257
1258    // Model metadata JSON
1259    let model_meta_json = model_info.as_ref().map(|(name, _size, settings)| {
1260        serde_json::json!({
1261            "model_name": name,
1262            "context_length": settings.context_length,
1263            "threads": settings.threads,
1264            "temperature": settings.temperature,
1265            "top_p": settings.top_p,
1266            "top_k": settings.top_k,
1267            "repeat_penalty": settings.repeat_penalty,
1268            "flash_attn": settings.flash_attn,
1269            "kv_cache_offload": settings.kv_cache_offload,
1270            "mlock": settings.mlock,
1271            "system_prompt": settings.system_prompt,
1272        })
1273    });
1274
1275    // Impact analysis JSON
1276    let _impact_json = serde_json::to_string(&impact_sorted).unwrap();
1277
1278    // Column definitions for visibility toggle
1279    let column_defs_json = serde_json::to_string(&vec![
1280        ("col-rank", "#", true),
1281        ("col-temp", "Temp", true),
1282        ("col-top-p", "Top-P", true),
1283        ("col-top-k", "Top-K", true),
1284        ("col-rep-pen", "RepPen", true),
1285        ("col-fa", "FA", true),
1286        ("col-threads", "Threads", true),
1287        ("col-batch", "Batch", true),
1288        ("col-exp", "Exp", true),
1289        ("col-spec", "Spec", true),
1290        ("col-draft", "Draft", true),
1291        ("col-gen-tps", "Gen t/s", true),
1292        ("col-prompt-tps", "Prompt t/s", true),
1293        ("col-latency", "Latency", true),
1294        ("col-first-token", "First Tok", true),
1295        ("col-combined", "Combined", true),
1296        ("col-consistency", "Consistency", true),
1297    ])
1298    .unwrap();
1299
1300    // CSV data
1301    let csv_header = "Rank,Temp,Top-P,Top-K,RepPen,FA,Threads,Batch,Exp,Spec,Draft,Gen t/s,Prompt t/s,Latency (ms),First Tok (ms),Combined,Consistency";
1302    let csv_rows: Vec<String> = (0..total_tests)
1303        .map(|i| {
1304            let d = &metrics_data[i];
1305            let rank = i + 1;
1306            let spec = d
1307                .get("spec_type")
1308                .map(|v| v.as_str().unwrap_or("-"))
1309                .unwrap_or("-")
1310                .to_string();
1311            let draft = d
1312                .get("draft_tokens")
1313                .map(|v| v.as_u64().unwrap_or(0).to_string())
1314                .unwrap_or("-".to_string());
1315            format!(
1316                "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{:.1}",
1317                rank,
1318                d["temp"].as_f64().unwrap_or(0.0),
1319                d["top_p"].as_f64().unwrap_or(0.0),
1320                d["top_k"].as_i64().unwrap_or(0),
1321                d["repeat_penalty"].as_f64().unwrap_or(0.0),
1322                if d["flash_attn"].as_bool().unwrap_or(false) {
1323                    "ON"
1324                } else {
1325                    "OFF"
1326                },
1327                d["threads"].as_u64().unwrap_or(0),
1328                d["batch_size"].as_u64().unwrap_or(0),
1329                d["expert_count"].as_i64().unwrap_or(0),
1330                spec,
1331                draft,
1332                d["generation_tps"].as_f64().unwrap_or(0.0),
1333                d["prompt_tps"].as_f64().unwrap_or(0.0),
1334                d["latency_per_token"].as_f64().unwrap_or(0.0),
1335                d["first_token_time"].as_f64().unwrap_or(0.0),
1336                d["combined_tps"].as_f64().unwrap_or(0.0),
1337                d["consistency"].as_f64().unwrap_or(1.0)
1338            )
1339        })
1340        .collect();
1341    let csv_content = format!("{}\n{}", csv_header, csv_rows.join("\n"));
1342    let csv_b64 = base64_encode(&csv_content);
1343
1344    let metrics_json = serde_json::to_string(&metrics_data).unwrap();
1345    let param_headers_json = serde_json::to_string(&param_headers).unwrap();
1346    let param_vals_json = serde_json::to_string(&param_vals).unwrap();
1347    let top_labels_json = serde_json::to_string(&top_labels).unwrap();
1348    let top_gen_tps_json = serde_json::to_string(&top_gen_tps).unwrap();
1349
1350    // Build model metadata HTML
1351    let model_meta_html = model_info
1352        .as_ref()
1353        .map(|(name, _size, s)| {
1354            format!(
1355                r#"
1356<div class="meta-section">
1357<h2>Model &amp; Configuration</h2>
1358<div class="meta-grid">
1359<div class="meta-item"><div class="ml">Model</div><div class="mv">{}</div></div>
1360<div class="meta-item"><div class="ml">Context</div><div class="mv">{}</div></div>
1361<div class="meta-item"><div class="ml">Threads</div><div class="mv">{}</div></div>
1362<div class="meta-item"><div class="ml">Flash Attention</div><div class="mv">{}</div></div>
1363<div class="meta-item"><div class="ml">KV Cache Offload</div><div class="mv">{}</div></div>
1364<div class="meta-item"><div class="ml">MLOCK</div><div class="mv">{}</div></div>
1365<div class="meta-item"><div class="ml">Prompt</div><div class="mv meta-prompt">{}</div></div>
1366</div>
1367</div>"#,
1368                escape_html(name),
1369                s.context_length,
1370                s.threads,
1371                if s.flash_attn { "ON" } else { "OFF" },
1372                if s.kv_cache_offload { "ON" } else { "OFF" },
1373                if s.mlock { "ON" } else { "OFF" },
1374                escape_html(&s.system_prompt.chars().take(100).collect::<String>())
1375            )
1376        })
1377        .unwrap_or_default();
1378
1379    // Build winner section HTML
1380    let winner_html = best_idx.and_then(|idx| {
1381        let r = &results[idx];
1382        let base = r.base_settings.as_ref()?;
1383        let rp = resolve_params(&r.params, base);
1384        let m = &r.metrics;
1385        Some(format!(r#"
1386<div class="winner-section">
1387<div class="winner-icon">&#127942;</div>
1388<div class="winner-content">
1389<div class="winner-title">Best Configuration</div>
1390<div class="winner-metrics">
1391<div class="winner-metric"><span class="wm-label">Gen t/s</span><span class="wm-value" style="color:#3fb950;font-size:1.8em;">{:.2}</span></div>
1392<div class="winner-metric"><span class="wm-label">Prompt t/s</span><span class="wm-value">{:.2}</span></div>
1393<div class="winner-metric"><span class="wm-label">Latency</span><span class="wm-value">{:.2}ms</span></div>
1394<div class="winner-metric"><span class="wm-label">First Token</span><span class="wm-value">{:.0}ms</span></div>
1395</div>
1396<div class="winner-params">Temp: {:.2} &middot; Top-P: {:.2} &middot; Top-K: {} &middot; RepPen: {:.2} &middot; FA: {} &middot; Threads: {} &middot; Batch: {} &middot; Exp: {} &middot; Spec: {} &middot; Draft: {}</div>
1397</div>
1398</div>"#,
1399                m.generation_tps, m.prompt_tps, m.latency_per_token, m.first_token_time,
1400                rp.temperature, rp.top_p, rp.top_k, rp.repeat_penalty,
1401                if rp.flash_attn { "ON" } else { "OFF" }, rp.threads,
1402                rp.batch_size, rp.expert_count,
1403                if rp.spec_type.is_empty() { "Off".to_string() } else { rp.spec_type.clone() }, rp.draft_tokens
1404            ))
1405    }).unwrap_or_default();
1406
1407    // Build impact analysis HTML
1408    let impact_html = if !impact_sorted.is_empty() {
1409        let max_impact = impact_sorted[0].2;
1410        let rows: String = impact_sorted
1411            .iter()
1412            .map(|(label, spread, value)| {
1413                let bar_width = if max_impact > 0.0 {
1414                    (value / max_impact * 100.0) as i32
1415                } else {
1416                    0
1417                };
1418                let bar_color = if *value > max_impact * 0.7 {
1419                    "#f85149"
1420                } else if *value > max_impact * 0.4 {
1421                    "#d29922"
1422                } else {
1423                    "#3fb950"
1424                };
1425                format!(
1426                    r#"<div class="impact-row">
1427<div class="impact-label">{}</div>
1428<div class="impact-bar-bg"><div class="impact-bar-fill" style="width:{}%;background:{}"></div></div>
1429<div class="impact-value">{}</div>
1430</div>"#,
1431                    label, bar_width, bar_color, spread
1432                )
1433            })
1434            .collect();
1435        format!(
1436            r#"
1437<div class="impact-section">
1438<h2>Parameter Impact Analysis</h2>
1439<p class="impact-desc">Larger spread in generation throughput between parameter values indicates greater impact on performance.</p>
1440{}
1441</div>"#,
1442            rows
1443        )
1444    } else {
1445        r#"<div class="impact-section"><h2>Parameter Impact Analysis</h2><p class="impact-desc">All parameters were held constant — no impact data available.</p></div>"#.to_string()
1446    };
1447
1448    // Empty state
1449    let empty_html = if total_tests == 0 {
1450        r#"<div class="empty-state">
1451<div class="empty-icon">&#128202;</div>
1452<div class="empty-title">No Results</div>
1453<div class="empty-text">Run a benchmark tuning test to generate results here.</div>
1454</div>"#
1455    } else {
1456        ""
1457    };
1458
1459    let html = include_str!("benchmark_report.html");
1460
1461    // Replace placeholders
1462    html.replace("__TIMESTAMP__", &timestamp)
1463        .replace("__TOTAL_TESTS__", &total_tests.to_string())
1464        .replace("__EMPTY_STATE__", empty_html)
1465        .replace("__MODEL_META__", &model_meta_html)
1466        .replace("__WINNER__", &winner_html)
1467        .replace("__AVG_GEN_TPS__", &format!("{:.1}", avg_gen_tps))
1468        .replace(
1469            "__MED_GEN_TPS__",
1470            &format!("{:.1}", median(&mut gen_tps_sorted)),
1471        )
1472        .replace("__GEN_STD__", &format!("{:.1}", gen_std))
1473        .replace("__MIN_GEN__", &format!("{:.1}", min_gen_tps))
1474        .replace("__MAX_GEN__", &format!("{:.1}", best_gen_tps))
1475        .replace("__AVG_PROMPT_TPS__", &format!("{:.1}", avg_prompt_tps))
1476        .replace(
1477            "__MED_PROMPT_TPS__",
1478            &format!("{:.1}", median(&mut prompt_tps)),
1479        )
1480        .replace("__PROMPT_STD__", &format!("{:.1}", prompt_std))
1481        .replace("__MIN_PROMPT__", &format!("{:.1}", min_prompt_tps))
1482        .replace("__MAX_PROMPT__", &format!("{:.1}", best_prompt_tps))
1483        .replace("__AVG_LATENCY__", &format!("{:.1}ms", avg_latency))
1484        .replace(
1485            "__MED_LATENCY__",
1486            &format!("{:.1}ms", median(&mut latency_sorted)),
1487        )
1488        .replace("__LAT_STD__", &format!("{:.1}", lat_std))
1489        .replace("__MIN_LAT__", &format!("{:.1}", min_latency))
1490        .replace("__MAX_LAT__", &format!("{:.1}", best_latency))
1491        .replace("__AVG_FT__", &format!("{:.0}ms", avg_first_token))
1492        .replace("__MED_FT__", &format!("{:.0}ms", median(&mut first_token)))
1493        .replace("__FT_STD__", &format!("{:.0}", ft_std))
1494        .replace("__MIN_FT__", &format!("{:.0}ms", min_first_token))
1495        .replace("__MAX_FT__", &format!("{:.0}ms", best_first_token))
1496        .replace("__BEST_GEN__", &format!("{:.1}", best_gen_tps))
1497        .replace("__TOP_N__", &top_n.to_string())
1498        .replace("__IMPACT_HTML__", &impact_html)
1499        .replace("__METRICS_JSON__", &metrics_json)
1500        .replace("__PARAM_HEADERS_JSON__", &param_headers_json)
1501        .replace("__PARAM_VALS_JSON__", &param_vals_json)
1502        .replace("__TOP_LABELS_JSON__", &top_labels_json)
1503        .replace("__TOP_GEN_TPS_JSON__", &top_gen_tps_json)
1504        .replace("__SCATTER_DATA_JSON__", &scatter_data_json)
1505        .replace("__SCATTER_DATA2_JSON__", &scatter_data2_json)
1506        .replace("__COLUMN_DEFS_JSON__", &column_defs_json)
1507        .replace("__CSV_B64__", &csv_b64)
1508        .replace(
1509            "__MODEL_META_JSON__",
1510            &serde_json::to_string(&model_meta_json).unwrap(),
1511        )
1512}
1513
/// Escape text for inclusion in HTML element content or a quoted attribute.
///
/// Replaces `&`, `<`, `>`, `"` and `'` with character references; every other
/// character is copied through unchanged. The apostrophe is escaped as well so
/// the result stays safe inside single-quoted attribute values.
fn escape_html(s: &str) -> String {
    // One pass over the input; the escaped form is at least as long as `s`.
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1521
/// Encode the UTF-8 bytes of `input` as Base64 using the `A-Z a-z 0-9 + /`
/// alphabet with `=` padding.
///
/// Hand-rolled so that no external crate is needed.
fn base64_encode(input: &str) -> String {
    const ALPHABET: &[u8; 64] =
        b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    // Look up the alphabet character for the low six bits of `bits`.
    let sextet = |bits: u32| ALPHABET[(bits & 0x3F) as usize] as char;

    let mut encoded = String::new();
    for group in input.as_bytes().chunks(3) {
        // Pack up to three bytes into the top 24 bits' worth of a word;
        // missing trailing bytes stay zero.
        let word = group
            .iter()
            .enumerate()
            .fold(0u32, |acc, (pos, &byte)| acc | (u32::from(byte) << (16 - 8 * pos)));

        encoded.push(sextet(word >> 18));
        encoded.push(sextet(word >> 12));
        // The third and fourth characters become padding when the group is short.
        encoded.push(if group.len() > 1 { sextet(word >> 6) } else { '=' });
        encoded.push(if group.len() > 2 { sextet(word) } else { '=' });
    }
    encoded
}
1547
/// Measurements and output captured from one inference request.
struct InferenceResult {
    // --- token counts ---
    /// Number of prompt tokens recorded for the request.
    prompt_tokens: u64,
    /// Number of generated tokens recorded for the request.
    generation_tokens: u64,

    // --- timings ---
    /// Time attributed to prompt processing.
    prompt_time: Duration,
    /// Time attributed to token generation.
    generation_time: Duration,
    /// Total time recorded for the request.
    total_time: Duration,
    /// Time to the first token, in milliseconds.
    first_token_time: u128,

    // --- output ---
    /// Content returned for the request.
    content: String,
}