llm_manager/backend/
benchmark.rs

1use std::path::PathBuf;
2use std::time::{Duration, Instant};
3
4use tokio::sync::{mpsc, watch};
5
6use crate::backend::server::{spawn_server, SpawnServerRequest};
7use crate::models::{
8    BenchTuneConfig, BenchTuneMetrics, BenchTuneMode, BenchTuneParamValue, BenchTuneResult,
9    BenchTuneStatus, DiscoveredModel, ModelSettings, ServerMode,
10};
11
/// Maximum number of health-check polls performed while waiting for a freshly
/// spawned server to report healthy.
const HEALTH_CHECK_ITERATIONS: u32 = 120;
/// Pause between two consecutive health-check polls, in milliseconds.
const HEALTH_CHECK_INTERVAL_MS: u64 = 500;
/// A "still waiting" log line is emitted every this many polls.
const HEALTH_CHECK_LOG_INTERVAL: u32 = 10;
/// Timeout applied by the shared HTTP client to each request, in seconds.
const REQUEST_TIMEOUT_SECS: u64 = 120;
17
18struct BenchAccumulator {
19    params: BenchTuneParamValue,
20    total_prompt_tokens: u64,
21    total_generation_tokens: u64,
22    total_prompt_time: Duration,
23    total_generation_time: Duration,
24    total_time: Duration,
25    first_token_times: Vec<u128>,
26    outputs: Vec<String>,
27    per_iteration_metrics: Vec<BenchTuneMetrics>,
28    base_settings: Option<ModelSettings>,
29}
30
31fn build_bench_result(acc: BenchAccumulator) -> BenchTuneResult {
32    let BenchAccumulator {
33        params,
34        total_prompt_tokens,
35        total_generation_tokens,
36        total_prompt_time,
37        total_generation_time,
38        total_time,
39        first_token_times,
40        outputs,
41        per_iteration_metrics,
42        base_settings,
43    } = acc;
44    let prompt_tps = if total_prompt_time.as_secs_f64() > 0.0 {
45        (total_prompt_tokens as f64) / total_prompt_time.as_secs_f64()
46    } else {
47        0.0
48    };
49
50    let generation_tps = if total_generation_time.as_secs_f64() > 0.0 {
51        (total_generation_tokens as f64) / total_generation_time.as_secs_f64()
52    } else {
53        0.0
54    };
55
56    let combined_tps = if total_time.as_secs_f64() > 0.0 {
57        ((total_prompt_tokens + total_generation_tokens) as f64) / total_time.as_secs_f64()
58    } else {
59        0.0
60    };
61
62    let avg_latency_per_token = if total_generation_tokens > 0 {
63        total_generation_time.as_millis() as f64 / (total_generation_tokens as f64)
64    } else {
65        0.0
66    };
67
68    let avg_first_token_time = if !first_token_times.is_empty() {
69        first_token_times.iter().sum::<u128>() as f64 / first_token_times.len() as f64
70    } else {
71        0.0
72    };
73
74    BenchTuneResult {
75        params,
76        metrics: BenchTuneMetrics {
77            prompt_tps,
78            generation_tps,
79            combined_tps,
80            latency_per_token: avg_latency_per_token,
81            first_token_time: avg_first_token_time,
82        },
83        outputs,
84        per_iteration_metrics,
85        base_settings,
86        server_command: None,
87    }
88}
89
90pub struct BenchTuneRequest<'a> {
91    pub main_config: &'a crate::config::Config,
92    pub config: &'a BenchTuneConfig,
93    pub model: &'a DiscoveredModel,
94    pub settings: &'a ModelSettings,
95    pub progress_tx: mpsc::Sender<BenchTuneStatus>,
96    pub log_tx: mpsc::Sender<String>,
97    pub cancel_rx: &'a mut watch::Receiver<bool>,
98}
99
100/// Run a benchmark tuning test with multiple parameter combinations
101pub async fn run_bench_tune(
102    req: BenchTuneRequest<'_>,
103) -> Result<Vec<BenchTuneResult>, Box<dyn std::error::Error + Send + Sync>> {
104    let BenchTuneRequest {
105        main_config,
106        config,
107        model,
108        settings,
109        progress_tx,
110        log_tx,
111        cancel_rx,
112    } = req;
113    let start_time = Instant::now();
114    let total_tests = config.get_total_tests_count();
115
116    // Warn on large runs
117    if total_tests > 500 {
118        let _ = log_tx
119            .send(format!(
120                "WARNING: Benchmark will run {} combinations. This may take a long time.",
121                total_tests
122            ))
123            .await;
124    }
125
126    // Generate all parameter combinations
127    let combinations = config.generate_combinations();
128
129    // Results storage
130    let mut results = Vec::new();
131    let mut failed_tests: Vec<(usize, String)> = Vec::new();
132
133    // Apply chat_template_kwargs from config to settings
134    let mut settings = settings.clone();
135    if let Some(kwargs) = &config.chat_template_kwargs {
136        settings.chat_template_kwargs = Some(kwargs.clone());
137    }
138
139    // Create a shared HTTP client for all inference requests
140    let client = reqwest::Client::builder()
141        .timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
142        .build()?;
143
144    // If runtime-only mode, send params in request body (no server restarts)
145    if config.bench_mode == BenchTuneMode::RuntimeOnly {
146        // Spawn a single server for all runtime-only iterations
147        let (exit_tx, _exit_rx) = tokio::sync::mpsc::channel(1);
148        let (server_handle, server_command) = spawn_server(SpawnServerRequest {
149            config: main_config,
150            model: Some(model),
151            settings: &settings,
152            log_tx: log_tx.clone(),
153            progress_tx: None,
154            server_mode: ServerMode::Normal,
155            router_max_models: 1,
156            exit_tx,
157        })
158        .await?;
159
160        let host = if server_handle.host == "0.0.0.0" {
161            "127.0.0.1"
162        } else {
163            &server_handle.host
164        };
165
166        // Wait for server to be ready
167        for i in 0..HEALTH_CHECK_ITERATIONS {
168            if *cancel_rx.borrow() {
169                let _ = crate::backend::server::kill_server(server_handle).await;
170                let elapsed = start_time.elapsed();
171                progress_tx
172                    .send(BenchTuneStatus::Cancelled {
173                        total_tests,
174                        successful_tests: results.len(),
175                        failed_tests: failed_tests.len(),
176                        elapsed,
177                    })
178                    .await?;
179                return Ok(results);
180            }
181            if crate::backend::server::check_health(host, server_handle.port).await {
182                break;
183            }
184            if i % HEALTH_CHECK_LOG_INTERVAL == 0 && i > 0 {
185                let _ = log_tx
186                    .send(format!(
187                        "  ... still waiting ({:.0}s)...",
188                        i as f32 * (HEALTH_CHECK_INTERVAL_MS as f32 / 1000.0)
189                    ))
190                    .await;
191            }
192            tokio::time::sleep(Duration::from_millis(HEALTH_CHECK_INTERVAL_MS)).await;
193        }
194
195        let server_port = server_handle.port;
196        let server_host = host.to_string();
197
198        for (idx, combination) in combinations.iter().enumerate() {
199            // Check cancellation before each test
200            if *cancel_rx.borrow() {
201                let _ = crate::backend::server::kill_server(server_handle).await;
202                let elapsed = start_time.elapsed();
203                progress_tx
204                    .send(BenchTuneStatus::Cancelled {
205                        total_tests,
206                        successful_tests: results.len(),
207                        failed_tests: failed_tests.len(),
208                        elapsed,
209                    })
210                    .await?;
211                return Ok(results);
212            }
213
214            let progress = (idx as f32 / total_tests as f32) * 100.0;
215            progress_tx
216                .send(BenchTuneStatus::Running {
217                    current: idx + 1,
218                    total: total_tests,
219                    progress,
220                    current_params: combination.clone(),
221                })
222                .await?;
223
224            let result = run_bench_tune_runtime_only(RuntimeOnlyCtx {
225                params: combination,
226                settings: &settings,
227                num_iterations: config.num_iterations,
228                prompt: config.prompt.clone(),
229                server_host: &server_host,
230                server_port,
231                log_tx: log_tx.clone(),
232                config,
233                client: &client,
234                server_command: &server_command,
235            })
236            .await;
237
238            match result {
239                Ok(test_result) => results.push(test_result),
240                Err(e) => {
241                    failed_tests.push((idx + 1, e.to_string()));
242                    let _ = log_tx
243                        .send(format!(
244                            "Benchmark test {}/{} failed: {}",
245                            idx + 1,
246                            total_tests,
247                            e
248                        ))
249                        .await;
250                }
251            }
252        }
253
254        let _ = crate::backend::server::kill_server(server_handle).await;
255    } else {
256        // Full mode: spawn a new server for each parameter combination
257        for (idx, combination) in combinations.iter().enumerate() {
258            // Check cancellation before each test
259            if *cancel_rx.borrow() {
260                let elapsed = start_time.elapsed();
261                progress_tx
262                    .send(BenchTuneStatus::Cancelled {
263                        total_tests,
264                        successful_tests: results.len(),
265                        failed_tests: failed_tests.len(),
266                        elapsed,
267                    })
268                    .await?;
269                return Ok(results);
270            }
271
272            let progress = (idx as f32 / total_tests as f32) * 100.0;
273            progress_tx
274                .send(BenchTuneStatus::Running {
275                    current: idx + 1,
276                    total: total_tests,
277                    progress,
278                    current_params: combination.clone(),
279                })
280                .await?;
281
282            let result = run_bench_tune_single_test(SingleTestCtx {
283                main_config,
284                params: combination,
285                model,
286                base_settings: &settings,
287                num_iterations: config.num_iterations,
288                prompt: config.prompt.clone(),
289                log_tx: log_tx.clone(),
290                config,
291                client: &client,
292            })
293            .await;
294
295            match result {
296                Ok(test_result) => results.push(test_result),
297                Err(e) => {
298                    failed_tests.push((idx + 1, e.to_string()));
299                    let _ = log_tx
300                        .send(format!(
301                            "Benchmark test {}/{} failed: {}",
302                            idx + 1,
303                            total_tests,
304                            e
305                        ))
306                        .await;
307                }
308            }
309        }
310    }
311
312    // Sort results by combined_tps (descending)
313    results.sort_by(|a, b| {
314        b.metrics
315            .combined_tps
316            .partial_cmp(&a.metrics.combined_tps)
317            .unwrap_or(std::cmp::Ordering::Equal)
318    });
319
320    let elapsed = start_time.elapsed();
321    let successful_tests = results.len();
322    let failed_count = failed_tests.len();
323
324    // Final progress update - distinguish between full success and partial success
325    if failed_count > 0 {
326        progress_tx
327            .send(BenchTuneStatus::PartiallyCompleted {
328                total_tests,
329                successful_tests,
330                failed_tests: failed_count,
331                elapsed,
332            })
333            .await?;
334    } else {
335        progress_tx
336            .send(BenchTuneStatus::Completed {
337                total_tests,
338                successful_tests,
339                elapsed,
340            })
341            .await?;
342    }
343
344    Ok(results)
345}
346
347/// Run inference iterations and accumulate metrics into a BenchTuneResult.
348struct IterationLoopCtx<'a> {
349    prompt: &'a str,
350    host: &'a str,
351    port: u16,
352    params: &'a BenchTuneParamValue,
353    num_iterations: u32,
354    config: &'a BenchTuneConfig,
355    client: &'a reqwest::Client,
356    log_tx: mpsc::Sender<String>,
357    log_prefix: &'a str,
358}
359
360/// Shared by both runtime-only and full benchmark modes.
361async fn run_iteration_loop(
362    ctx: IterationLoopCtx<'_>,
363) -> Result<BenchTuneResult, Box<dyn std::error::Error + Send + Sync>> {
364    let IterationLoopCtx {
365        prompt,
366        host,
367        port,
368        params,
369        num_iterations,
370        config,
371        client,
372        log_tx,
373        log_prefix,
374    } = ctx;
375    let mut total_prompt_tokens = 0u64;
376    let mut total_generation_tokens = 0u64;
377    let mut total_prompt_time = Duration::ZERO;
378    let mut total_generation_time = Duration::ZERO;
379    let mut total_time = Duration::ZERO;
380    let mut first_token_times = Vec::new();
381    let mut outputs = Vec::new();
382    let mut per_iteration_metrics = Vec::new();
383
384    let _ = log_tx
385        .send(format!(
386            "Running {} inference iterations {}...",
387            num_iterations, log_prefix
388        ))
389        .await;
390
391    for i in 0..num_iterations {
392        let result = send_inference_request(prompt, host, port, params, config, client).await;
393
394        match result {
395            Ok(res) => {
396                total_prompt_tokens += res.prompt_tokens;
397                total_generation_tokens += res.generation_tokens;
398                total_prompt_time += res.prompt_time;
399                total_generation_time += res.generation_time;
400                total_time += res.total_time;
401                first_token_times.push(res.first_token_time);
402                outputs.push(res.content.clone());
403
404                let iter_prompt_tps = if res.prompt_time.as_secs_f64() > 0.0 {
405                    res.prompt_tokens as f64 / res.prompt_time.as_secs_f64()
406                } else {
407                    0.0
408                };
409                let iter_gen_tps = if res.generation_time.as_secs_f64() > 0.0 {
410                    res.generation_tokens as f64 / res.generation_time.as_secs_f64()
411                } else {
412                    0.0
413                };
414                let iter_combined_tps = if res.total_time.as_secs_f64() > 0.0 {
415                    ((res.prompt_tokens + res.generation_tokens) as f64) / res.total_time.as_secs_f64()
416                } else {
417                    0.0
418                };
419                let iter_latency = if res.generation_tokens > 0 {
420                    res.generation_time.as_millis() as f64 / res.generation_tokens as f64
421                } else {
422                    0.0
423                };
424
425                per_iteration_metrics.push(BenchTuneMetrics {
426                    prompt_tps: iter_prompt_tps,
427                    generation_tps: iter_gen_tps,
428                    combined_tps: iter_combined_tps,
429                    latency_per_token: iter_latency,
430                    first_token_time: res.first_token_time as f64,
431                });
432
433                if num_iterations > 1 {
434                    let _ = log_tx
435                        .send(format!(
436                            "  Iteration {}/{}: {:.2} gen t/s",
437                            i + 1,
438                            num_iterations,
439                            iter_gen_tps
440                        ))
441                        .await;
442                }
443
444                let _ = log_tx
445                    .send(format!(
446                        "--- Generated Output (Iter {}) ---\n{}\n----------------------------------",
447                        i + 1,
448                        res.content
449                    ))
450                    .await;
451            }
452            Err(e) => {
453                let _ = log_tx
454                    .send(format!(
455                        "  Iteration {}/{} FAILED: {}",
456                        i + 1,
457                        num_iterations,
458                        e
459                    ))
460                    .await;
461                if i == 0 {
462                    return Err(format!("Inference failed: {}", e).into());
463                }
464            }
465        }
466    }
467
468    Ok(build_bench_result(BenchAccumulator {
469        params: params.clone(),
470        total_prompt_tokens,
471        total_generation_tokens,
472        total_prompt_time,
473        total_generation_time,
474        total_time,
475        first_token_times,
476        outputs,
477        per_iteration_metrics,
478        base_settings: None,
479    }))
480}
481
482struct RuntimeOnlyCtx<'a> {
483    params: &'a BenchTuneParamValue,
484    settings: &'a ModelSettings,
485    num_iterations: u32,
486    prompt: String,
487    server_host: &'a str,
488    server_port: u16,
489    log_tx: mpsc::Sender<String>,
490    config: &'a BenchTuneConfig,
491    client: &'a reqwest::Client,
492    server_command: &'a str,
493}
494
495/// Run benchmark in runtime-only mode: sends params in /completion request body, no server restarts
496async fn run_bench_tune_runtime_only(
497    ctx: RuntimeOnlyCtx<'_>,
498) -> Result<BenchTuneResult, Box<dyn std::error::Error + Send + Sync>> {
499    let RuntimeOnlyCtx {
500        params,
501        settings,
502        num_iterations,
503        prompt,
504        server_host,
505        server_port,
506        log_tx,
507        config,
508        client,
509        server_command,
510    } = ctx;
511    let loop_fut = run_iteration_loop(IterationLoopCtx {
512        prompt: &prompt,
513        host: server_host,
514        port: server_port,
515        params,
516        num_iterations,
517        config,
518        client,
519        log_tx,
520        log_prefix: "(runtime-only mode)",
521    });
522    let result = tokio::time::timeout(config.test_timeout, loop_fut).await;
523    let result = match result {
524        Ok(inner) => inner,
525        Err(_) => return Err(format!("Test timed out after {:?}", config.test_timeout).into()),
526    };
527    result.map(|mut r| {
528        r.base_settings = Some(settings.clone());
529        r.server_command = Some(server_command.to_string());
530        r
531    })
532}
533
534struct SingleTestCtx<'a> {
535    main_config: &'a crate::config::Config,
536    params: &'a BenchTuneParamValue,
537    model: &'a DiscoveredModel,
538    base_settings: &'a ModelSettings,
539    num_iterations: u32,
540    prompt: String,
541    log_tx: mpsc::Sender<String>,
542    config: &'a BenchTuneConfig,
543    client: &'a reqwest::Client,
544}
545
546/// Run a single benchmark tuning test with specific parameters
547async fn run_bench_tune_single_test(
548    ctx: SingleTestCtx<'_>,
549) -> Result<BenchTuneResult, Box<dyn std::error::Error + Send + Sync>> {
550    let SingleTestCtx {
551        main_config,
552        params,
553        model,
554        base_settings,
555        num_iterations,
556        prompt,
557        log_tx,
558        config,
559        client,
560    } = ctx;
561    // Create settings with test parameters
562    let mut settings = base_settings.clone();
563
564    // Apply test parameters
565    if let Some(temperature) = params.temperature {
566        settings.temperature = temperature as f32;
567    }
568    if let Some(top_p) = params.top_p {
569        settings.top_p = top_p as f32;
570    }
571    if let Some(top_k) = params.top_k {
572        settings.top_k = top_k as i32;
573    }
574    if let Some(repeat_penalty) = params.repeat_penalty {
575        settings.repeat_penalty = repeat_penalty as f32;
576    }
577    if let Some(flash_attn) = params.flash_attn {
578        settings.flash_attn = flash_attn;
579    }
580    if let Some(threads) = params.threads {
581        settings.threads = threads;
582        settings.threads_batch = threads; // Usually keep them equal for benchmarks
583    }
584    if let Some(batch_size) = params.batch_size {
585        settings.batch_size = batch_size;
586        settings.ubatch_size = batch_size;
587    }
588    if let Some(expert_count) = params.expert_count {
589        settings.expert_count = expert_count;
590    }
591    if let Some(ref spec_type) = params.spec_type {
592        settings.spec_type = if spec_type == "Off" {
593            String::new()
594        } else {
595            spec_type.clone()
596        };
597    }
598    if let Some(draft_tokens) = params.draft_tokens {
599        settings.draft_tokens = draft_tokens;
600    }
601
602    // Spawn server with test parameters
603    let (exit_tx, _exit_rx) = tokio::sync::mpsc::channel(1);
604    let (server_handle, command) = spawn_server(SpawnServerRequest {
605        config: main_config,
606        model: Some(model),
607        settings: &settings,
608        log_tx: log_tx.clone(),
609        progress_tx: None,
610        server_mode: ServerMode::Normal,
611        router_max_models: 1,
612        exit_tx,
613    })
614    .await?;
615    // Wait for server to be ready
616    let mut ready = false;
617    let host = if server_handle.host == "0.0.0.0" {
618        "127.0.0.1"
619    } else {
620        &server_handle.host
621    };
622
623    let _ = log_tx
624        .send(format!(
625            "Waiting for server on {}:{}...",
626            host, server_handle.port
627        ))
628        .await;
629
630    for i in 0..HEALTH_CHECK_ITERATIONS {
631        if crate::backend::server::check_health(host, server_handle.port).await {
632            ready = true;
633            break;
634        }
635        if i % HEALTH_CHECK_LOG_INTERVAL == 0 && i > 0 {
636            let _ = log_tx
637                .send(format!(
638                    "  ... still waiting ({:.0}s)...",
639                    i as f32 * (HEALTH_CHECK_INTERVAL_MS as f32 / 1000.0)
640                ))
641                .await;
642        }
643        tokio::time::sleep(Duration::from_millis(HEALTH_CHECK_INTERVAL_MS)).await;
644    }
645
646    if !ready {
647        let _ = log_tx
648            .send("Error: Server health check timed out".to_string())
649            .await;
650        let _ = crate::backend::server::kill_server(server_handle).await;
651        return Err("Server failed to become healthy".into());
652    }
653
654    let loop_fut = run_iteration_loop(IterationLoopCtx {
655        prompt: &prompt,
656        host,
657        port: server_handle.port,
658        params,
659        num_iterations,
660        config,
661        client,
662        log_tx,
663        log_prefix: "",
664    });
665    let result = tokio::time::timeout(config.test_timeout, loop_fut).await;
666    let result = match result {
667        Ok(inner) => inner,
668        Err(_) => {
669            let _ = crate::backend::server::kill_server(server_handle).await;
670            return Err(format!("Test timed out after {:?}", config.test_timeout).into());
671        }
672    };
673
674    let _ = crate::backend::server::kill_server(server_handle).await;
675    tokio::time::sleep(Duration::from_secs(1)).await;
676
677    result.map(|mut r| {
678        r.base_settings = Some(base_settings.clone());
679        r.server_command = Some(command);
680        r
681    })
682}
683
684/// Send an inference request and measure response time
685async fn send_inference_request(
686    prompt: &str,
687    host: &str,
688    port: u16,
689    params: &BenchTuneParamValue,
690    config: &BenchTuneConfig,
691    client: &reqwest::Client,
692) -> Result<InferenceResult, Box<dyn std::error::Error + Send + Sync>> {
693    // Build request body with benchmark params
694    let mut body = serde_json::json!({
695        "prompt": prompt,
696        "n_predict": config.n_predict,
697        "stream": false
698    });
699
700    if let Some(temperature) = params.temperature {
701        body["temperature"] = serde_json::json!(temperature);
702    }
703    if let Some(top_p) = params.top_p {
704        body["top_p"] = serde_json::json!(top_p);
705    }
706    if let Some(top_k) = params.top_k {
707        body["top_k"] = serde_json::json!(top_k);
708    }
709    if let Some(repeat_penalty) = params.repeat_penalty {
710        body["repeat_penalty"] = serde_json::json!(repeat_penalty);
711    }
712
713    let url = format!("http://{}:{}/completion", host, port);
714    let start = Instant::now();
715    let resp = client.post(url).json(&body).send().await?;
716
717    if !resp.status().is_success() {
718        let status = resp.status();
719        let body = resp.text().await.unwrap_or_else(|_| "no body".to_string());
720        return Err(format!("Server returned error {}: {}", status, body).into());
721    }
722
723    let total_time = start.elapsed();
724    let json: serde_json::Value = resp.json().await?;
725
726    // Robust timings parsing
727    let prompt_tokens = json["tokens_evaluated"]
728        .as_u64()
729        .or_else(|| json["prompt_n"].as_u64())
730        .unwrap_or(0);
731
732    let generation_tokens = json["tokens_predicted"]
733        .as_u64()
734        .or_else(|| json["predicted_n"].as_u64())
735        .unwrap_or(0);
736
737    let timings = &json["timings"];
738    let prompt_time_ms = timings["prompt_ms"]
739        .as_f64()
740        .or_else(|| timings["prompt_eval_ms"].as_f64())
741        .unwrap_or(0.0);
742
743    let generation_time_ms = timings["predicted_ms"]
744        .as_f64()
745        .or_else(|| timings["eval_ms"].as_f64())
746        .unwrap_or(0.0);
747
748    Ok(InferenceResult {
749        prompt_tokens,
750        generation_tokens,
751        prompt_time: Duration::from_millis(prompt_time_ms as u64),
752        generation_time: Duration::from_millis(generation_time_ms as u64),
753        total_time,
754        first_token_time: prompt_time_ms as u128,
755        content: json["content"].as_str().unwrap_or("").to_string(),
756    })
757}
758
759/// Save benchmark results to disk in Markdown format
760pub async fn save_results(
761    results: &[BenchTuneResult],
762    output_dir: &PathBuf,
763    config: &BenchTuneConfig,
764) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
765    // Create output directory if it doesn't exist
766    std::fs::create_dir_all(output_dir)?;
767
768    // Generate timestamp for the filename
769    let timestamp = chrono::Local::now().format("%Y%m%d_%H%M%S");
770    let filename = format!("benchmark_{}.md", timestamp);
771    let filepath = output_dir.join(filename);
772
773    let mut md = String::new();
774    md.push_str("# LLM Benchmark Results\n\n");
775    md.push_str(&format!(
776        "Generated on: {}\n\n",
777        chrono::Local::now().format("%Y-%m-%d %H:%M:%S")
778    ));
779
780    md.push_str("| Temp | Top-P | Top-K | RepPen | FA | Threads | Batch | Exp | Spec | Draft | Prompt t/s | Gen t/s | Latency (ms) | First Tok (ms) |\n");
781    md.push_str("|------|-------|-------|--------|----|---------|-------|-----|------|-------|------------|---------|--------------|----------------|\n");
782
783    for r in results {
784        let temp = r
785            .params
786            .temperature
787            .map(|v| format!("{:.2}", v))
788            .unwrap_or_else(|| "-".to_string());
789        let top_p = r
790            .params
791            .top_p
792            .map(|v| format!("{:.2}", v))
793            .unwrap_or_else(|| "-".to_string());
794        let top_k = r
795            .params
796            .top_k
797            .map(|v| v.to_string())
798            .unwrap_or_else(|| "-".to_string());
799        let rep_pen = r
800            .params
801            .repeat_penalty
802            .map(|v| format!("{:.2}", v))
803            .unwrap_or_else(|| "-".to_string());
804        let fa = r
805            .params
806            .flash_attn
807            .map(|v| if v { "ON" } else { "OFF" })
808            .unwrap_or("-");
809        let threads = r
810            .params
811            .threads
812            .map(|v| v.to_string())
813            .unwrap_or_else(|| "-".to_string());
814        let batch = r
815            .params
816            .batch_size
817            .map(|v| v.to_string())
818            .unwrap_or_else(|| "-".to_string());
819        let exp = r
820            .params
821            .expert_count
822            .map(|v| v.to_string())
823            .unwrap_or_else(|| "-".to_string());
824
825        let spec = r
826            .params
827            .spec_type
828            .as_ref()
829            .map(|s| {
830                if s.is_empty() {
831                    "-".to_string()
832                } else {
833                    s.clone()
834                }
835            })
836            .unwrap_or_else(|| "-".to_string());
837        let draft = r
838            .params
839            .draft_tokens
840            .map(|v| v.to_string())
841            .unwrap_or_else(|| "-".to_string());
842
843        md.push_str(&format!(
844            "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {:.2} | {:.2} | {:.2} | {:.2} |\n",
845            temp,
846            top_p,
847            top_k,
848            rep_pen,
849            fa,
850            threads,
851            batch,
852            exp,
853            spec,
854            draft,
855            r.metrics.prompt_tps,
856            r.metrics.generation_tps,
857            r.metrics.latency_per_token,
858            r.metrics.first_token_time
859        ));
860    }
861
862    tokio::fs::write(&filepath, md).await?;
863
864    // Save full results as JSON with outputs
865    let json_filename = format!("benchmark_{}.json", timestamp);
866    let json_filepath = output_dir.join(&json_filename);
867    let json_content = serde_json::to_string_pretty(&results)?;
868    tokio::fs::write(&json_filepath, json_content).await?;
869
870    // Also save full results as YAML with outputs
871    let yaml_filename = format!("benchmark_{}.yaml", timestamp);
872    let yaml_filepath = output_dir.join(&yaml_filename);
873    let yaml_content = serde_yaml::to_string(&results)?;
874    tokio::fs::write(&yaml_filepath, yaml_content).await?;
875
876    // Generate HTML report
877    let html_filename = format!("benchmark_{}.html", timestamp);
878    let html_filepath = output_dir.join(&html_filename);
879    let html_content = generate_html_report(results, config);
880    tokio::fs::write(&html_filepath, html_content).await?;
881
882    Ok(())
883}
884
885fn generate_html_report(results: &[BenchTuneResult], config: &BenchTuneConfig) -> String {
886    use chrono::Local;
887
888    let total_tests = results.len();
889    let timestamp = Local::now().format("%Y-%m-%d %H:%M:%S").to_string();
890
891    // Extract model metadata from first result's base_settings
892    let model_info = results.first().and_then(|r| {
893        r.base_settings.as_ref().map(|s| {
894            let model_name = if config.model_path.file_name().is_some() {
895                config
896                    .model_path
897                    .file_name()
898                    .unwrap()
899                    .to_string_lossy()
900                    .to_string()
901            } else {
902                config.model_path.display().to_string()
903            };
904            let file_size_mb = results
905                .first()
906                .and_then(|r| {
907                    r.base_settings.as_ref().map(|_s| {
908                        // We don't have file_size in settings, use a placeholder
909                        0u64
910                    })
911                })
912                .unwrap_or(0);
913            (model_name, file_size_mb, s.clone())
914        })
915    });
916
917    // Resolve benchmark params against base settings (fill in None with base values)
918    struct ResolvedParams {
919        temperature: f64,
920        top_p: f64,
921        top_k: i64,
922        repeat_penalty: f64,
923        flash_attn: bool,
924        threads: u32,
925        batch_size: u32,
926        expert_count: i32,
927        spec_type: String,
928        draft_tokens: u32,
929    }
930
931    fn resolve_params(
932        params: &BenchTuneParamValue,
933        base: &crate::models::ModelSettings,
934    ) -> ResolvedParams {
935        ResolvedParams {
936            temperature: params.temperature.unwrap_or(base.temperature as f64),
937            top_p: params.top_p.unwrap_or(base.top_p as f64),
938            top_k: params.top_k.unwrap_or(base.top_k as i64),
939            repeat_penalty: params.repeat_penalty.unwrap_or(base.repeat_penalty as f64),
940            flash_attn: params.flash_attn.unwrap_or(base.flash_attn),
941            threads: params.threads.unwrap_or(base.threads),
942            batch_size: params.batch_size.unwrap_or(base.batch_size),
943            expert_count: params.expert_count.unwrap_or(base.expert_count),
944            spec_type: params
945                .spec_type
946                .clone()
947                .unwrap_or_else(|| base.spec_type.clone()),
948            draft_tokens: params.draft_tokens.unwrap_or(base.draft_tokens),
949        }
950    }
951
952    // Statistics helpers
953    fn mean(vals: &[f64]) -> f64 {
954        if vals.is_empty() {
955            return 0.0;
956        }
957        vals.iter().sum::<f64>() / vals.len() as f64
958    }
959    fn median(vals: &mut [f64]) -> f64 {
960        if vals.is_empty() {
961            return 0.0;
962        }
963        vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
964        let mid = vals.len() / 2;
965        if vals.len().is_multiple_of(2) {
966            (vals[mid - 1] + vals[mid]) / 2.0
967        } else {
968            vals[mid]
969        }
970    }
971    fn std_dev(vals: &[f64], avg: f64) -> f64 {
972        if vals.len() <= 1 {
973            return 0.0;
974        }
975        let variance =
976            vals.iter().map(|v| (v - avg).powi(2)).sum::<f64>() / (vals.len() - 1) as f64;
977        variance.sqrt()
978    }
979    fn min_val(vals: &[f64]) -> f64 {
980        vals.iter().cloned().fold(f64::INFINITY, f64::min)
981    }
982    fn max_val(vals: &[f64]) -> f64 {
983        vals.iter().cloned().fold(f64::NEG_INFINITY, f64::max)
984    }
985
986    let gen_tps: Vec<f64> = results.iter().map(|r| r.metrics.generation_tps).collect();
987    let mut prompt_tps: Vec<f64> = results.iter().map(|r| r.metrics.prompt_tps).collect();
988    let latency: Vec<f64> = results
989        .iter()
990        .map(|r| r.metrics.latency_per_token)
991        .collect();
992    let mut first_token: Vec<f64> = results.iter().map(|r| r.metrics.first_token_time).collect();
993
994    let mut gen_tps_sorted = gen_tps.clone();
995    let mut latency_sorted = latency.clone();
996
997    let avg_gen_tps = mean(&gen_tps);
998    let avg_prompt_tps = mean(&prompt_tps);
999    let avg_latency = mean(&latency);
1000    let avg_first_token = mean(&first_token);
1001    let _avg_combined_tps = mean(
1002        &results
1003            .iter()
1004            .map(|r| r.metrics.combined_tps)
1005            .collect::<Vec<f64>>(),
1006    );
1007
1008    let gen_std = std_dev(&gen_tps, avg_gen_tps);
1009    let prompt_std = std_dev(&prompt_tps, avg_prompt_tps);
1010    let lat_std = std_dev(&latency, avg_latency);
1011    let ft_std = std_dev(&first_token, avg_first_token);
1012
1013    let best_idx = results
1014        .iter()
1015        .enumerate()
1016        .max_by(|a, b| {
1017            a.1.metrics
1018                .generation_tps
1019                .partial_cmp(&b.1.metrics.generation_tps)
1020                .unwrap_or(std::cmp::Ordering::Equal)
1021        })
1022        .map(|(i, _)| i);
1023    let best_gen_tps = if !gen_tps.is_empty() {
1024        max_val(&gen_tps)
1025    } else {
1026        0.0
1027    };
1028    let best_prompt_tps = if !prompt_tps.is_empty() {
1029        max_val(&prompt_tps)
1030    } else {
1031        0.0
1032    };
1033    let best_latency = if !latency.is_empty() {
1034        min_val(&latency)
1035    } else {
1036        0.0
1037    };
1038    let best_first_token = if !first_token.is_empty() {
1039        min_val(&first_token)
1040    } else {
1041        0.0
1042    };
1043    let min_gen_tps = min_val(&gen_tps);
1044    let min_prompt_tps = min_val(&prompt_tps);
1045    let min_latency = min_val(&latency);
1046    let min_first_token = min_val(&first_token);
1047
1048    // Per-parameter impact analysis
1049    let param_names = [("temperature", "Temperature"),
1050        ("top_p", "Top-P"),
1051        ("top_k", "Top-K"),
1052        ("repeat_penalty", "Repeat Penalty"),
1053        ("flash_attn", "Flash Attention"),
1054        ("threads", "Threads"),
1055        ("batch_size", "Batch Size"),
1056        ("expert_count", "Experts")];
1057
1058    let impact_data: Vec<(String, String, f64)> = param_names
1059        .iter()
1060        .filter_map(|(key, label)| {
1061            let values: Vec<f64> = results
1062                .iter()
1063                .filter_map(|r| {
1064                    let base = r.base_settings.as_ref()?;
1065                    let rp = resolve_params(&r.params, base);
1066                    Some(match *key {
1067                        "temperature" => rp.temperature,
1068                        "top_p" => rp.top_p,
1069                        "top_k" => rp.top_k as f64,
1070                        "repeat_penalty" => rp.repeat_penalty,
1071                        "flash_attn" => {
1072                            if rp.flash_attn {
1073                                1.0
1074                            } else {
1075                                0.0
1076                            }
1077                        }
1078                        "threads" => rp.threads as f64,
1079                        "batch_size" => rp.batch_size as f64,
1080                        "expert_count" => rp.expert_count as f64,
1081                        _ => return None,
1082                    })
1083                })
1084                .collect();
1085
1086            // Group by parameter value and compute mean gen_tps per group
1087            let mut groups: std::collections::HashMap<String, Vec<f64>> =
1088                std::collections::HashMap::new();
1089            for (r, v) in results.iter().zip(values.iter()) {
1090                let key_str = if *key == "flash_attn" {
1091                    if *v > 0.5 {
1092                        "ON".to_string()
1093                    } else {
1094                        "OFF".to_string()
1095                    }
1096                } else {
1097                    format!("{:.2}", v)
1098                };
1099                groups
1100                    .entry(key_str)
1101                    .or_default()
1102                    .push(r.metrics.generation_tps);
1103            }
1104
1105            if groups.len() <= 1 {
1106                return None;
1107            } // Parameter doesn't vary
1108
1109            let group_means: Vec<f64> = groups.values().map(|vals| mean(vals)).collect();
1110            let spread = max_val(&group_means) - min_val(&group_means);
1111            Some((label.to_string(), format!("{:.1}", spread), spread))
1112        })
1113        .collect();
1114
1115    // Sort by impact (spread) descending
1116    let mut impact_sorted = impact_data.clone();
1117    impact_sorted.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
1118
1119    // Consistency indicator (coefficient of variation from per-iteration metrics)
1120    let consistency_data: Vec<f64> = results
1121        .iter()
1122        .map(|r| {
1123            if r.per_iteration_metrics.len() <= 1 {
1124                return 1.0; // No consistency data = neutral
1125            }
1126            let iter_gen_tps: Vec<f64> = r
1127                .per_iteration_metrics
1128                .iter()
1129                .map(|m| m.generation_tps)
1130                .collect();
1131            let iter_mean = mean(&iter_gen_tps);
1132            if iter_mean == 0.0 {
1133                return 1.0;
1134            }
1135            let iter_std = std_dev(&iter_gen_tps, iter_mean);
1136            let cv = iter_std / iter_mean; // Coefficient of variation
1137            // Map CV to 0-1 score (lower CV = more consistent = higher score)
1138            // CV of 0% = 1.0 (perfect), CV of 20%+ = 0.0 (poor)
1139            (1.0 - (cv * 5.0)).clamp(0.0, 1.0)
1140        })
1141        .collect();
1142
1143    // Top N for charts
1144    let top_n = std::cmp::min(20, total_tests);
1145    let top_indices: Vec<(usize, usize)> = (0..total_tests)
1146        .map(|i| (i, results[i].metrics.generation_tps))
1147        .enumerate()
1148        .take(top_n)
1149        .map(|(rank, (idx, _))| (rank + 1, idx))
1150        .collect();
1151
1152    let top_labels: Vec<String> = top_indices
1153        .iter()
1154        .map(|(_rank, idx)| {
1155            let base = results[*idx].base_settings.as_ref().unwrap();
1156            let rp = resolve_params(&results[*idx].params, base);
1157            format!("T={:.2} TP={:.2}", rp.temperature, rp.top_p)
1158        })
1159        .collect();
1160    let top_gen_tps: Vec<f64> = top_indices
1161        .iter()
1162        .map(|(_, idx)| results[*idx].metrics.generation_tps)
1163        .collect();
1164
1165    // Scatter data with labels
1166    let scatter_gen_tps: Vec<f64> = results.iter().map(|r| r.metrics.generation_tps).collect();
1167    let scatter_latency: Vec<f64> = results
1168        .iter()
1169        .map(|r| r.metrics.latency_per_token)
1170        .collect();
1171    let scatter_first_token: Vec<f64> =
1172        results.iter().map(|r| r.metrics.first_token_time).collect();
1173
1174    let param_headers: Vec<String> = vec![
1175        "Temp".to_string(),
1176        "Top-P".to_string(),
1177        "Top-K".to_string(),
1178        "RepPen".to_string(),
1179        "FA".to_string(),
1180        "Threads".to_string(),
1181        "Batch".to_string(),
1182        "Exp".to_string(),
1183        "Spec".to_string(),
1184        "Draft".to_string(),
1185    ];
1186    let param_vals: Vec<Vec<String>> = results
1187        .iter()
1188        .map(|r| {
1189            let base = r.base_settings.as_ref().unwrap();
1190            let rp = resolve_params(&r.params, base);
1191            vec![
1192                format!("{:.2}", rp.temperature),
1193                format!("{:.2}", rp.top_p),
1194                rp.top_k.to_string(),
1195                format!("{:.2}", rp.repeat_penalty),
1196                if rp.flash_attn {
1197                    "ON".to_string()
1198                } else {
1199                    "OFF".to_string()
1200                },
1201                rp.threads.to_string(),
1202                rp.batch_size.to_string(),
1203                rp.expert_count.to_string(),
1204                if rp.spec_type.is_empty() {
1205                    "-".to_string()
1206                } else {
1207                    rp.spec_type.clone()
1208                },
1209                rp.draft_tokens.to_string(),
1210            ]
1211        })
1212        .collect();
1213
1214    // Build metrics JSON with consistency data
1215    let metrics_data: Vec<serde_json::Value> = results
1216        .iter()
1217        .enumerate()
1218        .map(|(i, r)| {
1219            let base = r.base_settings.as_ref().unwrap();
1220            let rp = resolve_params(&r.params, base);
1221            serde_json::json!({
1222                "idx": i,
1223                "temp": rp.temperature,
1224                "top_p": rp.top_p,
1225                "top_k": rp.top_k,
1226                "repeat_penalty": rp.repeat_penalty,
1227                "flash_attn": rp.flash_attn,
1228                "threads": rp.threads,
1229                "batch_size": rp.batch_size,
1230                "expert_count": rp.expert_count,
1231                "spec_type": rp.spec_type,
1232                "draft_tokens": rp.draft_tokens,
1233                "prompt_tps": r.metrics.prompt_tps,
1234                "generation_tps": r.metrics.generation_tps,
1235                "combined_tps": r.metrics.combined_tps,
1236                "latency_per_token": r.metrics.latency_per_token,
1237                "first_token_time": r.metrics.first_token_time,
1238                "consistency": consistency_data[i],
1239                "outputs": r.outputs,
1240                "per_iteration_metrics": r.per_iteration_metrics.iter().map(|m| {
1241                    serde_json::json!({
1242                        "prompt_tps": m.prompt_tps,
1243                        "generation_tps": m.generation_tps,
1244                        "combined_tps": m.combined_tps,
1245                        "latency_per_token": m.latency_per_token,
1246                        "first_token_time": m.first_token_time,
1247                    })
1248                }).collect::<Vec<_>>(),
1249                "server_command": r.server_command.as_deref().unwrap_or("-"),
1250            })
1251        })
1252        .collect();
1253
1254    // Scatter data with labels for tooltips
1255    let scatter_data_json = serde_json::to_string(
1256        &scatter_gen_tps
1257            .iter()
1258            .zip(scatter_latency.iter())
1259            .zip(scatter_first_token.iter())
1260            .map(|((g, l), f)| {
1261                let mut s = String::from("{x:");
1262                s.push_str(&format!("{:.2}", g));
1263                s.push_str(",y:");
1264                s.push_str(&format!("{:.2}", l));
1265                s.push_str(",ft:");
1266                s.push_str(&format!("{:.2}", f));
1267                s.push('}');
1268                s
1269            })
1270            .collect::<Vec<_>>(),
1271    )
1272    .unwrap();
1273    let scatter_data2_json = serde_json::to_string(
1274        &scatter_gen_tps
1275            .iter()
1276            .zip(scatter_first_token.iter())
1277            .map(|(g, f)| {
1278                let mut s = String::from("{x:");
1279                s.push_str(&format!("{:.2}", g));
1280                s.push_str(",y:");
1281                s.push_str(&format!("{:.2}", f));
1282                s.push_str(",lat:");
1283                s.push_str(&format!("{:.2}", min_val(&latency)));
1284                s.push('}');
1285                s
1286            })
1287            .collect::<Vec<_>>(),
1288    )
1289    .unwrap();
1290
1291    // Model metadata JSON
1292    let model_meta_json = model_info.as_ref().map(|(name, _size, settings)| {
1293        serde_json::json!({
1294            "model_name": name,
1295            "context_length": settings.context_length,
1296            "threads": settings.threads,
1297            "temperature": settings.temperature,
1298            "top_p": settings.top_p,
1299            "top_k": settings.top_k,
1300            "repeat_penalty": settings.repeat_penalty,
1301            "flash_attn": settings.flash_attn,
1302            "kv_cache_offload": settings.kv_cache_offload,
1303            "mlock": settings.mlock,
1304            "system_prompt": settings.system_prompt,
1305        })
1306    });
1307
1308    // Impact analysis JSON
1309    let _impact_json = serde_json::to_string(&impact_sorted).unwrap();
1310
1311    // Column definitions for visibility toggle
1312    let column_defs_json = serde_json::to_string(&vec![
1313        ("col-rank", "#", true),
1314        ("col-temp", "Temp", true),
1315        ("col-top-p", "Top-P", true),
1316        ("col-top-k", "Top-K", true),
1317        ("col-rep-pen", "RepPen", true),
1318        ("col-fa", "FA", true),
1319        ("col-threads", "Threads", true),
1320        ("col-batch", "Batch", true),
1321        ("col-exp", "Exp", true),
1322        ("col-spec", "Spec", true),
1323        ("col-draft", "Draft", true),
1324        ("col-gen-tps", "Gen t/s", true),
1325        ("col-prompt-tps", "Prompt t/s", true),
1326        ("col-latency", "Latency", true),
1327        ("col-first-token", "First Tok", true),
1328        ("col-combined", "Combined", true),
1329        ("col-consistency", "Consistency", true),
1330    ])
1331    .unwrap();
1332
1333    // CSV data
1334    let csv_header = "Rank,Temp,Top-P,Top-K,RepPen,FA,Threads,Batch,Exp,Spec,Draft,Gen t/s,Prompt t/s,Latency (ms),First Tok (ms),Combined,Consistency";
1335    let csv_rows: Vec<String> = (0..total_tests)
1336        .map(|i| {
1337            let d = &metrics_data[i];
1338            let rank = i + 1;
1339            let spec = d
1340                .get("spec_type")
1341                .map(|v| v.as_str().unwrap_or("-"))
1342                .unwrap_or("-")
1343                .to_string();
1344            let draft = d
1345                .get("draft_tokens")
1346                .map(|v| v.as_u64().unwrap_or(0).to_string())
1347                .unwrap_or("-".to_string());
1348            format!(
1349                "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{:.1}",
1350                rank,
1351                d["temp"].as_f64().unwrap_or(0.0),
1352                d["top_p"].as_f64().unwrap_or(0.0),
1353                d["top_k"].as_i64().unwrap_or(0),
1354                d["repeat_penalty"].as_f64().unwrap_or(0.0),
1355                if d["flash_attn"].as_bool().unwrap_or(false) {
1356                    "ON"
1357                } else {
1358                    "OFF"
1359                },
1360                d["threads"].as_u64().unwrap_or(0),
1361                d["batch_size"].as_u64().unwrap_or(0),
1362                d["expert_count"].as_i64().unwrap_or(0),
1363                spec,
1364                draft,
1365                d["generation_tps"].as_f64().unwrap_or(0.0),
1366                d["prompt_tps"].as_f64().unwrap_or(0.0),
1367                d["latency_per_token"].as_f64().unwrap_or(0.0),
1368                d["first_token_time"].as_f64().unwrap_or(0.0),
1369                d["combined_tps"].as_f64().unwrap_or(0.0),
1370                d["consistency"].as_f64().unwrap_or(1.0)
1371            )
1372        })
1373        .collect();
1374    let csv_content = format!("{}\n{}", csv_header, csv_rows.join("\n"));
1375    let csv_b64 = base64_encode(&csv_content);
1376
1377    let metrics_json = serde_json::to_string(&metrics_data).unwrap();
1378    let param_headers_json = serde_json::to_string(&param_headers).unwrap();
1379    let param_vals_json = serde_json::to_string(&param_vals).unwrap();
1380    let top_labels_json = serde_json::to_string(&top_labels).unwrap();
1381    let top_gen_tps_json = serde_json::to_string(&top_gen_tps).unwrap();
1382
1383    // Build model metadata HTML
1384    let model_meta_html = model_info
1385        .as_ref()
1386        .map(|(name, _size, s)| {
1387            format!(
1388                r#"
1389<div class="meta-section">
1390<h2>Model &amp; Configuration</h2>
1391<div class="meta-grid">
1392<div class="meta-item"><div class="ml">Model</div><div class="mv">{}</div></div>
1393<div class="meta-item"><div class="ml">Context</div><div class="mv">{}</div></div>
1394<div class="meta-item"><div class="ml">Threads</div><div class="mv">{}</div></div>
1395<div class="meta-item"><div class="ml">Flash Attention</div><div class="mv">{}</div></div>
1396<div class="meta-item"><div class="ml">KV Cache Offload</div><div class="mv">{}</div></div>
1397<div class="meta-item"><div class="ml">MLOCK</div><div class="mv">{}</div></div>
1398<div class="meta-item"><div class="ml">Prompt</div><div class="mv meta-prompt">{}</div></div>
1399</div>
1400</div>"#,
1401                escape_html(name),
1402                s.context_length,
1403                s.threads,
1404                if s.flash_attn { "ON" } else { "OFF" },
1405                if s.kv_cache_offload { "ON" } else { "OFF" },
1406                if s.mlock { "ON" } else { "OFF" },
1407                escape_html(&s.system_prompt.chars().take(100).collect::<String>())
1408            )
1409        })
1410        .unwrap_or_default();
1411
1412    // Build winner section HTML
1413    let winner_html = best_idx.and_then(|idx| {
1414        let r = &results[idx];
1415        let base = r.base_settings.as_ref()?;
1416        let rp = resolve_params(&r.params, base);
1417        let m = &r.metrics;
1418        Some(format!(r#"
1419<div class="winner-section">
1420<div class="winner-icon">&#127942;</div>
1421<div class="winner-content">
1422<div class="winner-title">Best Configuration</div>
1423<div class="winner-metrics">
1424<div class="winner-metric"><span class="wm-label">Gen t/s</span><span class="wm-value" style="color:#3fb950;font-size:1.8em;">{:.2}</span></div>
1425<div class="winner-metric"><span class="wm-label">Prompt t/s</span><span class="wm-value">{:.2}</span></div>
1426<div class="winner-metric"><span class="wm-label">Latency</span><span class="wm-value">{:.2}ms</span></div>
1427<div class="winner-metric"><span class="wm-label">First Token</span><span class="wm-value">{:.0}ms</span></div>
1428</div>
1429<div class="winner-params">Temp: {:.2} &middot; Top-P: {:.2} &middot; Top-K: {} &middot; RepPen: {:.2} &middot; FA: {} &middot; Threads: {} &middot; Batch: {} &middot; Exp: {} &middot; Spec: {} &middot; Draft: {}</div>
1430</div>
1431</div>"#,
1432                m.generation_tps, m.prompt_tps, m.latency_per_token, m.first_token_time,
1433                rp.temperature, rp.top_p, rp.top_k, rp.repeat_penalty,
1434                if rp.flash_attn { "ON" } else { "OFF" }, rp.threads,
1435                rp.batch_size, rp.expert_count,
1436                if rp.spec_type.is_empty() { "Off".to_string() } else { rp.spec_type.clone() }, rp.draft_tokens
1437            ))
1438    }).unwrap_or_default();
1439
1440    // Build impact analysis HTML
1441    let impact_html = if !impact_sorted.is_empty() {
1442        let max_impact = impact_sorted[0].2;
1443        let rows: String = impact_sorted
1444            .iter()
1445            .map(|(label, spread, value)| {
1446                let bar_width = if max_impact > 0.0 {
1447                    (value / max_impact * 100.0) as i32
1448                } else {
1449                    0
1450                };
1451                let bar_color = if *value > max_impact * 0.7 {
1452                    "#f85149"
1453                } else if *value > max_impact * 0.4 {
1454                    "#d29922"
1455                } else {
1456                    "#3fb950"
1457                };
1458                format!(
1459                    r#"<div class="impact-row">
1460<div class="impact-label">{}</div>
1461<div class="impact-bar-bg"><div class="impact-bar-fill" style="width:{}%;background:{}"></div></div>
1462<div class="impact-value">{}</div>
1463</div>"#,
1464                    label, bar_width, bar_color, spread
1465                )
1466            })
1467            .collect();
1468        format!(
1469            r#"
1470<div class="impact-section">
1471<h2>Parameter Impact Analysis</h2>
1472<p class="impact-desc">Larger spread in generation throughput between parameter values indicates greater impact on performance.</p>
1473{}
1474</div>"#,
1475            rows
1476        )
1477    } else {
1478        r#"<div class="impact-section"><h2>Parameter Impact Analysis</h2><p class="impact-desc">All parameters were held constant — no impact data available.</p></div>"#.to_string()
1479    };
1480
1481    // Empty state
1482    let empty_html = if total_tests == 0 {
1483        r#"<div class="empty-state">
1484<div class="empty-icon">&#128202;</div>
1485<div class="empty-title">No Results</div>
1486<div class="empty-text">Run a benchmark tuning test to generate results here.</div>
1487</div>"#
1488    } else {
1489        ""
1490    };
1491
1492    let html = include_str!("benchmark_report.html");
1493
1494    // Replace placeholders
1495    html.replace("__TIMESTAMP__", &timestamp)
1496        .replace("__TOTAL_TESTS__", &total_tests.to_string())
1497        .replace("__EMPTY_STATE__", empty_html)
1498        .replace("__MODEL_META__", &model_meta_html)
1499        .replace("__WINNER__", &winner_html)
1500        .replace("__AVG_GEN_TPS__", &format!("{:.1}", avg_gen_tps))
1501        .replace(
1502            "__MED_GEN_TPS__",
1503            &format!("{:.1}", median(&mut gen_tps_sorted)),
1504        )
1505        .replace("__GEN_STD__", &format!("{:.1}", gen_std))
1506        .replace("__MIN_GEN__", &format!("{:.1}", min_gen_tps))
1507        .replace("__MAX_GEN__", &format!("{:.1}", best_gen_tps))
1508        .replace("__AVG_PROMPT_TPS__", &format!("{:.1}", avg_prompt_tps))
1509        .replace(
1510            "__MED_PROMPT_TPS__",
1511            &format!("{:.1}", median(&mut prompt_tps)),
1512        )
1513        .replace("__PROMPT_STD__", &format!("{:.1}", prompt_std))
1514        .replace("__MIN_PROMPT__", &format!("{:.1}", min_prompt_tps))
1515        .replace("__MAX_PROMPT__", &format!("{:.1}", best_prompt_tps))
1516        .replace("__AVG_LATENCY__", &format!("{:.1}ms", avg_latency))
1517        .replace(
1518            "__MED_LATENCY__",
1519            &format!("{:.1}ms", median(&mut latency_sorted)),
1520        )
1521        .replace("__LAT_STD__", &format!("{:.1}", lat_std))
1522        .replace("__MIN_LAT__", &format!("{:.1}", min_latency))
1523        .replace("__MAX_LAT__", &format!("{:.1}", best_latency))
1524        .replace("__AVG_FT__", &format!("{:.0}ms", avg_first_token))
1525        .replace("__MED_FT__", &format!("{:.0}ms", median(&mut first_token)))
1526        .replace("__FT_STD__", &format!("{:.0}", ft_std))
1527        .replace("__MIN_FT__", &format!("{:.0}ms", min_first_token))
1528        .replace("__MAX_FT__", &format!("{:.0}ms", best_first_token))
1529        .replace("__BEST_GEN__", &format!("{:.1}", best_gen_tps))
1530        .replace("__TOP_N__", &top_n.to_string())
1531        .replace("__IMPACT_HTML__", &impact_html)
1532        .replace("__METRICS_JSON__", &metrics_json)
1533        .replace("__PARAM_HEADERS_JSON__", &param_headers_json)
1534        .replace("__PARAM_VALS_JSON__", &param_vals_json)
1535        .replace("__TOP_LABELS_JSON__", &top_labels_json)
1536        .replace("__TOP_GEN_TPS_JSON__", &top_gen_tps_json)
1537        .replace("__SCATTER_DATA_JSON__", &scatter_data_json)
1538        .replace("__SCATTER_DATA2_JSON__", &scatter_data2_json)
1539        .replace("__COLUMN_DEFS_JSON__", &column_defs_json)
1540        .replace("__CSV_B64__", &csv_b64)
1541        .replace(
1542            "__MODEL_META_JSON__",
1543            &serde_json::to_string(&model_meta_json).unwrap(),
1544        )
1545}
1546
/// Escape HTML special characters.
///
/// Replaces `&`, `<`, `>`, `"` and `'` with `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&#39;`
/// so the result can be placed in element text or in a single- or double-quoted attribute
/// value. Every other character is copied through unchanged. Input that already contains
/// entities is escaped again (`&amp;` becomes `&amp;amp;`).
fn escape_html(s: &str) -> String {
    // One pass with a single output buffer instead of one intermediate string per character
    // class; the capacity is a lower bound because escapes only ever lengthen the text.
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1554
/// Encode the UTF-8 bytes of `input` as standard base64 (alphabet `A-Z a-z 0-9 + /`),
/// padding the final group with `=`. Implemented by hand so no extra crate is needed.
fn base64_encode(input: &str) -> String {
    const ALPHABET: &[u8; 64] =
        b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    // Pick the 6-bit group of `word` that starts `shift` bits from the right and map it to
    // its alphabet character.
    let sextet = |word: u32, shift: u32| ALPHABET[((word >> shift) & 0x3F) as usize] as char;

    let mut encoded = String::new();
    for group in input.as_bytes().chunks(3) {
        // Each group of up to three bytes is packed into the low 24 bits of `word`; missing
        // trailing bytes count as zero and their output positions become '='.
        match *group {
            [first, second, third] => {
                let word =
                    (u32::from(first) << 16) | (u32::from(second) << 8) | u32::from(third);
                encoded.push(sextet(word, 18));
                encoded.push(sextet(word, 12));
                encoded.push(sextet(word, 6));
                encoded.push(sextet(word, 0));
            }
            [first, second] => {
                let word = (u32::from(first) << 16) | (u32::from(second) << 8);
                encoded.push(sextet(word, 18));
                encoded.push(sextet(word, 12));
                encoded.push(sextet(word, 6));
                encoded.push('=');
            }
            [first] => {
                let word = u32::from(first) << 16;
                encoded.push(sextet(word, 18));
                encoded.push(sextet(word, 12));
                encoded.push_str("==");
            }
            _ => unreachable!("chunks(3) only yields groups of one to three bytes"),
        }
    }
    encoded
}
1580
/// Measurements and generated text captured from one inference request.
struct InferenceResult {
    /// Number of prompt tokens reported for the request.
    prompt_tokens: u64,
    /// Number of generated tokens reported for the request.
    generation_tokens: u64,
    /// Time attributed to prompt processing.
    prompt_time: Duration,
    /// Time attributed to token generation.
    generation_time: Duration,
    /// Overall time for the request.
    total_time: Duration,
    /// Time until the first token, in milliseconds.
    first_token_time: u128,
    /// Text content returned by the request.
    content: String,
}
llm_manager/backend/benchmark.rs

llm_manager/backend/
benchmark.rs