1use std::path::PathBuf;
2use std::time::{Duration, Instant};
3
4use tokio::sync::{mpsc, watch};
5
6use crate::backend::server::{spawn_server, SpawnServerRequest};
7use crate::models::{
8 BenchTuneConfig, BenchTuneMetrics, BenchTuneMode, BenchTuneParamValue, BenchTuneResult,
9 BenchTuneStatus, DiscoveredModel, ModelSettings, ServerMode,
10};
11
/// Upper bound on the number of health-check polls made while waiting for a
/// freshly spawned server to come up.
const HEALTH_CHECK_ITERATIONS: u32 = 120;
/// Pause between two consecutive health-check polls, in milliseconds.
const HEALTH_CHECK_INTERVAL_MS: u64 = 500;
/// A "still waiting" log line is emitted once every this many polls.
const HEALTH_CHECK_LOG_INTERVAL: u32 = 10;
/// Timeout configured on the HTTP client used for inference requests, in seconds.
const REQUEST_TIMEOUT_SECS: u64 = 120;
17
18struct BenchAccumulator {
19 params: BenchTuneParamValue,
20 total_prompt_tokens: u64,
21 total_generation_tokens: u64,
22 total_prompt_time: Duration,
23 total_generation_time: Duration,
24 total_time: Duration,
25 first_token_times: Vec<u128>,
26 outputs: Vec<String>,
27 per_iteration_metrics: Vec<BenchTuneMetrics>,
28 base_settings: Option<ModelSettings>,
29}
30
31fn build_bench_result(acc: BenchAccumulator) -> BenchTuneResult {
32 let BenchAccumulator {
33 params,
34 total_prompt_tokens,
35 total_generation_tokens,
36 total_prompt_time,
37 total_generation_time,
38 total_time,
39 first_token_times,
40 outputs,
41 per_iteration_metrics,
42 base_settings,
43 } = acc;
44 let prompt_tps = if total_prompt_time.as_secs_f64() > 0.0 {
45 (total_prompt_tokens as f64) / total_prompt_time.as_secs_f64()
46 } else {
47 0.0
48 };
49
50 let generation_tps = if total_generation_time.as_secs_f64() > 0.0 {
51 (total_generation_tokens as f64) / total_generation_time.as_secs_f64()
52 } else {
53 0.0
54 };
55
56 let combined_tps = if total_time.as_secs_f64() > 0.0 {
57 ((total_prompt_tokens + total_generation_tokens) as f64) / total_time.as_secs_f64()
58 } else {
59 0.0
60 };
61
62 let avg_latency_per_token = if total_generation_tokens > 0 {
63 total_generation_time.as_millis() as f64 / (total_generation_tokens as f64)
64 } else {
65 0.0
66 };
67
68 let avg_first_token_time = if !first_token_times.is_empty() {
69 first_token_times.iter().sum::<u128>() as f64 / first_token_times.len() as f64
70 } else {
71 0.0
72 };
73
74 BenchTuneResult {
75 params,
76 metrics: BenchTuneMetrics {
77 prompt_tps,
78 generation_tps,
79 combined_tps,
80 latency_per_token: avg_latency_per_token,
81 first_token_time: avg_first_token_time,
82 },
83 outputs,
84 per_iteration_metrics,
85 base_settings,
86 }
87}
88
89pub struct BenchTuneRequest<'a> {
90 pub main_config: &'a crate::config::Config,
91 pub config: &'a BenchTuneConfig,
92 pub model: &'a DiscoveredModel,
93 pub settings: &'a ModelSettings,
94 pub progress_tx: mpsc::Sender<BenchTuneStatus>,
95 pub log_tx: mpsc::Sender<String>,
96 pub cancel_rx: &'a mut watch::Receiver<bool>,
97}
98
99pub async fn run_bench_tune(
101 req: BenchTuneRequest<'_>,
102) -> Result<Vec<BenchTuneResult>, Box<dyn std::error::Error + Send + Sync>> {
103 let BenchTuneRequest {
104 main_config,
105 config,
106 model,
107 settings,
108 progress_tx,
109 log_tx,
110 cancel_rx,
111 } = req;
112 let start_time = Instant::now();
113 let total_tests = config.get_total_tests_count();
114
115 if total_tests > 500 {
117 let _ = log_tx
118 .send(format!(
119 "WARNING: Benchmark will run {} combinations. This may take a long time.",
120 total_tests
121 ))
122 .await;
123 }
124
125 let combinations = config.generate_combinations();
127
128 let mut results = Vec::new();
130 let mut failed_tests: Vec<(usize, String)> = Vec::new();
131
132 let mut settings = settings.clone();
134 if let Some(kwargs) = &config.chat_template_kwargs {
135 settings.chat_template_kwargs = Some(kwargs.clone());
136 }
137
138 let client = reqwest::Client::builder()
140 .timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
141 .build()?;
142
143 if config.bench_mode == BenchTuneMode::RuntimeOnly {
145 let (exit_tx, _exit_rx) = tokio::sync::mpsc::channel(1);
147 let (server_handle, _command) = spawn_server(SpawnServerRequest {
148 config: main_config,
149 model: Some(model),
150 settings: &settings,
151 log_tx: log_tx.clone(),
152 progress_tx: None,
153 server_mode: ServerMode::Normal,
154 router_max_models: 1,
155 exit_tx,
156 })
157 .await?;
158
159 let host = if server_handle.host == "0.0.0.0" {
160 "127.0.0.1"
161 } else {
162 &server_handle.host
163 };
164
165 for i in 0..HEALTH_CHECK_ITERATIONS {
167 if *cancel_rx.borrow() {
168 let _ = crate::backend::server::kill_server(server_handle).await;
169 let elapsed = start_time.elapsed();
170 progress_tx
171 .send(BenchTuneStatus::Cancelled {
172 total_tests,
173 successful_tests: results.len(),
174 failed_tests: failed_tests.len(),
175 elapsed,
176 })
177 .await?;
178 return Ok(results);
179 }
180 if crate::backend::server::check_health(host, server_handle.port).await {
181 break;
182 }
183 if i % HEALTH_CHECK_LOG_INTERVAL == 0 && i > 0 {
184 let _ = log_tx
185 .send(format!(
186 " ... still waiting ({:.0}s)...",
187 i as f32 * (HEALTH_CHECK_INTERVAL_MS as f32 / 1000.0)
188 ))
189 .await;
190 }
191 tokio::time::sleep(Duration::from_millis(HEALTH_CHECK_INTERVAL_MS)).await;
192 }
193
194 let server_port = server_handle.port;
195 let server_host = host.to_string();
196
197 for (idx, combination) in combinations.iter().enumerate() {
198 if *cancel_rx.borrow() {
200 let _ = crate::backend::server::kill_server(server_handle).await;
201 let elapsed = start_time.elapsed();
202 progress_tx
203 .send(BenchTuneStatus::Cancelled {
204 total_tests,
205 successful_tests: results.len(),
206 failed_tests: failed_tests.len(),
207 elapsed,
208 })
209 .await?;
210 return Ok(results);
211 }
212
213 let progress = (idx as f32 / total_tests as f32) * 100.0;
214 progress_tx
215 .send(BenchTuneStatus::Running {
216 current: idx + 1,
217 total: total_tests,
218 progress,
219 current_params: combination.clone(),
220 })
221 .await?;
222
223 let result = run_bench_tune_runtime_only(RuntimeOnlyCtx {
224 params: combination,
225 settings: &settings,
226 num_iterations: config.num_iterations,
227 prompt: config.prompt.clone(),
228 server_host: &server_host,
229 server_port,
230 log_tx: log_tx.clone(),
231 config,
232 client: &client,
233 })
234 .await;
235
236 match result {
237 Ok(test_result) => results.push(test_result),
238 Err(e) => {
239 failed_tests.push((idx + 1, e.to_string()));
240 let _ = log_tx
241 .send(format!(
242 "Benchmark test {}/{} failed: {}",
243 idx + 1,
244 total_tests,
245 e
246 ))
247 .await;
248 }
249 }
250 }
251
252 let _ = crate::backend::server::kill_server(server_handle).await;
253 } else {
254 for (idx, combination) in combinations.iter().enumerate() {
256 if *cancel_rx.borrow() {
258 let elapsed = start_time.elapsed();
259 progress_tx
260 .send(BenchTuneStatus::Cancelled {
261 total_tests,
262 successful_tests: results.len(),
263 failed_tests: failed_tests.len(),
264 elapsed,
265 })
266 .await?;
267 return Ok(results);
268 }
269
270 let progress = (idx as f32 / total_tests as f32) * 100.0;
271 progress_tx
272 .send(BenchTuneStatus::Running {
273 current: idx + 1,
274 total: total_tests,
275 progress,
276 current_params: combination.clone(),
277 })
278 .await?;
279
280 let result = run_bench_tune_single_test(SingleTestCtx {
281 main_config,
282 params: combination,
283 model,
284 base_settings: &settings,
285 num_iterations: config.num_iterations,
286 prompt: config.prompt.clone(),
287 log_tx: log_tx.clone(),
288 config,
289 client: &client,
290 })
291 .await;
292
293 match result {
294 Ok(test_result) => results.push(test_result),
295 Err(e) => {
296 failed_tests.push((idx + 1, e.to_string()));
297 let _ = log_tx
298 .send(format!(
299 "Benchmark test {}/{} failed: {}",
300 idx + 1,
301 total_tests,
302 e
303 ))
304 .await;
305 }
306 }
307 }
308 }
309
310 results.sort_by(|a, b| {
312 b.metrics
313 .combined_tps
314 .partial_cmp(&a.metrics.combined_tps)
315 .unwrap_or(std::cmp::Ordering::Equal)
316 });
317
318 let elapsed = start_time.elapsed();
319 let successful_tests = results.len();
320 let failed_count = failed_tests.len();
321
322 if failed_count > 0 {
324 progress_tx
325 .send(BenchTuneStatus::PartiallyCompleted {
326 total_tests,
327 successful_tests,
328 failed_tests: failed_count,
329 elapsed,
330 })
331 .await?;
332 } else {
333 progress_tx
334 .send(BenchTuneStatus::Completed {
335 total_tests,
336 successful_tests,
337 elapsed,
338 })
339 .await?;
340 }
341
342 Ok(results)
343}
344
345struct IterationLoopCtx<'a> {
347 prompt: &'a str,
348 host: &'a str,
349 port: u16,
350 params: &'a BenchTuneParamValue,
351 num_iterations: u32,
352 config: &'a BenchTuneConfig,
353 client: &'a reqwest::Client,
354 log_tx: mpsc::Sender<String>,
355 log_prefix: &'a str,
356}
357
358async fn run_iteration_loop(
360 ctx: IterationLoopCtx<'_>,
361) -> Result<BenchTuneResult, Box<dyn std::error::Error + Send + Sync>> {
362 let IterationLoopCtx {
363 prompt,
364 host,
365 port,
366 params,
367 num_iterations,
368 config,
369 client,
370 log_tx,
371 log_prefix,
372 } = ctx;
373 let mut total_prompt_tokens = 0u64;
374 let mut total_generation_tokens = 0u64;
375 let mut total_prompt_time = Duration::ZERO;
376 let mut total_generation_time = Duration::ZERO;
377 let mut total_time = Duration::ZERO;
378 let mut first_token_times = Vec::new();
379 let mut outputs = Vec::new();
380 let mut per_iteration_metrics = Vec::new();
381
382 let _ = log_tx
383 .send(format!(
384 "Running {} inference iterations {}...",
385 num_iterations, log_prefix
386 ))
387 .await;
388
389 for i in 0..num_iterations {
390 let result = send_inference_request(prompt, host, port, params, config, client).await;
391
392 match result {
393 Ok(res) => {
394 total_prompt_tokens += res.prompt_tokens;
395 total_generation_tokens += res.generation_tokens;
396 total_prompt_time += res.prompt_time;
397 total_generation_time += res.generation_time;
398 total_time += res.total_time;
399 first_token_times.push(res.first_token_time);
400 outputs.push(res.content.clone());
401
402 let iter_prompt_tps = if res.prompt_time.as_secs_f64() > 0.0 {
403 res.prompt_tokens as f64 / res.prompt_time.as_secs_f64()
404 } else {
405 0.0
406 };
407 let iter_gen_tps = if res.generation_time.as_secs_f64() > 0.0 {
408 res.generation_tokens as f64 / res.generation_time.as_secs_f64()
409 } else {
410 0.0
411 };
412 let iter_latency = if res.generation_tokens > 0 {
413 res.generation_time.as_millis() as f64 / res.generation_tokens as f64
414 } else {
415 0.0
416 };
417
418 per_iteration_metrics.push(BenchTuneMetrics {
419 prompt_tps: iter_prompt_tps,
420 generation_tps: iter_gen_tps,
421 combined_tps: 0.0,
422 latency_per_token: iter_latency,
423 first_token_time: res.first_token_time as f64,
424 });
425
426 if num_iterations > 1 {
427 let _ = log_tx
428 .send(format!(
429 " Iteration {}/{}: {:.2} gen t/s",
430 i + 1,
431 num_iterations,
432 iter_gen_tps
433 ))
434 .await;
435 }
436
437 let _ = log_tx
438 .send(format!(
439 "--- Generated Output (Iter {}) ---\n{}\n----------------------------------",
440 i + 1,
441 res.content
442 ))
443 .await;
444 }
445 Err(e) => {
446 let _ = log_tx
447 .send(format!(
448 " Iteration {}/{} FAILED: {}",
449 i + 1,
450 num_iterations,
451 e
452 ))
453 .await;
454 if i == 0 {
455 return Err(format!("Inference failed: {}", e).into());
456 }
457 }
458 }
459 }
460
461 Ok(build_bench_result(BenchAccumulator {
462 params: params.clone(),
463 total_prompt_tokens,
464 total_generation_tokens,
465 total_prompt_time,
466 total_generation_time,
467 total_time,
468 first_token_times,
469 outputs,
470 per_iteration_metrics,
471 base_settings: None,
472 }))
473}
474
475struct RuntimeOnlyCtx<'a> {
476 params: &'a BenchTuneParamValue,
477 settings: &'a ModelSettings,
478 num_iterations: u32,
479 prompt: String,
480 server_host: &'a str,
481 server_port: u16,
482 log_tx: mpsc::Sender<String>,
483 config: &'a BenchTuneConfig,
484 client: &'a reqwest::Client,
485}
486
487async fn run_bench_tune_runtime_only(
489 ctx: RuntimeOnlyCtx<'_>,
490) -> Result<BenchTuneResult, Box<dyn std::error::Error + Send + Sync>> {
491 let RuntimeOnlyCtx {
492 params,
493 settings,
494 num_iterations,
495 prompt,
496 server_host,
497 server_port,
498 log_tx,
499 config,
500 client,
501 } = ctx;
502 run_iteration_loop(IterationLoopCtx {
503 prompt: &prompt,
504 host: server_host,
505 port: server_port,
506 params,
507 num_iterations,
508 config,
509 client,
510 log_tx,
511 log_prefix: "(runtime-only mode)",
512 })
513 .await
514 .map(|mut r| {
515 r.base_settings = Some(settings.clone());
516 r
517 })
518}
519
520struct SingleTestCtx<'a> {
521 main_config: &'a crate::config::Config,
522 params: &'a BenchTuneParamValue,
523 model: &'a DiscoveredModel,
524 base_settings: &'a ModelSettings,
525 num_iterations: u32,
526 prompt: String,
527 log_tx: mpsc::Sender<String>,
528 config: &'a BenchTuneConfig,
529 client: &'a reqwest::Client,
530}
531
532async fn run_bench_tune_single_test(
534 ctx: SingleTestCtx<'_>,
535) -> Result<BenchTuneResult, Box<dyn std::error::Error + Send + Sync>> {
536 let SingleTestCtx {
537 main_config,
538 params,
539 model,
540 base_settings,
541 num_iterations,
542 prompt,
543 log_tx,
544 config,
545 client,
546 } = ctx;
547 let mut settings = base_settings.clone();
549
550 if let Some(temperature) = params.temperature {
552 settings.temperature = temperature as f32;
553 }
554 if let Some(top_p) = params.top_p {
555 settings.top_p = top_p as f32;
556 }
557 if let Some(top_k) = params.top_k {
558 settings.top_k = top_k as i32;
559 }
560 if let Some(repeat_penalty) = params.repeat_penalty {
561 settings.repeat_penalty = repeat_penalty as f32;
562 }
563 if let Some(flash_attn) = params.flash_attn {
564 settings.flash_attn = flash_attn;
565 }
566 if let Some(threads) = params.threads {
567 settings.threads = threads;
568 settings.threads_batch = threads; }
570 if let Some(batch_size) = params.batch_size {
571 settings.batch_size = batch_size;
572 settings.ubatch_size = batch_size;
573 }
574 if let Some(expert_count) = params.expert_count {
575 settings.expert_count = expert_count;
576 }
577
578 let (exit_tx, _exit_rx) = tokio::sync::mpsc::channel(1);
580 let (server_handle, _command) = spawn_server(SpawnServerRequest {
581 config: main_config,
582 model: Some(model),
583 settings: &settings,
584 log_tx: log_tx.clone(),
585 progress_tx: None,
586 server_mode: ServerMode::Normal,
587 router_max_models: 1,
588 exit_tx,
589 })
590 .await?;
591 let mut ready = false;
593 let host = if server_handle.host == "0.0.0.0" {
594 "127.0.0.1"
595 } else {
596 &server_handle.host
597 };
598
599 let _ = log_tx
600 .send(format!(
601 "Waiting for server on {}:{}...",
602 host, server_handle.port
603 ))
604 .await;
605
606 for i in 0..HEALTH_CHECK_ITERATIONS {
607 if crate::backend::server::check_health(host, server_handle.port).await {
608 ready = true;
609 break;
610 }
611 if i % HEALTH_CHECK_LOG_INTERVAL == 0 && i > 0 {
612 let _ = log_tx
613 .send(format!(
614 " ... still waiting ({:.0}s)...",
615 i as f32 * (HEALTH_CHECK_INTERVAL_MS as f32 / 1000.0)
616 ))
617 .await;
618 }
619 tokio::time::sleep(Duration::from_millis(HEALTH_CHECK_INTERVAL_MS)).await;
620 }
621
622 if !ready {
623 let _ = log_tx
624 .send("Error: Server health check timed out".to_string())
625 .await;
626 let _ = crate::backend::server::kill_server(server_handle).await;
627 return Err("Server failed to become healthy".into());
628 }
629
630 let result = run_iteration_loop(IterationLoopCtx {
631 prompt: &prompt,
632 host,
633 port: server_handle.port,
634 params,
635 num_iterations,
636 config,
637 client,
638 log_tx,
639 log_prefix: "",
640 })
641 .await;
642
643 let _ = crate::backend::server::kill_server(server_handle).await;
644 tokio::time::sleep(Duration::from_secs(1)).await;
645
646 result.map(|mut r| {
647 r.base_settings = Some(base_settings.clone());
648 r
649 })
650}
651
652async fn send_inference_request(
654 prompt: &str,
655 host: &str,
656 port: u16,
657 params: &BenchTuneParamValue,
658 config: &BenchTuneConfig,
659 client: &reqwest::Client,
660) -> Result<InferenceResult, Box<dyn std::error::Error + Send + Sync>> {
661 let mut body = serde_json::json!({
663 "prompt": prompt,
664 "n_predict": config.n_predict,
665 "stream": false
666 });
667
668 if let Some(temperature) = params.temperature {
669 body["temperature"] = serde_json::json!(temperature);
670 }
671 if let Some(top_p) = params.top_p {
672 body["top_p"] = serde_json::json!(top_p);
673 }
674 if let Some(top_k) = params.top_k {
675 body["top_k"] = serde_json::json!(top_k);
676 }
677 if let Some(repeat_penalty) = params.repeat_penalty {
678 body["repeat_penalty"] = serde_json::json!(repeat_penalty);
679 }
680
681 let url = format!("http://{}:{}/completion", host, port);
682 let start = Instant::now();
683 let resp = client.post(url).json(&body).send().await?;
684
685 if !resp.status().is_success() {
686 let status = resp.status();
687 let body = resp.text().await.unwrap_or_else(|_| "no body".to_string());
688 return Err(format!("Server returned error {}: {}", status, body).into());
689 }
690
691 let total_time = start.elapsed();
692 let json: serde_json::Value = resp.json().await?;
693
694 let prompt_tokens = json["tokens_evaluated"]
696 .as_u64()
697 .or_else(|| json["prompt_n"].as_u64())
698 .unwrap_or(0);
699
700 let generation_tokens = json["tokens_predicted"]
701 .as_u64()
702 .or_else(|| json["predicted_n"].as_u64())
703 .unwrap_or(0);
704
705 let timings = &json["timings"];
706 let prompt_time_ms = timings["prompt_ms"]
707 .as_f64()
708 .or_else(|| timings["prompt_eval_ms"].as_f64())
709 .unwrap_or(0.0);
710
711 let generation_time_ms = timings["predicted_ms"]
712 .as_f64()
713 .or_else(|| timings["eval_ms"].as_f64())
714 .unwrap_or(0.0);
715
716 Ok(InferenceResult {
717 prompt_tokens,
718 generation_tokens,
719 prompt_time: Duration::from_millis(prompt_time_ms as u64),
720 generation_time: Duration::from_millis(generation_time_ms as u64),
721 total_time,
722 first_token_time: prompt_time_ms as u128,
723 content: json["content"].as_str().unwrap_or("").to_string(),
724 })
725}
726
727pub async fn save_results(
729 results: &[BenchTuneResult],
730 output_dir: &PathBuf,
731 config: &BenchTuneConfig,
732) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
733 std::fs::create_dir_all(output_dir)?;
735
736 let timestamp = chrono::Local::now().format("%Y%m%d_%H%M%S");
738 let filename = format!("benchmark_{}.md", timestamp);
739 let filepath = output_dir.join(filename);
740
741 let mut md = String::new();
742 md.push_str("# LLM Benchmark Results\n\n");
743 md.push_str(&format!(
744 "Generated on: {}\n\n",
745 chrono::Local::now().format("%Y-%m-%d %H:%M:%S")
746 ));
747
748 md.push_str("| Temp | Top-P | Top-K | RepPen | FA | Threads | Batch | Exp | Spec | Draft | Prompt t/s | Gen t/s | Latency (ms) | First Tok (ms) |\n");
749 md.push_str("|------|-------|-------|--------|----|---------|-------|-----|------|-------|------------|---------|--------------|----------------|\n");
750
751 for r in results {
752 let temp = r
753 .params
754 .temperature
755 .map(|v| format!("{:.2}", v))
756 .unwrap_or_else(|| "-".to_string());
757 let top_p = r
758 .params
759 .top_p
760 .map(|v| format!("{:.2}", v))
761 .unwrap_or_else(|| "-".to_string());
762 let top_k = r
763 .params
764 .top_k
765 .map(|v| v.to_string())
766 .unwrap_or_else(|| "-".to_string());
767 let rep_pen = r
768 .params
769 .repeat_penalty
770 .map(|v| format!("{:.2}", v))
771 .unwrap_or_else(|| "-".to_string());
772 let fa = r
773 .params
774 .flash_attn
775 .map(|v| if v { "ON" } else { "OFF" })
776 .unwrap_or("-");
777 let threads = r
778 .params
779 .threads
780 .map(|v| v.to_string())
781 .unwrap_or_else(|| "-".to_string());
782 let batch = r
783 .params
784 .batch_size
785 .map(|v| v.to_string())
786 .unwrap_or_else(|| "-".to_string());
787 let exp = r
788 .params
789 .expert_count
790 .map(|v| v.to_string())
791 .unwrap_or_else(|| "-".to_string());
792
793 let spec = r
794 .params
795 .spec_type
796 .as_ref()
797 .map(|s| {
798 if s.is_empty() {
799 "-".to_string()
800 } else {
801 s.clone()
802 }
803 })
804 .unwrap_or_else(|| "-".to_string());
805 let draft = r
806 .params
807 .draft_tokens
808 .map(|v| v.to_string())
809 .unwrap_or_else(|| "-".to_string());
810
811 md.push_str(&format!(
812 "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {:.2} | {:.2} | {:.2} | {:.2} |\n",
813 temp,
814 top_p,
815 top_k,
816 rep_pen,
817 fa,
818 threads,
819 batch,
820 exp,
821 spec,
822 draft,
823 r.metrics.prompt_tps,
824 r.metrics.generation_tps,
825 r.metrics.latency_per_token,
826 r.metrics.first_token_time
827 ));
828 }
829
830 tokio::fs::write(&filepath, md).await?;
831
832 let json_filename = format!("benchmark_{}.json", timestamp);
834 let json_filepath = output_dir.join(&json_filename);
835 let json_content = serde_json::to_string_pretty(&results)?;
836 tokio::fs::write(&json_filepath, json_content).await?;
837
838 let yaml_filename = format!("benchmark_{}.yaml", timestamp);
840 let yaml_filepath = output_dir.join(&yaml_filename);
841 let yaml_content = serde_yaml::to_string(&results)?;
842 tokio::fs::write(&yaml_filepath, yaml_content).await?;
843
844 let html_filename = format!("benchmark_{}.html", timestamp);
846 let html_filepath = output_dir.join(&html_filename);
847 let html_content = generate_html_report(results, config);
848 tokio::fs::write(&html_filepath, html_content).await?;
849
850 Ok(())
851}
852
853fn generate_html_report(results: &[BenchTuneResult], config: &BenchTuneConfig) -> String {
854 use chrono::Local;
855
856 let total_tests = results.len();
857 let timestamp = Local::now().format("%Y-%m-%d %H:%M:%S").to_string();
858
859 let model_info = results.first().and_then(|r| {
861 r.base_settings.as_ref().map(|s| {
862 let model_name = if config.model_path.file_name().is_some() {
863 config
864 .model_path
865 .file_name()
866 .unwrap()
867 .to_string_lossy()
868 .to_string()
869 } else {
870 config.model_path.display().to_string()
871 };
872 let file_size_mb = results
873 .first()
874 .and_then(|r| {
875 r.base_settings.as_ref().map(|_s| {
876 0u64
878 })
879 })
880 .unwrap_or(0);
881 (model_name, file_size_mb, s.clone())
882 })
883 });
884
885 struct ResolvedParams {
887 temperature: f64,
888 top_p: f64,
889 top_k: i64,
890 repeat_penalty: f64,
891 flash_attn: bool,
892 threads: u32,
893 batch_size: u32,
894 expert_count: i32,
895 spec_type: String,
896 draft_tokens: u32,
897 }
898
899 fn resolve_params(
900 params: &BenchTuneParamValue,
901 base: &crate::models::ModelSettings,
902 ) -> ResolvedParams {
903 ResolvedParams {
904 temperature: params.temperature.unwrap_or(base.temperature as f64),
905 top_p: params.top_p.unwrap_or(base.top_p as f64),
906 top_k: params.top_k.unwrap_or(base.top_k as i64),
907 repeat_penalty: params.repeat_penalty.unwrap_or(base.repeat_penalty as f64),
908 flash_attn: params.flash_attn.unwrap_or(base.flash_attn),
909 threads: params.threads.unwrap_or(base.threads),
910 batch_size: params.batch_size.unwrap_or(base.batch_size),
911 expert_count: params.expert_count.unwrap_or(base.expert_count),
912 spec_type: params
913 .spec_type
914 .clone()
915 .unwrap_or_else(|| base.spec_type.clone()),
916 draft_tokens: params.draft_tokens.unwrap_or(base.draft_tokens),
917 }
918 }
919
920 fn mean(vals: &[f64]) -> f64 {
922 if vals.is_empty() {
923 return 0.0;
924 }
925 vals.iter().sum::<f64>() / vals.len() as f64
926 }
927 fn median(vals: &mut [f64]) -> f64 {
928 if vals.is_empty() {
929 return 0.0;
930 }
931 vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
932 let mid = vals.len() / 2;
933 if vals.len().is_multiple_of(2) {
934 (vals[mid - 1] + vals[mid]) / 2.0
935 } else {
936 vals[mid]
937 }
938 }
939 fn std_dev(vals: &[f64], avg: f64) -> f64 {
940 if vals.len() <= 1 {
941 return 0.0;
942 }
943 let variance =
944 vals.iter().map(|v| (v - avg).powi(2)).sum::<f64>() / (vals.len() - 1) as f64;
945 variance.sqrt()
946 }
947 fn min_val(vals: &[f64]) -> f64 {
948 vals.iter().cloned().fold(f64::INFINITY, f64::min)
949 }
950 fn max_val(vals: &[f64]) -> f64 {
951 vals.iter().cloned().fold(f64::NEG_INFINITY, f64::max)
952 }
953
954 let gen_tps: Vec<f64> = results.iter().map(|r| r.metrics.generation_tps).collect();
955 let mut prompt_tps: Vec<f64> = results.iter().map(|r| r.metrics.prompt_tps).collect();
956 let latency: Vec<f64> = results
957 .iter()
958 .map(|r| r.metrics.latency_per_token)
959 .collect();
960 let mut first_token: Vec<f64> = results.iter().map(|r| r.metrics.first_token_time).collect();
961
962 let mut gen_tps_sorted = gen_tps.clone();
963 let mut latency_sorted = latency.clone();
964
965 let avg_gen_tps = mean(&gen_tps);
966 let avg_prompt_tps = mean(&prompt_tps);
967 let avg_latency = mean(&latency);
968 let avg_first_token = mean(&first_token);
969 let _avg_combined_tps = mean(
970 &results
971 .iter()
972 .map(|r| r.metrics.combined_tps)
973 .collect::<Vec<f64>>(),
974 );
975
976 let gen_std = std_dev(&gen_tps, avg_gen_tps);
977 let prompt_std = std_dev(&prompt_tps, avg_prompt_tps);
978 let lat_std = std_dev(&latency, avg_latency);
979 let ft_std = std_dev(&first_token, avg_first_token);
980
981 let best_idx = results
982 .iter()
983 .enumerate()
984 .max_by(|a, b| {
985 a.1.metrics
986 .generation_tps
987 .partial_cmp(&b.1.metrics.generation_tps)
988 .unwrap_or(std::cmp::Ordering::Equal)
989 })
990 .map(|(i, _)| i);
991 let best_gen_tps = if !gen_tps.is_empty() {
992 max_val(&gen_tps)
993 } else {
994 0.0
995 };
996 let best_prompt_tps = if !prompt_tps.is_empty() {
997 max_val(&prompt_tps)
998 } else {
999 0.0
1000 };
1001 let best_latency = if !latency.is_empty() {
1002 min_val(&latency)
1003 } else {
1004 0.0
1005 };
1006 let best_first_token = if !first_token.is_empty() {
1007 min_val(&first_token)
1008 } else {
1009 0.0
1010 };
1011 let min_gen_tps = min_val(&gen_tps);
1012 let min_prompt_tps = min_val(&prompt_tps);
1013 let min_latency = min_val(&latency);
1014 let min_first_token = min_val(&first_token);
1015
1016 let param_names = [("temperature", "Temperature"),
1018 ("top_p", "Top-P"),
1019 ("top_k", "Top-K"),
1020 ("repeat_penalty", "Repeat Penalty"),
1021 ("flash_attn", "Flash Attention"),
1022 ("threads", "Threads"),
1023 ("batch_size", "Batch Size"),
1024 ("expert_count", "Experts")];
1025
1026 let impact_data: Vec<(String, String, f64)> = param_names
1027 .iter()
1028 .filter_map(|(key, label)| {
1029 let values: Vec<f64> = results
1030 .iter()
1031 .filter_map(|r| {
1032 let base = r.base_settings.as_ref()?;
1033 let rp = resolve_params(&r.params, base);
1034 Some(match *key {
1035 "temperature" => rp.temperature,
1036 "top_p" => rp.top_p,
1037 "top_k" => rp.top_k as f64,
1038 "repeat_penalty" => rp.repeat_penalty,
1039 "flash_attn" => {
1040 if rp.flash_attn {
1041 1.0
1042 } else {
1043 0.0
1044 }
1045 }
1046 "threads" => rp.threads as f64,
1047 "batch_size" => rp.batch_size as f64,
1048 "expert_count" => rp.expert_count as f64,
1049 _ => return None,
1050 })
1051 })
1052 .collect();
1053
1054 let mut groups: std::collections::HashMap<String, Vec<f64>> =
1056 std::collections::HashMap::new();
1057 for (r, v) in results.iter().zip(values.iter()) {
1058 let key_str = if *key == "flash_attn" {
1059 if *v > 0.5 {
1060 "ON".to_string()
1061 } else {
1062 "OFF".to_string()
1063 }
1064 } else {
1065 format!("{:.2}", v)
1066 };
1067 groups
1068 .entry(key_str)
1069 .or_default()
1070 .push(r.metrics.generation_tps);
1071 }
1072
1073 if groups.len() <= 1 {
1074 return None;
1075 } let group_means: Vec<f64> = groups.values().map(|vals| mean(vals)).collect();
1078 let spread = max_val(&group_means) - min_val(&group_means);
1079 Some((label.to_string(), format!("{:.1}", spread), spread))
1080 })
1081 .collect();
1082
1083 let mut impact_sorted = impact_data.clone();
1085 impact_sorted.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
1086
1087 let consistency_data: Vec<f64> = results
1089 .iter()
1090 .map(|r| {
1091 if r.per_iteration_metrics.len() <= 1 {
1092 return 1.0; }
1094 let iter_gen_tps: Vec<f64> = r
1095 .per_iteration_metrics
1096 .iter()
1097 .map(|m| m.generation_tps)
1098 .collect();
1099 let iter_mean = mean(&iter_gen_tps);
1100 if iter_mean == 0.0 {
1101 return 1.0;
1102 }
1103 let iter_std = std_dev(&iter_gen_tps, iter_mean);
1104 let cv = iter_std / iter_mean; (1.0 - (cv * 5.0)).clamp(0.0, 1.0)
1108 })
1109 .collect();
1110
1111 let top_n = std::cmp::min(20, total_tests);
1113 let top_indices: Vec<(usize, usize)> = (0..total_tests)
1114 .map(|i| (i, results[i].metrics.generation_tps))
1115 .enumerate()
1116 .take(top_n)
1117 .map(|(rank, (idx, _))| (rank + 1, idx))
1118 .collect();
1119
1120 let top_labels: Vec<String> = top_indices
1121 .iter()
1122 .map(|(_rank, idx)| {
1123 let base = results[*idx].base_settings.as_ref().unwrap();
1124 let rp = resolve_params(&results[*idx].params, base);
1125 format!("T={:.2} TP={:.2}", rp.temperature, rp.top_p)
1126 })
1127 .collect();
1128 let top_gen_tps: Vec<f64> = top_indices
1129 .iter()
1130 .map(|(_, idx)| results[*idx].metrics.generation_tps)
1131 .collect();
1132
1133 let scatter_gen_tps: Vec<f64> = results.iter().map(|r| r.metrics.generation_tps).collect();
1135 let scatter_latency: Vec<f64> = results
1136 .iter()
1137 .map(|r| r.metrics.latency_per_token)
1138 .collect();
1139 let scatter_first_token: Vec<f64> =
1140 results.iter().map(|r| r.metrics.first_token_time).collect();
1141
1142 let param_headers: Vec<String> = vec![
1143 "Temp".to_string(),
1144 "Top-P".to_string(),
1145 "Top-K".to_string(),
1146 "RepPen".to_string(),
1147 "FA".to_string(),
1148 "Threads".to_string(),
1149 "Batch".to_string(),
1150 "Exp".to_string(),
1151 "Spec".to_string(),
1152 "Draft".to_string(),
1153 ];
1154 let param_vals: Vec<Vec<String>> = results
1155 .iter()
1156 .map(|r| {
1157 let base = r.base_settings.as_ref().unwrap();
1158 let rp = resolve_params(&r.params, base);
1159 vec![
1160 format!("{:.2}", rp.temperature),
1161 format!("{:.2}", rp.top_p),
1162 rp.top_k.to_string(),
1163 format!("{:.2}", rp.repeat_penalty),
1164 if rp.flash_attn {
1165 "ON".to_string()
1166 } else {
1167 "OFF".to_string()
1168 },
1169 rp.threads.to_string(),
1170 rp.batch_size.to_string(),
1171 rp.expert_count.to_string(),
1172 if rp.spec_type.is_empty() {
1173 "-".to_string()
1174 } else {
1175 rp.spec_type.clone()
1176 },
1177 rp.draft_tokens.to_string(),
1178 ]
1179 })
1180 .collect();
1181
1182 let metrics_data: Vec<serde_json::Value> = results
1184 .iter()
1185 .enumerate()
1186 .map(|(i, r)| {
1187 let base = r.base_settings.as_ref().unwrap();
1188 let rp = resolve_params(&r.params, base);
1189 serde_json::json!({
1190 "idx": i,
1191 "temp": rp.temperature,
1192 "top_p": rp.top_p,
1193 "top_k": rp.top_k,
1194 "repeat_penalty": rp.repeat_penalty,
1195 "flash_attn": rp.flash_attn,
1196 "threads": rp.threads,
1197 "batch_size": rp.batch_size,
1198 "expert_count": rp.expert_count,
1199 "spec_type": rp.spec_type,
1200 "draft_tokens": rp.draft_tokens,
1201 "prompt_tps": r.metrics.prompt_tps,
1202 "generation_tps": r.metrics.generation_tps,
1203 "combined_tps": r.metrics.combined_tps,
1204 "latency_per_token": r.metrics.latency_per_token,
1205 "first_token_time": r.metrics.first_token_time,
1206 "consistency": consistency_data[i],
1207 "outputs": r.outputs,
1208 "per_iteration_metrics": r.per_iteration_metrics.iter().map(|m| {
1209 serde_json::json!({
1210 "prompt_tps": m.prompt_tps,
1211 "generation_tps": m.generation_tps,
1212 "combined_tps": m.combined_tps,
1213 "latency_per_token": m.latency_per_token,
1214 "first_token_time": m.first_token_time,
1215 })
1216 }).collect::<Vec<_>>(),
1217 })
1218 })
1219 .collect();
1220
1221 let scatter_data_json = serde_json::to_string(
1223 &scatter_gen_tps
1224 .iter()
1225 .zip(scatter_latency.iter())
1226 .zip(scatter_first_token.iter())
1227 .map(|((g, l), f)| {
1228 let mut s = String::from("{x:");
1229 s.push_str(&format!("{:.2}", g));
1230 s.push_str(",y:");
1231 s.push_str(&format!("{:.2}", l));
1232 s.push_str(",ft:");
1233 s.push_str(&format!("{:.2}", f));
1234 s.push('}');
1235 s
1236 })
1237 .collect::<Vec<_>>(),
1238 )
1239 .unwrap();
1240 let scatter_data2_json = serde_json::to_string(
1241 &scatter_gen_tps
1242 .iter()
1243 .zip(scatter_first_token.iter())
1244 .map(|(g, f)| {
1245 let mut s = String::from("{x:");
1246 s.push_str(&format!("{:.2}", g));
1247 s.push_str(",y:");
1248 s.push_str(&format!("{:.2}", f));
1249 s.push_str(",lat:");
1250 s.push_str(&format!("{:.2}", min_val(&latency)));
1251 s.push('}');
1252 s
1253 })
1254 .collect::<Vec<_>>(),
1255 )
1256 .unwrap();
1257
1258 let model_meta_json = model_info.as_ref().map(|(name, _size, settings)| {
1260 serde_json::json!({
1261 "model_name": name,
1262 "context_length": settings.context_length,
1263 "threads": settings.threads,
1264 "temperature": settings.temperature,
1265 "top_p": settings.top_p,
1266 "top_k": settings.top_k,
1267 "repeat_penalty": settings.repeat_penalty,
1268 "flash_attn": settings.flash_attn,
1269 "kv_cache_offload": settings.kv_cache_offload,
1270 "mlock": settings.mlock,
1271 "system_prompt": settings.system_prompt,
1272 })
1273 });
1274
1275 let _impact_json = serde_json::to_string(&impact_sorted).unwrap();
1277
1278 let column_defs_json = serde_json::to_string(&vec![
1280 ("col-rank", "#", true),
1281 ("col-temp", "Temp", true),
1282 ("col-top-p", "Top-P", true),
1283 ("col-top-k", "Top-K", true),
1284 ("col-rep-pen", "RepPen", true),
1285 ("col-fa", "FA", true),
1286 ("col-threads", "Threads", true),
1287 ("col-batch", "Batch", true),
1288 ("col-exp", "Exp", true),
1289 ("col-spec", "Spec", true),
1290 ("col-draft", "Draft", true),
1291 ("col-gen-tps", "Gen t/s", true),
1292 ("col-prompt-tps", "Prompt t/s", true),
1293 ("col-latency", "Latency", true),
1294 ("col-first-token", "First Tok", true),
1295 ("col-combined", "Combined", true),
1296 ("col-consistency", "Consistency", true),
1297 ])
1298 .unwrap();
1299
1300 let csv_header = "Rank,Temp,Top-P,Top-K,RepPen,FA,Threads,Batch,Exp,Spec,Draft,Gen t/s,Prompt t/s,Latency (ms),First Tok (ms),Combined,Consistency";
1302 let csv_rows: Vec<String> = (0..total_tests)
1303 .map(|i| {
1304 let d = &metrics_data[i];
1305 let rank = i + 1;
1306 let spec = d
1307 .get("spec_type")
1308 .map(|v| v.as_str().unwrap_or("-"))
1309 .unwrap_or("-")
1310 .to_string();
1311 let draft = d
1312 .get("draft_tokens")
1313 .map(|v| v.as_u64().unwrap_or(0).to_string())
1314 .unwrap_or("-".to_string());
1315 format!(
1316 "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{:.1}",
1317 rank,
1318 d["temp"].as_f64().unwrap_or(0.0),
1319 d["top_p"].as_f64().unwrap_or(0.0),
1320 d["top_k"].as_i64().unwrap_or(0),
1321 d["repeat_penalty"].as_f64().unwrap_or(0.0),
1322 if d["flash_attn"].as_bool().unwrap_or(false) {
1323 "ON"
1324 } else {
1325 "OFF"
1326 },
1327 d["threads"].as_u64().unwrap_or(0),
1328 d["batch_size"].as_u64().unwrap_or(0),
1329 d["expert_count"].as_i64().unwrap_or(0),
1330 spec,
1331 draft,
1332 d["generation_tps"].as_f64().unwrap_or(0.0),
1333 d["prompt_tps"].as_f64().unwrap_or(0.0),
1334 d["latency_per_token"].as_f64().unwrap_or(0.0),
1335 d["first_token_time"].as_f64().unwrap_or(0.0),
1336 d["combined_tps"].as_f64().unwrap_or(0.0),
1337 d["consistency"].as_f64().unwrap_or(1.0)
1338 )
1339 })
1340 .collect();
1341 let csv_content = format!("{}\n{}", csv_header, csv_rows.join("\n"));
1342 let csv_b64 = base64_encode(&csv_content);
1343
1344 let metrics_json = serde_json::to_string(&metrics_data).unwrap();
1345 let param_headers_json = serde_json::to_string(¶m_headers).unwrap();
1346 let param_vals_json = serde_json::to_string(¶m_vals).unwrap();
1347 let top_labels_json = serde_json::to_string(&top_labels).unwrap();
1348 let top_gen_tps_json = serde_json::to_string(&top_gen_tps).unwrap();
1349
1350 let model_meta_html = model_info
1352 .as_ref()
1353 .map(|(name, _size, s)| {
1354 format!(
1355 r#"
1356<div class="meta-section">
1357<h2>Model & Configuration</h2>
1358<div class="meta-grid">
1359<div class="meta-item"><div class="ml">Model</div><div class="mv">{}</div></div>
1360<div class="meta-item"><div class="ml">Context</div><div class="mv">{}</div></div>
1361<div class="meta-item"><div class="ml">Threads</div><div class="mv">{}</div></div>
1362<div class="meta-item"><div class="ml">Flash Attention</div><div class="mv">{}</div></div>
1363<div class="meta-item"><div class="ml">KV Cache Offload</div><div class="mv">{}</div></div>
1364<div class="meta-item"><div class="ml">MLOCK</div><div class="mv">{}</div></div>
1365<div class="meta-item"><div class="ml">Prompt</div><div class="mv meta-prompt">{}</div></div>
1366</div>
1367</div>"#,
1368 escape_html(name),
1369 s.context_length,
1370 s.threads,
1371 if s.flash_attn { "ON" } else { "OFF" },
1372 if s.kv_cache_offload { "ON" } else { "OFF" },
1373 if s.mlock { "ON" } else { "OFF" },
1374 escape_html(&s.system_prompt.chars().take(100).collect::<String>())
1375 )
1376 })
1377 .unwrap_or_default();
1378
1379 let winner_html = best_idx.and_then(|idx| {
1381 let r = &results[idx];
1382 let base = r.base_settings.as_ref()?;
1383 let rp = resolve_params(&r.params, base);
1384 let m = &r.metrics;
1385 Some(format!(r#"
1386<div class="winner-section">
1387<div class="winner-icon">🏆</div>
1388<div class="winner-content">
1389<div class="winner-title">Best Configuration</div>
1390<div class="winner-metrics">
1391<div class="winner-metric"><span class="wm-label">Gen t/s</span><span class="wm-value" style="color:#3fb950;font-size:1.8em;">{:.2}</span></div>
1392<div class="winner-metric"><span class="wm-label">Prompt t/s</span><span class="wm-value">{:.2}</span></div>
1393<div class="winner-metric"><span class="wm-label">Latency</span><span class="wm-value">{:.2}ms</span></div>
1394<div class="winner-metric"><span class="wm-label">First Token</span><span class="wm-value">{:.0}ms</span></div>
1395</div>
1396<div class="winner-params">Temp: {:.2} · Top-P: {:.2} · Top-K: {} · RepPen: {:.2} · FA: {} · Threads: {} · Batch: {} · Exp: {} · Spec: {} · Draft: {}</div>
1397</div>
1398</div>"#,
1399 m.generation_tps, m.prompt_tps, m.latency_per_token, m.first_token_time,
1400 rp.temperature, rp.top_p, rp.top_k, rp.repeat_penalty,
1401 if rp.flash_attn { "ON" } else { "OFF" }, rp.threads,
1402 rp.batch_size, rp.expert_count,
1403 if rp.spec_type.is_empty() { "Off".to_string() } else { rp.spec_type.clone() }, rp.draft_tokens
1404 ))
1405 }).unwrap_or_default();
1406
1407 let impact_html = if !impact_sorted.is_empty() {
1409 let max_impact = impact_sorted[0].2;
1410 let rows: String = impact_sorted
1411 .iter()
1412 .map(|(label, spread, value)| {
1413 let bar_width = if max_impact > 0.0 {
1414 (value / max_impact * 100.0) as i32
1415 } else {
1416 0
1417 };
1418 let bar_color = if *value > max_impact * 0.7 {
1419 "#f85149"
1420 } else if *value > max_impact * 0.4 {
1421 "#d29922"
1422 } else {
1423 "#3fb950"
1424 };
1425 format!(
1426 r#"<div class="impact-row">
1427<div class="impact-label">{}</div>
1428<div class="impact-bar-bg"><div class="impact-bar-fill" style="width:{}%;background:{}"></div></div>
1429<div class="impact-value">{}</div>
1430</div>"#,
1431 label, bar_width, bar_color, spread
1432 )
1433 })
1434 .collect();
1435 format!(
1436 r#"
1437<div class="impact-section">
1438<h2>Parameter Impact Analysis</h2>
1439<p class="impact-desc">Larger spread in generation throughput between parameter values indicates greater impact on performance.</p>
1440{}
1441</div>"#,
1442 rows
1443 )
1444 } else {
1445 r#"<div class="impact-section"><h2>Parameter Impact Analysis</h2><p class="impact-desc">All parameters were held constant — no impact data available.</p></div>"#.to_string()
1446 };
1447
1448 let empty_html = if total_tests == 0 {
1450 r#"<div class="empty-state">
1451<div class="empty-icon">📊</div>
1452<div class="empty-title">No Results</div>
1453<div class="empty-text">Run a benchmark tuning test to generate results here.</div>
1454</div>"#
1455 } else {
1456 ""
1457 };
1458
1459 let html = include_str!("benchmark_report.html");
1460
1461 html.replace("__TIMESTAMP__", ×tamp)
1463 .replace("__TOTAL_TESTS__", &total_tests.to_string())
1464 .replace("__EMPTY_STATE__", empty_html)
1465 .replace("__MODEL_META__", &model_meta_html)
1466 .replace("__WINNER__", &winner_html)
1467 .replace("__AVG_GEN_TPS__", &format!("{:.1}", avg_gen_tps))
1468 .replace(
1469 "__MED_GEN_TPS__",
1470 &format!("{:.1}", median(&mut gen_tps_sorted)),
1471 )
1472 .replace("__GEN_STD__", &format!("{:.1}", gen_std))
1473 .replace("__MIN_GEN__", &format!("{:.1}", min_gen_tps))
1474 .replace("__MAX_GEN__", &format!("{:.1}", best_gen_tps))
1475 .replace("__AVG_PROMPT_TPS__", &format!("{:.1}", avg_prompt_tps))
1476 .replace(
1477 "__MED_PROMPT_TPS__",
1478 &format!("{:.1}", median(&mut prompt_tps)),
1479 )
1480 .replace("__PROMPT_STD__", &format!("{:.1}", prompt_std))
1481 .replace("__MIN_PROMPT__", &format!("{:.1}", min_prompt_tps))
1482 .replace("__MAX_PROMPT__", &format!("{:.1}", best_prompt_tps))
1483 .replace("__AVG_LATENCY__", &format!("{:.1}ms", avg_latency))
1484 .replace(
1485 "__MED_LATENCY__",
1486 &format!("{:.1}ms", median(&mut latency_sorted)),
1487 )
1488 .replace("__LAT_STD__", &format!("{:.1}", lat_std))
1489 .replace("__MIN_LAT__", &format!("{:.1}", min_latency))
1490 .replace("__MAX_LAT__", &format!("{:.1}", best_latency))
1491 .replace("__AVG_FT__", &format!("{:.0}ms", avg_first_token))
1492 .replace("__MED_FT__", &format!("{:.0}ms", median(&mut first_token)))
1493 .replace("__FT_STD__", &format!("{:.0}", ft_std))
1494 .replace("__MIN_FT__", &format!("{:.0}ms", min_first_token))
1495 .replace("__MAX_FT__", &format!("{:.0}ms", best_first_token))
1496 .replace("__BEST_GEN__", &format!("{:.1}", best_gen_tps))
1497 .replace("__TOP_N__", &top_n.to_string())
1498 .replace("__IMPACT_HTML__", &impact_html)
1499 .replace("__METRICS_JSON__", &metrics_json)
1500 .replace("__PARAM_HEADERS_JSON__", ¶m_headers_json)
1501 .replace("__PARAM_VALS_JSON__", ¶m_vals_json)
1502 .replace("__TOP_LABELS_JSON__", &top_labels_json)
1503 .replace("__TOP_GEN_TPS_JSON__", &top_gen_tps_json)
1504 .replace("__SCATTER_DATA_JSON__", &scatter_data_json)
1505 .replace("__SCATTER_DATA2_JSON__", &scatter_data2_json)
1506 .replace("__COLUMN_DEFS_JSON__", &column_defs_json)
1507 .replace("__CSV_B64__", &csv_b64)
1508 .replace(
1509 "__MODEL_META_JSON__",
1510 &serde_json::to_string(&model_meta_json).unwrap(),
1511 )
1512}
1513
/// Escapes the characters that are significant in HTML markup so that `s`
/// can be embedded as element text or inside a double-quoted attribute value.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and `&quot;`
/// respectively; every other character is copied through unchanged.
///
/// Single quotes are not escaped, so the result must not be placed inside a
/// single-quoted attribute.
fn escape_html(s: &str) -> String {
    // One pass over the input: each source character is examined exactly once,
    // so the `&` that introduces an emitted entity is never escaped again.
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1521
/// Encodes the UTF-8 bytes of `input` as Base64 using the standard alphabet
/// (`A-Z`, `a-z`, `0-9`, `+`, `/`) and `=` padding.
///
/// An empty input yields an empty string.
fn base64_encode(input: &str) -> String {
    const ALPHABET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    let mut encoded = String::new();
    for group in input.as_bytes().chunks(3) {
        // Pack the 1..=3 bytes of this group into the top of a 24-bit word;
        // positions without a byte stay zero.
        let word = group
            .iter()
            .enumerate()
            .fold(0u32, |acc, (pos, &byte)| acc | (u32::from(byte) << (16 - 8 * pos)));

        // A group of k bytes produces k + 1 data characters; the remaining
        // output slots of the 4-character quantum are padding.
        for slot in 0..4usize {
            if slot <= group.len() {
                let shift = 18 - 6 * slot;
                encoded.push(ALPHABET[((word >> shift) & 0x3F) as usize] as char);
            } else {
                encoded.push('=');
            }
        }
    }
    encoded
}
1547
/// Token counts, timings and text gathered from an inference run.
///
/// NOTE(review): presumably one value per benchmark request, folded into
/// `BenchAccumulator` by the caller — confirm against the code that builds it.
struct InferenceResult {
    /// Number of prompt tokens reported for the run.
    prompt_tokens: u64,
    /// Number of generated tokens reported for the run.
    generation_tokens: u64,
    /// Time attributed to prompt processing.
    prompt_time: Duration,
    /// Time attributed to token generation.
    generation_time: Duration,
    /// Overall elapsed time for the run.
    total_time: Duration,
    /// Time until the first token.
    /// NOTE(review): unit looks like milliseconds (the report labels the
    /// derived metric "ms") — confirm where this is measured.
    first_token_time: u128,
    /// Text content produced by the run.
    content: String,
}