1use std::path::PathBuf;
2use std::time::{Duration, Instant};
3
4use tokio::sync::{mpsc, watch};
5
6use crate::backend::server::{spawn_server, SpawnServerRequest};
7use crate::models::{
8 BenchTuneConfig, BenchTuneMetrics, BenchTuneMode, BenchTuneParamValue, BenchTuneResult,
9 BenchTuneStatus, DiscoveredModel, ModelSettings, ServerMode,
10};
11
/// Upper bound on the number of health-check polls made while waiting for a spawned server.
const HEALTH_CHECK_ITERATIONS: u32 = 120;
/// Pause between two consecutive health-check polls, in milliseconds.
const HEALTH_CHECK_INTERVAL_MS: u64 = 500;
/// A "still waiting" log line is emitted once every this many polls (never on the first poll).
const HEALTH_CHECK_LOG_INTERVAL: u32 = 10;
/// Timeout, in seconds, configured on the HTTP client used for inference requests.
const REQUEST_TIMEOUT_SECS: u64 = 120;
17
/// Raw totals collected over all iterations of one benchmark test.
///
/// Consumed by `build_bench_result`, which turns the totals into averaged metrics.
struct BenchAccumulator {
    /// Parameter combination the totals were measured for.
    params: BenchTuneParamValue,
    /// Sum of prompt tokens over all successful iterations.
    total_prompt_tokens: u64,
    /// Sum of generated tokens over all successful iterations.
    total_generation_tokens: u64,
    /// Sum of the prompt-processing times reported for each iteration.
    total_prompt_time: Duration,
    /// Sum of the generation times reported for each iteration.
    total_generation_time: Duration,
    /// Sum of the client-side measured request times.
    total_time: Duration,
    /// One first-token time per successful iteration; averaged in the final result.
    first_token_times: Vec<u128>,
    /// Generated text of each successful iteration, in order.
    outputs: Vec<String>,
    /// Metrics computed individually for each successful iteration.
    per_iteration_metrics: Vec<BenchTuneMetrics>,
    /// Settings to attach to the result; passed through unchanged.
    base_settings: Option<ModelSettings>,
}
30
31fn build_bench_result(acc: BenchAccumulator) -> BenchTuneResult {
32 let BenchAccumulator {
33 params,
34 total_prompt_tokens,
35 total_generation_tokens,
36 total_prompt_time,
37 total_generation_time,
38 total_time,
39 first_token_times,
40 outputs,
41 per_iteration_metrics,
42 base_settings,
43 } = acc;
44 let prompt_tps = if total_prompt_time.as_secs_f64() > 0.0 {
45 (total_prompt_tokens as f64) / total_prompt_time.as_secs_f64()
46 } else {
47 0.0
48 };
49
50 let generation_tps = if total_generation_time.as_secs_f64() > 0.0 {
51 (total_generation_tokens as f64) / total_generation_time.as_secs_f64()
52 } else {
53 0.0
54 };
55
56 let combined_tps = if total_time.as_secs_f64() > 0.0 {
57 ((total_prompt_tokens + total_generation_tokens) as f64) / total_time.as_secs_f64()
58 } else {
59 0.0
60 };
61
62 let avg_latency_per_token = if total_generation_tokens > 0 {
63 total_generation_time.as_millis() as f64 / (total_generation_tokens as f64)
64 } else {
65 0.0
66 };
67
68 let avg_first_token_time = if !first_token_times.is_empty() {
69 first_token_times.iter().sum::<u128>() as f64 / first_token_times.len() as f64
70 } else {
71 0.0
72 };
73
74 BenchTuneResult {
75 params,
76 metrics: BenchTuneMetrics {
77 prompt_tps,
78 generation_tps,
79 combined_tps,
80 latency_per_token: avg_latency_per_token,
81 first_token_time: avg_first_token_time,
82 },
83 outputs,
84 per_iteration_metrics,
85 base_settings,
86 server_command: None,
87 }
88}
89
/// Everything `run_bench_tune` needs to execute a benchmark run.
pub struct BenchTuneRequest<'a> {
    /// Application configuration, forwarded to the server spawner.
    pub main_config: &'a crate::config::Config,
    /// Benchmark configuration: mode, parameter combinations, prompt, iteration count, timeouts.
    pub config: &'a BenchTuneConfig,
    /// Model the benchmark servers are started with.
    pub model: &'a DiscoveredModel,
    /// Base model settings; cloned before use, so the caller's value is never modified.
    pub settings: &'a ModelSettings,
    /// Receives `BenchTuneStatus` updates (running, cancelled, completed, ...).
    pub progress_tx: mpsc::Sender<BenchTuneStatus>,
    /// Receives human-readable log lines.
    pub log_tx: mpsc::Sender<String>,
    /// Cancellation flag; the run stops at the next check once it reads `true`.
    pub cancel_rx: &'a mut watch::Receiver<bool>,
}
99
100pub async fn run_bench_tune(
102 req: BenchTuneRequest<'_>,
103) -> Result<Vec<BenchTuneResult>, Box<dyn std::error::Error + Send + Sync>> {
104 let BenchTuneRequest {
105 main_config,
106 config,
107 model,
108 settings,
109 progress_tx,
110 log_tx,
111 cancel_rx,
112 } = req;
113 let start_time = Instant::now();
114 let total_tests = config.get_total_tests_count();
115
116 if total_tests > 500 {
118 let _ = log_tx
119 .send(format!(
120 "WARNING: Benchmark will run {} combinations. This may take a long time.",
121 total_tests
122 ))
123 .await;
124 }
125
126 let combinations = config.generate_combinations();
128
129 let mut results = Vec::new();
131 let mut failed_tests: Vec<(usize, String)> = Vec::new();
132
133 let mut settings = settings.clone();
135 if let Some(kwargs) = &config.chat_template_kwargs {
136 settings.chat_template_kwargs = Some(kwargs.clone());
137 }
138
139 let client = reqwest::Client::builder()
141 .timeout(Duration::from_secs(REQUEST_TIMEOUT_SECS))
142 .build()?;
143
144 if config.bench_mode == BenchTuneMode::RuntimeOnly {
146 let (exit_tx, _exit_rx) = tokio::sync::mpsc::channel(1);
148 let (server_handle, server_command) = spawn_server(SpawnServerRequest {
149 config: main_config,
150 model: Some(model),
151 settings: &settings,
152 log_tx: log_tx.clone(),
153 progress_tx: None,
154 server_mode: ServerMode::Normal,
155 router_max_models: 1,
156 exit_tx,
157 })
158 .await?;
159
160 let host = if server_handle.host == "0.0.0.0" {
161 "127.0.0.1"
162 } else {
163 &server_handle.host
164 };
165
166 for i in 0..HEALTH_CHECK_ITERATIONS {
168 if *cancel_rx.borrow() {
169 let _ = crate::backend::server::kill_server(server_handle).await;
170 let elapsed = start_time.elapsed();
171 progress_tx
172 .send(BenchTuneStatus::Cancelled {
173 total_tests,
174 successful_tests: results.len(),
175 failed_tests: failed_tests.len(),
176 elapsed,
177 })
178 .await?;
179 return Ok(results);
180 }
181 if crate::backend::server::check_health(host, server_handle.port).await {
182 break;
183 }
184 if i % HEALTH_CHECK_LOG_INTERVAL == 0 && i > 0 {
185 let _ = log_tx
186 .send(format!(
187 " ... still waiting ({:.0}s)...",
188 i as f32 * (HEALTH_CHECK_INTERVAL_MS as f32 / 1000.0)
189 ))
190 .await;
191 }
192 tokio::time::sleep(Duration::from_millis(HEALTH_CHECK_INTERVAL_MS)).await;
193 }
194
195 let server_port = server_handle.port;
196 let server_host = host.to_string();
197
198 for (idx, combination) in combinations.iter().enumerate() {
199 if *cancel_rx.borrow() {
201 let _ = crate::backend::server::kill_server(server_handle).await;
202 let elapsed = start_time.elapsed();
203 progress_tx
204 .send(BenchTuneStatus::Cancelled {
205 total_tests,
206 successful_tests: results.len(),
207 failed_tests: failed_tests.len(),
208 elapsed,
209 })
210 .await?;
211 return Ok(results);
212 }
213
214 let progress = (idx as f32 / total_tests as f32) * 100.0;
215 progress_tx
216 .send(BenchTuneStatus::Running {
217 current: idx + 1,
218 total: total_tests,
219 progress,
220 current_params: combination.clone(),
221 })
222 .await?;
223
224 let result = run_bench_tune_runtime_only(RuntimeOnlyCtx {
225 params: combination,
226 settings: &settings,
227 num_iterations: config.num_iterations,
228 prompt: config.prompt.clone(),
229 server_host: &server_host,
230 server_port,
231 log_tx: log_tx.clone(),
232 config,
233 client: &client,
234 server_command: &server_command,
235 })
236 .await;
237
238 match result {
239 Ok(test_result) => results.push(test_result),
240 Err(e) => {
241 failed_tests.push((idx + 1, e.to_string()));
242 let _ = log_tx
243 .send(format!(
244 "Benchmark test {}/{} failed: {}",
245 idx + 1,
246 total_tests,
247 e
248 ))
249 .await;
250 }
251 }
252 }
253
254 let _ = crate::backend::server::kill_server(server_handle).await;
255 } else {
256 for (idx, combination) in combinations.iter().enumerate() {
258 if *cancel_rx.borrow() {
260 let elapsed = start_time.elapsed();
261 progress_tx
262 .send(BenchTuneStatus::Cancelled {
263 total_tests,
264 successful_tests: results.len(),
265 failed_tests: failed_tests.len(),
266 elapsed,
267 })
268 .await?;
269 return Ok(results);
270 }
271
272 let progress = (idx as f32 / total_tests as f32) * 100.0;
273 progress_tx
274 .send(BenchTuneStatus::Running {
275 current: idx + 1,
276 total: total_tests,
277 progress,
278 current_params: combination.clone(),
279 })
280 .await?;
281
282 let result = run_bench_tune_single_test(SingleTestCtx {
283 main_config,
284 params: combination,
285 model,
286 base_settings: &settings,
287 num_iterations: config.num_iterations,
288 prompt: config.prompt.clone(),
289 log_tx: log_tx.clone(),
290 config,
291 client: &client,
292 })
293 .await;
294
295 match result {
296 Ok(test_result) => results.push(test_result),
297 Err(e) => {
298 failed_tests.push((idx + 1, e.to_string()));
299 let _ = log_tx
300 .send(format!(
301 "Benchmark test {}/{} failed: {}",
302 idx + 1,
303 total_tests,
304 e
305 ))
306 .await;
307 }
308 }
309 }
310 }
311
312 results.sort_by(|a, b| {
314 b.metrics
315 .combined_tps
316 .partial_cmp(&a.metrics.combined_tps)
317 .unwrap_or(std::cmp::Ordering::Equal)
318 });
319
320 let elapsed = start_time.elapsed();
321 let successful_tests = results.len();
322 let failed_count = failed_tests.len();
323
324 if failed_count > 0 {
326 progress_tx
327 .send(BenchTuneStatus::PartiallyCompleted {
328 total_tests,
329 successful_tests,
330 failed_tests: failed_count,
331 elapsed,
332 })
333 .await?;
334 } else {
335 progress_tx
336 .send(BenchTuneStatus::Completed {
337 total_tests,
338 successful_tests,
339 elapsed,
340 })
341 .await?;
342 }
343
344 Ok(results)
345}
346
/// Inputs for `run_iteration_loop`: where to send requests and how many to send.
struct IterationLoopCtx<'a> {
    /// Prompt text sent with every inference request.
    prompt: &'a str,
    /// Host of the server to benchmark.
    host: &'a str,
    /// Port of the server to benchmark.
    port: u16,
    /// Parameter combination under test; cloned into the final result.
    params: &'a BenchTuneParamValue,
    /// Number of inference requests to issue.
    num_iterations: u32,
    /// Benchmark configuration, forwarded to each inference request.
    config: &'a BenchTuneConfig,
    /// Shared HTTP client.
    client: &'a reqwest::Client,
    /// Receives progress and output log lines.
    log_tx: mpsc::Sender<String>,
    /// Text appended to the "Running N inference iterations" log line.
    log_prefix: &'a str,
}
359
360async fn run_iteration_loop(
362 ctx: IterationLoopCtx<'_>,
363) -> Result<BenchTuneResult, Box<dyn std::error::Error + Send + Sync>> {
364 let IterationLoopCtx {
365 prompt,
366 host,
367 port,
368 params,
369 num_iterations,
370 config,
371 client,
372 log_tx,
373 log_prefix,
374 } = ctx;
375 let mut total_prompt_tokens = 0u64;
376 let mut total_generation_tokens = 0u64;
377 let mut total_prompt_time = Duration::ZERO;
378 let mut total_generation_time = Duration::ZERO;
379 let mut total_time = Duration::ZERO;
380 let mut first_token_times = Vec::new();
381 let mut outputs = Vec::new();
382 let mut per_iteration_metrics = Vec::new();
383
384 let _ = log_tx
385 .send(format!(
386 "Running {} inference iterations {}...",
387 num_iterations, log_prefix
388 ))
389 .await;
390
391 for i in 0..num_iterations {
392 let result = send_inference_request(prompt, host, port, params, config, client).await;
393
394 match result {
395 Ok(res) => {
396 total_prompt_tokens += res.prompt_tokens;
397 total_generation_tokens += res.generation_tokens;
398 total_prompt_time += res.prompt_time;
399 total_generation_time += res.generation_time;
400 total_time += res.total_time;
401 first_token_times.push(res.first_token_time);
402 outputs.push(res.content.clone());
403
404 let iter_prompt_tps = if res.prompt_time.as_secs_f64() > 0.0 {
405 res.prompt_tokens as f64 / res.prompt_time.as_secs_f64()
406 } else {
407 0.0
408 };
409 let iter_gen_tps = if res.generation_time.as_secs_f64() > 0.0 {
410 res.generation_tokens as f64 / res.generation_time.as_secs_f64()
411 } else {
412 0.0
413 };
414 let iter_combined_tps = if res.total_time.as_secs_f64() > 0.0 {
415 ((res.prompt_tokens + res.generation_tokens) as f64) / res.total_time.as_secs_f64()
416 } else {
417 0.0
418 };
419 let iter_latency = if res.generation_tokens > 0 {
420 res.generation_time.as_millis() as f64 / res.generation_tokens as f64
421 } else {
422 0.0
423 };
424
425 per_iteration_metrics.push(BenchTuneMetrics {
426 prompt_tps: iter_prompt_tps,
427 generation_tps: iter_gen_tps,
428 combined_tps: iter_combined_tps,
429 latency_per_token: iter_latency,
430 first_token_time: res.first_token_time as f64,
431 });
432
433 if num_iterations > 1 {
434 let _ = log_tx
435 .send(format!(
436 " Iteration {}/{}: {:.2} gen t/s",
437 i + 1,
438 num_iterations,
439 iter_gen_tps
440 ))
441 .await;
442 }
443
444 let _ = log_tx
445 .send(format!(
446 "--- Generated Output (Iter {}) ---\n{}\n----------------------------------",
447 i + 1,
448 res.content
449 ))
450 .await;
451 }
452 Err(e) => {
453 let _ = log_tx
454 .send(format!(
455 " Iteration {}/{} FAILED: {}",
456 i + 1,
457 num_iterations,
458 e
459 ))
460 .await;
461 if i == 0 {
462 return Err(format!("Inference failed: {}", e).into());
463 }
464 }
465 }
466 }
467
468 Ok(build_bench_result(BenchAccumulator {
469 params: params.clone(),
470 total_prompt_tokens,
471 total_generation_tokens,
472 total_prompt_time,
473 total_generation_time,
474 total_time,
475 first_token_times,
476 outputs,
477 per_iteration_metrics,
478 base_settings: None,
479 }))
480}
481
/// Inputs for `run_bench_tune_runtime_only`: one test against an already running server.
struct RuntimeOnlyCtx<'a> {
    /// Parameter combination under test.
    params: &'a BenchTuneParamValue,
    /// Settings the shared server was started with; copied into the result.
    settings: &'a ModelSettings,
    /// Number of inference requests to issue.
    num_iterations: u32,
    /// Prompt text sent with every request.
    prompt: String,
    /// Host of the running server.
    server_host: &'a str,
    /// Port of the running server.
    server_port: u16,
    /// Receives log lines.
    log_tx: mpsc::Sender<String>,
    /// Benchmark configuration (supplies the per-test timeout).
    config: &'a BenchTuneConfig,
    /// Shared HTTP client.
    client: &'a reqwest::Client,
    /// Command line of the running server; copied into the result.
    server_command: &'a str,
}
494
495async fn run_bench_tune_runtime_only(
497 ctx: RuntimeOnlyCtx<'_>,
498) -> Result<BenchTuneResult, Box<dyn std::error::Error + Send + Sync>> {
499 let RuntimeOnlyCtx {
500 params,
501 settings,
502 num_iterations,
503 prompt,
504 server_host,
505 server_port,
506 log_tx,
507 config,
508 client,
509 server_command,
510 } = ctx;
511 let loop_fut = run_iteration_loop(IterationLoopCtx {
512 prompt: &prompt,
513 host: server_host,
514 port: server_port,
515 params,
516 num_iterations,
517 config,
518 client,
519 log_tx,
520 log_prefix: "(runtime-only mode)",
521 });
522 let result = tokio::time::timeout(config.test_timeout, loop_fut).await;
523 let result = match result {
524 Ok(inner) => inner,
525 Err(_) => return Err(format!("Test timed out after {:?}", config.test_timeout).into()),
526 };
527 result.map(|mut r| {
528 r.base_settings = Some(settings.clone());
529 r.server_command = Some(server_command.to_string());
530 r
531 })
532}
533
/// Inputs for `run_bench_tune_single_test`: one test that spawns its own server.
struct SingleTestCtx<'a> {
    /// Application configuration, forwarded to the server spawner.
    main_config: &'a crate::config::Config,
    /// Parameter combination under test; its set values override the base settings.
    params: &'a BenchTuneParamValue,
    /// Model the server is started with.
    model: &'a DiscoveredModel,
    /// Settings used as the starting point before overrides; copied into the result.
    base_settings: &'a ModelSettings,
    /// Number of inference requests to issue.
    num_iterations: u32,
    /// Prompt text sent with every request.
    prompt: String,
    /// Receives log lines.
    log_tx: mpsc::Sender<String>,
    /// Benchmark configuration (supplies the per-test timeout).
    config: &'a BenchTuneConfig,
    /// Shared HTTP client.
    client: &'a reqwest::Client,
}
545
546async fn run_bench_tune_single_test(
548 ctx: SingleTestCtx<'_>,
549) -> Result<BenchTuneResult, Box<dyn std::error::Error + Send + Sync>> {
550 let SingleTestCtx {
551 main_config,
552 params,
553 model,
554 base_settings,
555 num_iterations,
556 prompt,
557 log_tx,
558 config,
559 client,
560 } = ctx;
561 let mut settings = base_settings.clone();
563
564 if let Some(temperature) = params.temperature {
566 settings.temperature = temperature as f32;
567 }
568 if let Some(top_p) = params.top_p {
569 settings.top_p = top_p as f32;
570 }
571 if let Some(top_k) = params.top_k {
572 settings.top_k = top_k as i32;
573 }
574 if let Some(repeat_penalty) = params.repeat_penalty {
575 settings.repeat_penalty = repeat_penalty as f32;
576 }
577 if let Some(flash_attn) = params.flash_attn {
578 settings.flash_attn = flash_attn;
579 }
580 if let Some(threads) = params.threads {
581 settings.threads = threads;
582 settings.threads_batch = threads; }
584 if let Some(batch_size) = params.batch_size {
585 settings.batch_size = batch_size;
586 settings.ubatch_size = batch_size;
587 }
588 if let Some(expert_count) = params.expert_count {
589 settings.expert_count = expert_count;
590 }
591 if let Some(ref spec_type) = params.spec_type {
592 settings.spec_type = if spec_type == "Off" {
593 String::new()
594 } else {
595 spec_type.clone()
596 };
597 }
598 if let Some(draft_tokens) = params.draft_tokens {
599 settings.draft_tokens = draft_tokens;
600 }
601
602 let (exit_tx, _exit_rx) = tokio::sync::mpsc::channel(1);
604 let (server_handle, command) = spawn_server(SpawnServerRequest {
605 config: main_config,
606 model: Some(model),
607 settings: &settings,
608 log_tx: log_tx.clone(),
609 progress_tx: None,
610 server_mode: ServerMode::Normal,
611 router_max_models: 1,
612 exit_tx,
613 })
614 .await?;
615 let mut ready = false;
617 let host = if server_handle.host == "0.0.0.0" {
618 "127.0.0.1"
619 } else {
620 &server_handle.host
621 };
622
623 let _ = log_tx
624 .send(format!(
625 "Waiting for server on {}:{}...",
626 host, server_handle.port
627 ))
628 .await;
629
630 for i in 0..HEALTH_CHECK_ITERATIONS {
631 if crate::backend::server::check_health(host, server_handle.port).await {
632 ready = true;
633 break;
634 }
635 if i % HEALTH_CHECK_LOG_INTERVAL == 0 && i > 0 {
636 let _ = log_tx
637 .send(format!(
638 " ... still waiting ({:.0}s)...",
639 i as f32 * (HEALTH_CHECK_INTERVAL_MS as f32 / 1000.0)
640 ))
641 .await;
642 }
643 tokio::time::sleep(Duration::from_millis(HEALTH_CHECK_INTERVAL_MS)).await;
644 }
645
646 if !ready {
647 let _ = log_tx
648 .send("Error: Server health check timed out".to_string())
649 .await;
650 let _ = crate::backend::server::kill_server(server_handle).await;
651 return Err("Server failed to become healthy".into());
652 }
653
654 let loop_fut = run_iteration_loop(IterationLoopCtx {
655 prompt: &prompt,
656 host,
657 port: server_handle.port,
658 params,
659 num_iterations,
660 config,
661 client,
662 log_tx,
663 log_prefix: "",
664 });
665 let result = tokio::time::timeout(config.test_timeout, loop_fut).await;
666 let result = match result {
667 Ok(inner) => inner,
668 Err(_) => {
669 let _ = crate::backend::server::kill_server(server_handle).await;
670 return Err(format!("Test timed out after {:?}", config.test_timeout).into());
671 }
672 };
673
674 let _ = crate::backend::server::kill_server(server_handle).await;
675 tokio::time::sleep(Duration::from_secs(1)).await;
676
677 result.map(|mut r| {
678 r.base_settings = Some(base_settings.clone());
679 r.server_command = Some(command);
680 r
681 })
682}
683
684async fn send_inference_request(
686 prompt: &str,
687 host: &str,
688 port: u16,
689 params: &BenchTuneParamValue,
690 config: &BenchTuneConfig,
691 client: &reqwest::Client,
692) -> Result<InferenceResult, Box<dyn std::error::Error + Send + Sync>> {
693 let mut body = serde_json::json!({
695 "prompt": prompt,
696 "n_predict": config.n_predict,
697 "stream": false
698 });
699
700 if let Some(temperature) = params.temperature {
701 body["temperature"] = serde_json::json!(temperature);
702 }
703 if let Some(top_p) = params.top_p {
704 body["top_p"] = serde_json::json!(top_p);
705 }
706 if let Some(top_k) = params.top_k {
707 body["top_k"] = serde_json::json!(top_k);
708 }
709 if let Some(repeat_penalty) = params.repeat_penalty {
710 body["repeat_penalty"] = serde_json::json!(repeat_penalty);
711 }
712
713 let url = format!("http://{}:{}/completion", host, port);
714 let start = Instant::now();
715 let resp = client.post(url).json(&body).send().await?;
716
717 if !resp.status().is_success() {
718 let status = resp.status();
719 let body = resp.text().await.unwrap_or_else(|_| "no body".to_string());
720 return Err(format!("Server returned error {}: {}", status, body).into());
721 }
722
723 let total_time = start.elapsed();
724 let json: serde_json::Value = resp.json().await?;
725
726 let prompt_tokens = json["tokens_evaluated"]
728 .as_u64()
729 .or_else(|| json["prompt_n"].as_u64())
730 .unwrap_or(0);
731
732 let generation_tokens = json["tokens_predicted"]
733 .as_u64()
734 .or_else(|| json["predicted_n"].as_u64())
735 .unwrap_or(0);
736
737 let timings = &json["timings"];
738 let prompt_time_ms = timings["prompt_ms"]
739 .as_f64()
740 .or_else(|| timings["prompt_eval_ms"].as_f64())
741 .unwrap_or(0.0);
742
743 let generation_time_ms = timings["predicted_ms"]
744 .as_f64()
745 .or_else(|| timings["eval_ms"].as_f64())
746 .unwrap_or(0.0);
747
748 Ok(InferenceResult {
749 prompt_tokens,
750 generation_tokens,
751 prompt_time: Duration::from_millis(prompt_time_ms as u64),
752 generation_time: Duration::from_millis(generation_time_ms as u64),
753 total_time,
754 first_token_time: prompt_time_ms as u128,
755 content: json["content"].as_str().unwrap_or("").to_string(),
756 })
757}
758
759pub async fn save_results(
761 results: &[BenchTuneResult],
762 output_dir: &PathBuf,
763 config: &BenchTuneConfig,
764) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
765 std::fs::create_dir_all(output_dir)?;
767
768 let timestamp = chrono::Local::now().format("%Y%m%d_%H%M%S");
770 let filename = format!("benchmark_{}.md", timestamp);
771 let filepath = output_dir.join(filename);
772
773 let mut md = String::new();
774 md.push_str("# LLM Benchmark Results\n\n");
775 md.push_str(&format!(
776 "Generated on: {}\n\n",
777 chrono::Local::now().format("%Y-%m-%d %H:%M:%S")
778 ));
779
780 md.push_str("| Temp | Top-P | Top-K | RepPen | FA | Threads | Batch | Exp | Spec | Draft | Prompt t/s | Gen t/s | Latency (ms) | First Tok (ms) |\n");
781 md.push_str("|------|-------|-------|--------|----|---------|-------|-----|------|-------|------------|---------|--------------|----------------|\n");
782
783 for r in results {
784 let temp = r
785 .params
786 .temperature
787 .map(|v| format!("{:.2}", v))
788 .unwrap_or_else(|| "-".to_string());
789 let top_p = r
790 .params
791 .top_p
792 .map(|v| format!("{:.2}", v))
793 .unwrap_or_else(|| "-".to_string());
794 let top_k = r
795 .params
796 .top_k
797 .map(|v| v.to_string())
798 .unwrap_or_else(|| "-".to_string());
799 let rep_pen = r
800 .params
801 .repeat_penalty
802 .map(|v| format!("{:.2}", v))
803 .unwrap_or_else(|| "-".to_string());
804 let fa = r
805 .params
806 .flash_attn
807 .map(|v| if v { "ON" } else { "OFF" })
808 .unwrap_or("-");
809 let threads = r
810 .params
811 .threads
812 .map(|v| v.to_string())
813 .unwrap_or_else(|| "-".to_string());
814 let batch = r
815 .params
816 .batch_size
817 .map(|v| v.to_string())
818 .unwrap_or_else(|| "-".to_string());
819 let exp = r
820 .params
821 .expert_count
822 .map(|v| v.to_string())
823 .unwrap_or_else(|| "-".to_string());
824
825 let spec = r
826 .params
827 .spec_type
828 .as_ref()
829 .map(|s| {
830 if s.is_empty() {
831 "-".to_string()
832 } else {
833 s.clone()
834 }
835 })
836 .unwrap_or_else(|| "-".to_string());
837 let draft = r
838 .params
839 .draft_tokens
840 .map(|v| v.to_string())
841 .unwrap_or_else(|| "-".to_string());
842
843 md.push_str(&format!(
844 "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {:.2} | {:.2} | {:.2} | {:.2} |\n",
845 temp,
846 top_p,
847 top_k,
848 rep_pen,
849 fa,
850 threads,
851 batch,
852 exp,
853 spec,
854 draft,
855 r.metrics.prompt_tps,
856 r.metrics.generation_tps,
857 r.metrics.latency_per_token,
858 r.metrics.first_token_time
859 ));
860 }
861
862 tokio::fs::write(&filepath, md).await?;
863
864 let json_filename = format!("benchmark_{}.json", timestamp);
866 let json_filepath = output_dir.join(&json_filename);
867 let json_content = serde_json::to_string_pretty(&results)?;
868 tokio::fs::write(&json_filepath, json_content).await?;
869
870 let yaml_filename = format!("benchmark_{}.yaml", timestamp);
872 let yaml_filepath = output_dir.join(&yaml_filename);
873 let yaml_content = serde_yaml::to_string(&results)?;
874 tokio::fs::write(&yaml_filepath, yaml_content).await?;
875
876 let html_filename = format!("benchmark_{}.html", timestamp);
878 let html_filepath = output_dir.join(&html_filename);
879 let html_content = generate_html_report(results, config);
880 tokio::fs::write(&html_filepath, html_content).await?;
881
882 Ok(())
883}
884
885fn generate_html_report(results: &[BenchTuneResult], config: &BenchTuneConfig) -> String {
886 use chrono::Local;
887
888 let total_tests = results.len();
889 let timestamp = Local::now().format("%Y-%m-%d %H:%M:%S").to_string();
890
891 let model_info = results.first().and_then(|r| {
893 r.base_settings.as_ref().map(|s| {
894 let model_name = if config.model_path.file_name().is_some() {
895 config
896 .model_path
897 .file_name()
898 .unwrap()
899 .to_string_lossy()
900 .to_string()
901 } else {
902 config.model_path.display().to_string()
903 };
904 let file_size_mb = results
905 .first()
906 .and_then(|r| {
907 r.base_settings.as_ref().map(|_s| {
908 0u64
910 })
911 })
912 .unwrap_or(0);
913 (model_name, file_size_mb, s.clone())
914 })
915 });
916
917 struct ResolvedParams {
919 temperature: f64,
920 top_p: f64,
921 top_k: i64,
922 repeat_penalty: f64,
923 flash_attn: bool,
924 threads: u32,
925 batch_size: u32,
926 expert_count: i32,
927 spec_type: String,
928 draft_tokens: u32,
929 }
930
931 fn resolve_params(
932 params: &BenchTuneParamValue,
933 base: &crate::models::ModelSettings,
934 ) -> ResolvedParams {
935 ResolvedParams {
936 temperature: params.temperature.unwrap_or(base.temperature as f64),
937 top_p: params.top_p.unwrap_or(base.top_p as f64),
938 top_k: params.top_k.unwrap_or(base.top_k as i64),
939 repeat_penalty: params.repeat_penalty.unwrap_or(base.repeat_penalty as f64),
940 flash_attn: params.flash_attn.unwrap_or(base.flash_attn),
941 threads: params.threads.unwrap_or(base.threads),
942 batch_size: params.batch_size.unwrap_or(base.batch_size),
943 expert_count: params.expert_count.unwrap_or(base.expert_count),
944 spec_type: params
945 .spec_type
946 .clone()
947 .unwrap_or_else(|| base.spec_type.clone()),
948 draft_tokens: params.draft_tokens.unwrap_or(base.draft_tokens),
949 }
950 }
951
952 fn mean(vals: &[f64]) -> f64 {
954 if vals.is_empty() {
955 return 0.0;
956 }
957 vals.iter().sum::<f64>() / vals.len() as f64
958 }
959 fn median(vals: &mut [f64]) -> f64 {
960 if vals.is_empty() {
961 return 0.0;
962 }
963 vals.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
964 let mid = vals.len() / 2;
965 if vals.len().is_multiple_of(2) {
966 (vals[mid - 1] + vals[mid]) / 2.0
967 } else {
968 vals[mid]
969 }
970 }
971 fn std_dev(vals: &[f64], avg: f64) -> f64 {
972 if vals.len() <= 1 {
973 return 0.0;
974 }
975 let variance =
976 vals.iter().map(|v| (v - avg).powi(2)).sum::<f64>() / (vals.len() - 1) as f64;
977 variance.sqrt()
978 }
979 fn min_val(vals: &[f64]) -> f64 {
980 vals.iter().cloned().fold(f64::INFINITY, f64::min)
981 }
982 fn max_val(vals: &[f64]) -> f64 {
983 vals.iter().cloned().fold(f64::NEG_INFINITY, f64::max)
984 }
985
986 let gen_tps: Vec<f64> = results.iter().map(|r| r.metrics.generation_tps).collect();
987 let mut prompt_tps: Vec<f64> = results.iter().map(|r| r.metrics.prompt_tps).collect();
988 let latency: Vec<f64> = results
989 .iter()
990 .map(|r| r.metrics.latency_per_token)
991 .collect();
992 let mut first_token: Vec<f64> = results.iter().map(|r| r.metrics.first_token_time).collect();
993
994 let mut gen_tps_sorted = gen_tps.clone();
995 let mut latency_sorted = latency.clone();
996
997 let avg_gen_tps = mean(&gen_tps);
998 let avg_prompt_tps = mean(&prompt_tps);
999 let avg_latency = mean(&latency);
1000 let avg_first_token = mean(&first_token);
1001 let _avg_combined_tps = mean(
1002 &results
1003 .iter()
1004 .map(|r| r.metrics.combined_tps)
1005 .collect::<Vec<f64>>(),
1006 );
1007
1008 let gen_std = std_dev(&gen_tps, avg_gen_tps);
1009 let prompt_std = std_dev(&prompt_tps, avg_prompt_tps);
1010 let lat_std = std_dev(&latency, avg_latency);
1011 let ft_std = std_dev(&first_token, avg_first_token);
1012
1013 let best_idx = results
1014 .iter()
1015 .enumerate()
1016 .max_by(|a, b| {
1017 a.1.metrics
1018 .generation_tps
1019 .partial_cmp(&b.1.metrics.generation_tps)
1020 .unwrap_or(std::cmp::Ordering::Equal)
1021 })
1022 .map(|(i, _)| i);
1023 let best_gen_tps = if !gen_tps.is_empty() {
1024 max_val(&gen_tps)
1025 } else {
1026 0.0
1027 };
1028 let best_prompt_tps = if !prompt_tps.is_empty() {
1029 max_val(&prompt_tps)
1030 } else {
1031 0.0
1032 };
1033 let best_latency = if !latency.is_empty() {
1034 min_val(&latency)
1035 } else {
1036 0.0
1037 };
1038 let best_first_token = if !first_token.is_empty() {
1039 min_val(&first_token)
1040 } else {
1041 0.0
1042 };
1043 let min_gen_tps = min_val(&gen_tps);
1044 let min_prompt_tps = min_val(&prompt_tps);
1045 let min_latency = min_val(&latency);
1046 let min_first_token = min_val(&first_token);
1047
1048 let param_names = [("temperature", "Temperature"),
1050 ("top_p", "Top-P"),
1051 ("top_k", "Top-K"),
1052 ("repeat_penalty", "Repeat Penalty"),
1053 ("flash_attn", "Flash Attention"),
1054 ("threads", "Threads"),
1055 ("batch_size", "Batch Size"),
1056 ("expert_count", "Experts")];
1057
1058 let impact_data: Vec<(String, String, f64)> = param_names
1059 .iter()
1060 .filter_map(|(key, label)| {
1061 let values: Vec<f64> = results
1062 .iter()
1063 .filter_map(|r| {
1064 let base = r.base_settings.as_ref()?;
1065 let rp = resolve_params(&r.params, base);
1066 Some(match *key {
1067 "temperature" => rp.temperature,
1068 "top_p" => rp.top_p,
1069 "top_k" => rp.top_k as f64,
1070 "repeat_penalty" => rp.repeat_penalty,
1071 "flash_attn" => {
1072 if rp.flash_attn {
1073 1.0
1074 } else {
1075 0.0
1076 }
1077 }
1078 "threads" => rp.threads as f64,
1079 "batch_size" => rp.batch_size as f64,
1080 "expert_count" => rp.expert_count as f64,
1081 _ => return None,
1082 })
1083 })
1084 .collect();
1085
1086 let mut groups: std::collections::HashMap<String, Vec<f64>> =
1088 std::collections::HashMap::new();
1089 for (r, v) in results.iter().zip(values.iter()) {
1090 let key_str = if *key == "flash_attn" {
1091 if *v > 0.5 {
1092 "ON".to_string()
1093 } else {
1094 "OFF".to_string()
1095 }
1096 } else {
1097 format!("{:.2}", v)
1098 };
1099 groups
1100 .entry(key_str)
1101 .or_default()
1102 .push(r.metrics.generation_tps);
1103 }
1104
1105 if groups.len() <= 1 {
1106 return None;
1107 } let group_means: Vec<f64> = groups.values().map(|vals| mean(vals)).collect();
1110 let spread = max_val(&group_means) - min_val(&group_means);
1111 Some((label.to_string(), format!("{:.1}", spread), spread))
1112 })
1113 .collect();
1114
1115 let mut impact_sorted = impact_data.clone();
1117 impact_sorted.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
1118
1119 let consistency_data: Vec<f64> = results
1121 .iter()
1122 .map(|r| {
1123 if r.per_iteration_metrics.len() <= 1 {
1124 return 1.0; }
1126 let iter_gen_tps: Vec<f64> = r
1127 .per_iteration_metrics
1128 .iter()
1129 .map(|m| m.generation_tps)
1130 .collect();
1131 let iter_mean = mean(&iter_gen_tps);
1132 if iter_mean == 0.0 {
1133 return 1.0;
1134 }
1135 let iter_std = std_dev(&iter_gen_tps, iter_mean);
1136 let cv = iter_std / iter_mean; (1.0 - (cv * 5.0)).clamp(0.0, 1.0)
1140 })
1141 .collect();
1142
1143 let top_n = std::cmp::min(20, total_tests);
1145 let top_indices: Vec<(usize, usize)> = (0..total_tests)
1146 .map(|i| (i, results[i].metrics.generation_tps))
1147 .enumerate()
1148 .take(top_n)
1149 .map(|(rank, (idx, _))| (rank + 1, idx))
1150 .collect();
1151
1152 let top_labels: Vec<String> = top_indices
1153 .iter()
1154 .map(|(_rank, idx)| {
1155 let base = results[*idx].base_settings.as_ref().unwrap();
1156 let rp = resolve_params(&results[*idx].params, base);
1157 format!("T={:.2} TP={:.2}", rp.temperature, rp.top_p)
1158 })
1159 .collect();
1160 let top_gen_tps: Vec<f64> = top_indices
1161 .iter()
1162 .map(|(_, idx)| results[*idx].metrics.generation_tps)
1163 .collect();
1164
1165 let scatter_gen_tps: Vec<f64> = results.iter().map(|r| r.metrics.generation_tps).collect();
1167 let scatter_latency: Vec<f64> = results
1168 .iter()
1169 .map(|r| r.metrics.latency_per_token)
1170 .collect();
1171 let scatter_first_token: Vec<f64> =
1172 results.iter().map(|r| r.metrics.first_token_time).collect();
1173
1174 let param_headers: Vec<String> = vec![
1175 "Temp".to_string(),
1176 "Top-P".to_string(),
1177 "Top-K".to_string(),
1178 "RepPen".to_string(),
1179 "FA".to_string(),
1180 "Threads".to_string(),
1181 "Batch".to_string(),
1182 "Exp".to_string(),
1183 "Spec".to_string(),
1184 "Draft".to_string(),
1185 ];
1186 let param_vals: Vec<Vec<String>> = results
1187 .iter()
1188 .map(|r| {
1189 let base = r.base_settings.as_ref().unwrap();
1190 let rp = resolve_params(&r.params, base);
1191 vec![
1192 format!("{:.2}", rp.temperature),
1193 format!("{:.2}", rp.top_p),
1194 rp.top_k.to_string(),
1195 format!("{:.2}", rp.repeat_penalty),
1196 if rp.flash_attn {
1197 "ON".to_string()
1198 } else {
1199 "OFF".to_string()
1200 },
1201 rp.threads.to_string(),
1202 rp.batch_size.to_string(),
1203 rp.expert_count.to_string(),
1204 if rp.spec_type.is_empty() {
1205 "-".to_string()
1206 } else {
1207 rp.spec_type.clone()
1208 },
1209 rp.draft_tokens.to_string(),
1210 ]
1211 })
1212 .collect();
1213
1214 let metrics_data: Vec<serde_json::Value> = results
1216 .iter()
1217 .enumerate()
1218 .map(|(i, r)| {
1219 let base = r.base_settings.as_ref().unwrap();
1220 let rp = resolve_params(&r.params, base);
1221 serde_json::json!({
1222 "idx": i,
1223 "temp": rp.temperature,
1224 "top_p": rp.top_p,
1225 "top_k": rp.top_k,
1226 "repeat_penalty": rp.repeat_penalty,
1227 "flash_attn": rp.flash_attn,
1228 "threads": rp.threads,
1229 "batch_size": rp.batch_size,
1230 "expert_count": rp.expert_count,
1231 "spec_type": rp.spec_type,
1232 "draft_tokens": rp.draft_tokens,
1233 "prompt_tps": r.metrics.prompt_tps,
1234 "generation_tps": r.metrics.generation_tps,
1235 "combined_tps": r.metrics.combined_tps,
1236 "latency_per_token": r.metrics.latency_per_token,
1237 "first_token_time": r.metrics.first_token_time,
1238 "consistency": consistency_data[i],
1239 "outputs": r.outputs,
1240 "per_iteration_metrics": r.per_iteration_metrics.iter().map(|m| {
1241 serde_json::json!({
1242 "prompt_tps": m.prompt_tps,
1243 "generation_tps": m.generation_tps,
1244 "combined_tps": m.combined_tps,
1245 "latency_per_token": m.latency_per_token,
1246 "first_token_time": m.first_token_time,
1247 })
1248 }).collect::<Vec<_>>(),
1249 "server_command": r.server_command.as_deref().unwrap_or("-"),
1250 })
1251 })
1252 .collect();
1253
1254 let scatter_data_json = serde_json::to_string(
1256 &scatter_gen_tps
1257 .iter()
1258 .zip(scatter_latency.iter())
1259 .zip(scatter_first_token.iter())
1260 .map(|((g, l), f)| {
1261 let mut s = String::from("{x:");
1262 s.push_str(&format!("{:.2}", g));
1263 s.push_str(",y:");
1264 s.push_str(&format!("{:.2}", l));
1265 s.push_str(",ft:");
1266 s.push_str(&format!("{:.2}", f));
1267 s.push('}');
1268 s
1269 })
1270 .collect::<Vec<_>>(),
1271 )
1272 .unwrap();
1273 let scatter_data2_json = serde_json::to_string(
1274 &scatter_gen_tps
1275 .iter()
1276 .zip(scatter_first_token.iter())
1277 .map(|(g, f)| {
1278 let mut s = String::from("{x:");
1279 s.push_str(&format!("{:.2}", g));
1280 s.push_str(",y:");
1281 s.push_str(&format!("{:.2}", f));
1282 s.push_str(",lat:");
1283 s.push_str(&format!("{:.2}", min_val(&latency)));
1284 s.push('}');
1285 s
1286 })
1287 .collect::<Vec<_>>(),
1288 )
1289 .unwrap();
1290
1291 let model_meta_json = model_info.as_ref().map(|(name, _size, settings)| {
1293 serde_json::json!({
1294 "model_name": name,
1295 "context_length": settings.context_length,
1296 "threads": settings.threads,
1297 "temperature": settings.temperature,
1298 "top_p": settings.top_p,
1299 "top_k": settings.top_k,
1300 "repeat_penalty": settings.repeat_penalty,
1301 "flash_attn": settings.flash_attn,
1302 "kv_cache_offload": settings.kv_cache_offload,
1303 "mlock": settings.mlock,
1304 "system_prompt": settings.system_prompt,
1305 })
1306 });
1307
1308 let _impact_json = serde_json::to_string(&impact_sorted).unwrap();
1310
1311 let column_defs_json = serde_json::to_string(&vec![
1313 ("col-rank", "#", true),
1314 ("col-temp", "Temp", true),
1315 ("col-top-p", "Top-P", true),
1316 ("col-top-k", "Top-K", true),
1317 ("col-rep-pen", "RepPen", true),
1318 ("col-fa", "FA", true),
1319 ("col-threads", "Threads", true),
1320 ("col-batch", "Batch", true),
1321 ("col-exp", "Exp", true),
1322 ("col-spec", "Spec", true),
1323 ("col-draft", "Draft", true),
1324 ("col-gen-tps", "Gen t/s", true),
1325 ("col-prompt-tps", "Prompt t/s", true),
1326 ("col-latency", "Latency", true),
1327 ("col-first-token", "First Tok", true),
1328 ("col-combined", "Combined", true),
1329 ("col-consistency", "Consistency", true),
1330 ])
1331 .unwrap();
1332
1333 let csv_header = "Rank,Temp,Top-P,Top-K,RepPen,FA,Threads,Batch,Exp,Spec,Draft,Gen t/s,Prompt t/s,Latency (ms),First Tok (ms),Combined,Consistency";
1335 let csv_rows: Vec<String> = (0..total_tests)
1336 .map(|i| {
1337 let d = &metrics_data[i];
1338 let rank = i + 1;
1339 let spec = d
1340 .get("spec_type")
1341 .map(|v| v.as_str().unwrap_or("-"))
1342 .unwrap_or("-")
1343 .to_string();
1344 let draft = d
1345 .get("draft_tokens")
1346 .map(|v| v.as_u64().unwrap_or(0).to_string())
1347 .unwrap_or("-".to_string());
1348 format!(
1349 "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{:.1}",
1350 rank,
1351 d["temp"].as_f64().unwrap_or(0.0),
1352 d["top_p"].as_f64().unwrap_or(0.0),
1353 d["top_k"].as_i64().unwrap_or(0),
1354 d["repeat_penalty"].as_f64().unwrap_or(0.0),
1355 if d["flash_attn"].as_bool().unwrap_or(false) {
1356 "ON"
1357 } else {
1358 "OFF"
1359 },
1360 d["threads"].as_u64().unwrap_or(0),
1361 d["batch_size"].as_u64().unwrap_or(0),
1362 d["expert_count"].as_i64().unwrap_or(0),
1363 spec,
1364 draft,
1365 d["generation_tps"].as_f64().unwrap_or(0.0),
1366 d["prompt_tps"].as_f64().unwrap_or(0.0),
1367 d["latency_per_token"].as_f64().unwrap_or(0.0),
1368 d["first_token_time"].as_f64().unwrap_or(0.0),
1369 d["combined_tps"].as_f64().unwrap_or(0.0),
1370 d["consistency"].as_f64().unwrap_or(1.0)
1371 )
1372 })
1373 .collect();
1374 let csv_content = format!("{}\n{}", csv_header, csv_rows.join("\n"));
1375 let csv_b64 = base64_encode(&csv_content);
1376
1377 let metrics_json = serde_json::to_string(&metrics_data).unwrap();
1378 let param_headers_json = serde_json::to_string(¶m_headers).unwrap();
1379 let param_vals_json = serde_json::to_string(¶m_vals).unwrap();
1380 let top_labels_json = serde_json::to_string(&top_labels).unwrap();
1381 let top_gen_tps_json = serde_json::to_string(&top_gen_tps).unwrap();
1382
1383 let model_meta_html = model_info
1385 .as_ref()
1386 .map(|(name, _size, s)| {
1387 format!(
1388 r#"
1389<div class="meta-section">
1390<h2>Model & Configuration</h2>
1391<div class="meta-grid">
1392<div class="meta-item"><div class="ml">Model</div><div class="mv">{}</div></div>
1393<div class="meta-item"><div class="ml">Context</div><div class="mv">{}</div></div>
1394<div class="meta-item"><div class="ml">Threads</div><div class="mv">{}</div></div>
1395<div class="meta-item"><div class="ml">Flash Attention</div><div class="mv">{}</div></div>
1396<div class="meta-item"><div class="ml">KV Cache Offload</div><div class="mv">{}</div></div>
1397<div class="meta-item"><div class="ml">MLOCK</div><div class="mv">{}</div></div>
1398<div class="meta-item"><div class="ml">Prompt</div><div class="mv meta-prompt">{}</div></div>
1399</div>
1400</div>"#,
1401 escape_html(name),
1402 s.context_length,
1403 s.threads,
1404 if s.flash_attn { "ON" } else { "OFF" },
1405 if s.kv_cache_offload { "ON" } else { "OFF" },
1406 if s.mlock { "ON" } else { "OFF" },
1407 escape_html(&s.system_prompt.chars().take(100).collect::<String>())
1408 )
1409 })
1410 .unwrap_or_default();
1411
1412 let winner_html = best_idx.and_then(|idx| {
1414 let r = &results[idx];
1415 let base = r.base_settings.as_ref()?;
1416 let rp = resolve_params(&r.params, base);
1417 let m = &r.metrics;
1418 Some(format!(r#"
1419<div class="winner-section">
1420<div class="winner-icon">🏆</div>
1421<div class="winner-content">
1422<div class="winner-title">Best Configuration</div>
1423<div class="winner-metrics">
1424<div class="winner-metric"><span class="wm-label">Gen t/s</span><span class="wm-value" style="color:#3fb950;font-size:1.8em;">{:.2}</span></div>
1425<div class="winner-metric"><span class="wm-label">Prompt t/s</span><span class="wm-value">{:.2}</span></div>
1426<div class="winner-metric"><span class="wm-label">Latency</span><span class="wm-value">{:.2}ms</span></div>
1427<div class="winner-metric"><span class="wm-label">First Token</span><span class="wm-value">{:.0}ms</span></div>
1428</div>
1429<div class="winner-params">Temp: {:.2} · Top-P: {:.2} · Top-K: {} · RepPen: {:.2} · FA: {} · Threads: {} · Batch: {} · Exp: {} · Spec: {} · Draft: {}</div>
1430</div>
1431</div>"#,
1432 m.generation_tps, m.prompt_tps, m.latency_per_token, m.first_token_time,
1433 rp.temperature, rp.top_p, rp.top_k, rp.repeat_penalty,
1434 if rp.flash_attn { "ON" } else { "OFF" }, rp.threads,
1435 rp.batch_size, rp.expert_count,
1436 if rp.spec_type.is_empty() { "Off".to_string() } else { rp.spec_type.clone() }, rp.draft_tokens
1437 ))
1438 }).unwrap_or_default();
1439
1440 let impact_html = if !impact_sorted.is_empty() {
1442 let max_impact = impact_sorted[0].2;
1443 let rows: String = impact_sorted
1444 .iter()
1445 .map(|(label, spread, value)| {
1446 let bar_width = if max_impact > 0.0 {
1447 (value / max_impact * 100.0) as i32
1448 } else {
1449 0
1450 };
1451 let bar_color = if *value > max_impact * 0.7 {
1452 "#f85149"
1453 } else if *value > max_impact * 0.4 {
1454 "#d29922"
1455 } else {
1456 "#3fb950"
1457 };
1458 format!(
1459 r#"<div class="impact-row">
1460<div class="impact-label">{}</div>
1461<div class="impact-bar-bg"><div class="impact-bar-fill" style="width:{}%;background:{}"></div></div>
1462<div class="impact-value">{}</div>
1463</div>"#,
1464 label, bar_width, bar_color, spread
1465 )
1466 })
1467 .collect();
1468 format!(
1469 r#"
1470<div class="impact-section">
1471<h2>Parameter Impact Analysis</h2>
1472<p class="impact-desc">Larger spread in generation throughput between parameter values indicates greater impact on performance.</p>
1473{}
1474</div>"#,
1475 rows
1476 )
1477 } else {
1478 r#"<div class="impact-section"><h2>Parameter Impact Analysis</h2><p class="impact-desc">All parameters were held constant — no impact data available.</p></div>"#.to_string()
1479 };
1480
1481 let empty_html = if total_tests == 0 {
1483 r#"<div class="empty-state">
1484<div class="empty-icon">📊</div>
1485<div class="empty-title">No Results</div>
1486<div class="empty-text">Run a benchmark tuning test to generate results here.</div>
1487</div>"#
1488 } else {
1489 ""
1490 };
1491
1492 let html = include_str!("benchmark_report.html");
1493
1494 html.replace("__TIMESTAMP__", ×tamp)
1496 .replace("__TOTAL_TESTS__", &total_tests.to_string())
1497 .replace("__EMPTY_STATE__", empty_html)
1498 .replace("__MODEL_META__", &model_meta_html)
1499 .replace("__WINNER__", &winner_html)
1500 .replace("__AVG_GEN_TPS__", &format!("{:.1}", avg_gen_tps))
1501 .replace(
1502 "__MED_GEN_TPS__",
1503 &format!("{:.1}", median(&mut gen_tps_sorted)),
1504 )
1505 .replace("__GEN_STD__", &format!("{:.1}", gen_std))
1506 .replace("__MIN_GEN__", &format!("{:.1}", min_gen_tps))
1507 .replace("__MAX_GEN__", &format!("{:.1}", best_gen_tps))
1508 .replace("__AVG_PROMPT_TPS__", &format!("{:.1}", avg_prompt_tps))
1509 .replace(
1510 "__MED_PROMPT_TPS__",
1511 &format!("{:.1}", median(&mut prompt_tps)),
1512 )
1513 .replace("__PROMPT_STD__", &format!("{:.1}", prompt_std))
1514 .replace("__MIN_PROMPT__", &format!("{:.1}", min_prompt_tps))
1515 .replace("__MAX_PROMPT__", &format!("{:.1}", best_prompt_tps))
1516 .replace("__AVG_LATENCY__", &format!("{:.1}ms", avg_latency))
1517 .replace(
1518 "__MED_LATENCY__",
1519 &format!("{:.1}ms", median(&mut latency_sorted)),
1520 )
1521 .replace("__LAT_STD__", &format!("{:.1}", lat_std))
1522 .replace("__MIN_LAT__", &format!("{:.1}", min_latency))
1523 .replace("__MAX_LAT__", &format!("{:.1}", best_latency))
1524 .replace("__AVG_FT__", &format!("{:.0}ms", avg_first_token))
1525 .replace("__MED_FT__", &format!("{:.0}ms", median(&mut first_token)))
1526 .replace("__FT_STD__", &format!("{:.0}", ft_std))
1527 .replace("__MIN_FT__", &format!("{:.0}ms", min_first_token))
1528 .replace("__MAX_FT__", &format!("{:.0}ms", best_first_token))
1529 .replace("__BEST_GEN__", &format!("{:.1}", best_gen_tps))
1530 .replace("__TOP_N__", &top_n.to_string())
1531 .replace("__IMPACT_HTML__", &impact_html)
1532 .replace("__METRICS_JSON__", &metrics_json)
1533 .replace("__PARAM_HEADERS_JSON__", ¶m_headers_json)
1534 .replace("__PARAM_VALS_JSON__", ¶m_vals_json)
1535 .replace("__TOP_LABELS_JSON__", &top_labels_json)
1536 .replace("__TOP_GEN_TPS_JSON__", &top_gen_tps_json)
1537 .replace("__SCATTER_DATA_JSON__", &scatter_data_json)
1538 .replace("__SCATTER_DATA2_JSON__", &scatter_data2_json)
1539 .replace("__COLUMN_DEFS_JSON__", &column_defs_json)
1540 .replace("__CSV_B64__", &csv_b64)
1541 .replace(
1542 "__MODEL_META_JSON__",
1543 &serde_json::to_string(&model_meta_json).unwrap(),
1544 )
1545}
1546
/// Escapes the HTML-significant characters in `s` so the text can be embedded
/// in element content or a double-quoted attribute value.
///
/// The characters `&`, `<`, `>` and `"` are replaced with `&amp;`, `&lt;`,
/// `&gt;` and `&quot;` respectively; every other character is copied through
/// unchanged. Single quotes are not escaped, so the result must not be placed
/// inside a single-quoted attribute.
///
/// Returns a newly allocated `String`; the input is not modified.
fn escape_html(s: &str) -> String {
    // A single pass means the `&` that starts each emitted entity is never
    // looked at again, so ordering of the replacements cannot cause
    // double-escaping.
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
1554
/// Encodes the UTF-8 bytes of `input` as standard Base64 (alphabet
/// `A-Z a-z 0-9 + /`) with `=` padding.
///
/// Each group of up to three input bytes becomes four output characters;
/// a trailing group of one or two bytes is padded with `==` or `=`.
/// An empty input yields an empty string.
fn base64_encode(input: &str) -> String {
    const ALPHABET: &[u8; 64] =
        b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
    const PAD: char = '=';

    // Extracts the 6-bit value found `shift` bits up in the 24-bit group and
    // maps it to its alphabet character.
    let sextet = |group: u32, shift: u32| ALPHABET[((group >> shift) & 0x3F) as usize] as char;

    let mut encoded = String::new();
    for block in input.as_bytes().chunks(3) {
        // `chunks` never yields an empty slice, so the first byte is always
        // present; the second and third may be missing in the final block.
        let first = block[0];
        let second = block.get(1).copied();
        let third = block.get(2).copied();

        // Missing bytes contribute zero bits to the 24-bit group.
        let group = (u32::from(first) << 16)
            | (u32::from(second.unwrap_or(0)) << 8)
            | u32::from(third.unwrap_or(0));

        encoded.push(sextet(group, 18));
        encoded.push(sextet(group, 12));
        encoded.push(second.map_or(PAD, |_| sextet(group, 6)));
        encoded.push(third.map_or(PAD, |_| sextet(group, 0)));
    }
    encoded
}
1580
/// Token counts, timings and generated text grouped as one record.
///
/// NOTE(review): this struct is populated outside the visible part of the
/// file; the per-field descriptions below are inferred from the field names
/// and types and should be confirmed against the code that fills them in.
struct InferenceResult {
    /// Number of prompt tokens (presumably as reported for the request).
    prompt_tokens: u64,
    /// Number of generated tokens.
    generation_tokens: u64,
    /// Time attributed to prompt processing.
    prompt_time: Duration,
    /// Time attributed to token generation.
    generation_time: Duration,
    /// Overall time for the request.
    total_time: Duration,
    /// Time until the first token; presumably milliseconds — TODO confirm unit.
    first_token_time: u128,
    /// Text content associated with the result (presumably the model output).
    content: String,
}