1pub mod arrivals;
27pub mod env;
28pub mod profile;
29pub mod report;
30pub mod stats;
31pub mod trace;
32
33pub use env::{Env, EnvHash};
34pub use profile::{
35 configure_global_profile, flush_global_profile, global_profile, parse_profile_event_value,
36 parse_profile_jsonl_str, profile_fields_from_json, ProfileEvent, ProfileJsonlWriter,
37 ProfileMetadata, ProfileSinkConfig,
38};
39pub use stats::{ci95_half_width, percentile, student_t_975, PercentileStats, ScalarStats};
40
41use serde::{Deserialize, Serialize};
42
/// Benchmark scenario recorded in a [`BenchReport`].
///
/// Serialized by serde in `snake_case`, so each variant appears in JSON under
/// the lowercase name noted on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum Scenario {
    /// Serialized as `"closed_loop"`. The tests in this file pair it with
    /// `concurrency: Some(_)` and no request rate.
    ClosedLoop,
    /// Serialized as `"open_loop"`. The tests in this file pair it with
    /// `request_rate: Some(_)` and no concurrency.
    OpenLoop,
    /// Serialized as `"shared_prefix"`.
    // NOTE(review): presumably requests that share a common prompt prefix — confirm.
    SharedPrefix,
    /// Serialized as `"cli"`.
    // NOTE(review): presumably a run driven through the command-line tool — confirm.
    Cli,
}
56
/// Latency thresholds, in milliseconds, used to decide whether a request
/// counts towards goodput.
///
/// Despite the `p99` in the field names, [`RequestRecord::meets_slo`] applies
/// each threshold to an individual request with `<=`; it does not compare
/// against an aggregated percentile.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct Slo {
    /// Upper bound for a request's `ttft_ms`.
    pub ttft_p99_ms: f64,
    /// Upper bound for a request's `tpot_ms()`; not applied when that is `None`.
    pub tpot_p99_ms: f64,
    /// Upper bound for a request's `e2e_ms`.
    pub e2e_p99_ms: f64,
}
68
69impl Default for Slo {
70 fn default() -> Self {
71 Self {
73 ttft_p99_ms: 500.0,
74 tpot_p99_ms: 50.0,
75 e2e_p99_ms: 30_000.0,
76 }
77 }
78}
79
/// Statistics for four percentiles of one latency metric.
///
/// `compute_metrics` fills each field from one percentile value per run, so
/// every field summarises how that percentile varied across runs.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricSet {
    /// Statistics of the per-run 50th percentile.
    pub p50: PercentileStats,
    /// Statistics of the per-run 75th percentile.
    pub p75: PercentileStats,
    /// Statistics of the per-run 95th percentile.
    pub p95: PercentileStats,
    /// Statistics of the per-run 99th percentile.
    pub p99: PercentileStats,
}
89
/// Aggregated result of one benchmark configuration, as built by
/// `compute_metrics`.
///
/// Derives `Serialize`/`Deserialize`; the tests in this file round-trip it
/// through JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BenchReport {
    /// Model identifier, copied from the caller.
    pub model: String,
    /// Backend identifier, copied from the caller.
    pub backend: String,
    /// Scenario the benchmark ran under.
    pub scenario: Scenario,

    /// Concurrency setting, if any. Left out of the serialized output when
    /// `None`, and read back as `None` when the key is absent.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub concurrency: Option<u32>,
    /// Request-rate setting, if any. Left out of the serialized output when
    /// `None`, and read back as `None` when the key is absent.
    // NOTE(review): unit is presumably requests per second — confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub request_rate: Option<f64>,

    /// Caller-supplied value.
    // NOTE(review): presumably the prompt length in tokens — confirm.
    pub n_prompt: u32,
    /// Caller-supplied value.
    // NOTE(review): presumably the number of tokens to generate — confirm.
    pub n_gen: u32,
    /// Number of runs that were aggregated.
    pub n_repeats: u32,
    /// Number of records in the first run, failed requests included.
    pub n_requests_per_run: u32,
    /// Caller-supplied warm-up request count; not used in any calculation here.
    pub warmup_requests: u32,

    /// Percentile statistics of `RequestRecord::ttft_ms` over successful requests.
    pub ttft_ms: MetricSet,
    /// Percentile statistics of `RequestRecord::tpot_ms()` over successful
    /// requests for which it is defined.
    pub tpot_ms: MetricSet,
    /// Percentile statistics of the `itl_ms` samples pooled across a run's
    /// successful requests.
    pub itl_ms: MetricSet,
    /// Percentile statistics of `RequestRecord::e2e_ms` over successful requests.
    pub e2e_ms: MetricSet,

    /// Output tokens of successful requests divided by run duration in seconds.
    pub output_throughput_tps: ScalarStats,
    /// Input plus output tokens of successful requests divided by run
    /// duration in seconds.
    pub total_throughput_tps: ScalarStats,
    /// Successful requests divided by run duration in seconds.
    pub request_throughput_rps: ScalarStats,
    /// Successful requests that also satisfy `slo`, divided by run duration
    /// in seconds.
    pub goodput_rps: ScalarStats,

    /// Thresholds that were used to compute `goodput_rps`.
    pub slo: Slo,

    /// Number of successful records in each run, in run order.
    pub completed_per_run: Vec<u32>,
    /// Number of failed records in each run, in run order.
    pub errored_per_run: Vec<u32>,

    /// Environment description supplied by the caller.
    pub env: Env,
    /// Result of `env.hash()` at the time the report was built.
    pub env_hash: EnvHash,
}
130
/// Measurements for a single benchmark request.
#[derive(Debug, Clone)]
pub struct RequestRecord {
    /// Whether the request completed successfully. Failed requests are
    /// excluded from latency and throughput aggregation in `compute_metrics`.
    pub success: bool,
    /// Time to first token, in milliseconds (per the field name).
    pub ttft_ms: f64,
    /// End-to-end latency, in milliseconds (per the field name).
    pub e2e_ms: f64,
    /// Number of input tokens; counted towards total token throughput.
    pub input_tokens: u32,
    /// Number of output tokens; also the basis for `tpot_ms()`.
    pub output_tokens: u32,
    /// Latency samples in milliseconds that `compute_metrics` pools per run
    /// for the `itl_ms` percentiles.
    // NOTE(review): presumably one inter-token gap per generated token after the first — confirm.
    pub itl_ms: Vec<f64>,
}
143
144impl RequestRecord {
145 pub fn tpot_ms(&self) -> Option<f64> {
147 if self.output_tokens < 2 {
148 return None;
149 }
150 Some((self.e2e_ms - self.ttft_ms) / (self.output_tokens - 1) as f64)
151 }
152
153 pub fn meets_slo(&self, slo: &Slo) -> bool {
156 if !self.success {
157 return false;
158 }
159 let ttft_ok = self.ttft_ms <= slo.ttft_p99_ms;
160 let e2e_ok = self.e2e_ms <= slo.e2e_p99_ms;
161 let tpot_ok = self.tpot_ms().map(|t| t <= slo.tpot_p99_ms).unwrap_or(true);
162 ttft_ok && e2e_ok && tpot_ok
163 }
164}
165
/// All request records collected during one benchmark run.
#[derive(Debug, Clone)]
pub struct RunRecord {
    /// One entry per request issued in the run, successful or not.
    pub records: Vec<RequestRecord>,
    /// Duration of the run in seconds; `compute_metrics` uses it as the
    /// denominator of every throughput figure.
    pub duration_s: f64,
}
174
175impl RunRecord {
176 pub fn n_completed(&self) -> u32 {
177 self.records.iter().filter(|r| r.success).count() as u32
178 }
179 pub fn n_errored(&self) -> u32 {
180 self.records.iter().filter(|r| !r.success).count() as u32
181 }
182}
183
184#[allow(clippy::too_many_arguments)]
195pub fn compute_metrics(
196 model: String,
197 backend: String,
198 scenario: Scenario,
199 concurrency: Option<u32>,
200 request_rate: Option<f64>,
201 n_prompt: u32,
202 n_gen: u32,
203 warmup_requests: u32,
204 slo: Slo,
205 runs: Vec<RunRecord>,
206 env: Env,
207) -> BenchReport {
208 assert!(!runs.is_empty(), "compute_metrics: n_repeats must be ≥ 1");
209 let n_repeats = runs.len() as u32;
210 let n_requests_per_run = runs[0].records.len() as u32;
211
212 let mut ttft_p50 = Vec::with_capacity(runs.len());
213 let mut ttft_p75 = Vec::with_capacity(runs.len());
214 let mut ttft_p95 = Vec::with_capacity(runs.len());
215 let mut ttft_p99 = Vec::with_capacity(runs.len());
216 let mut tpot_p50 = Vec::with_capacity(runs.len());
217 let mut tpot_p75 = Vec::with_capacity(runs.len());
218 let mut tpot_p95 = Vec::with_capacity(runs.len());
219 let mut tpot_p99 = Vec::with_capacity(runs.len());
220 let mut itl_p50 = Vec::with_capacity(runs.len());
221 let mut itl_p75 = Vec::with_capacity(runs.len());
222 let mut itl_p95 = Vec::with_capacity(runs.len());
223 let mut itl_p99 = Vec::with_capacity(runs.len());
224 let mut e2e_p50 = Vec::with_capacity(runs.len());
225 let mut e2e_p75 = Vec::with_capacity(runs.len());
226 let mut e2e_p95 = Vec::with_capacity(runs.len());
227 let mut e2e_p99 = Vec::with_capacity(runs.len());
228
229 let mut output_thr = Vec::with_capacity(runs.len());
230 let mut total_thr = Vec::with_capacity(runs.len());
231 let mut req_thr = Vec::with_capacity(runs.len());
232 let mut good_thr = Vec::with_capacity(runs.len());
233
234 let mut completed_per_run = Vec::with_capacity(runs.len());
235 let mut errored_per_run = Vec::with_capacity(runs.len());
236
237 for run in &runs {
238 let success: Vec<&RequestRecord> = run.records.iter().filter(|r| r.success).collect();
239 completed_per_run.push(success.len() as u32);
240 errored_per_run.push((run.records.len() - success.len()) as u32);
241
242 let ttfts: Vec<f64> = success.iter().map(|r| r.ttft_ms).collect();
243 let tpots: Vec<f64> = success.iter().filter_map(|r| r.tpot_ms()).collect();
244 let e2es: Vec<f64> = success.iter().map(|r| r.e2e_ms).collect();
245 let itls: Vec<f64> = success
246 .iter()
247 .flat_map(|r| r.itl_ms.iter().copied())
248 .collect();
249
250 ttft_p50.push(percentile(&ttfts, 0.50));
251 ttft_p75.push(percentile(&ttfts, 0.75));
252 ttft_p95.push(percentile(&ttfts, 0.95));
253 ttft_p99.push(percentile(&ttfts, 0.99));
254 tpot_p50.push(percentile(&tpots, 0.50));
255 tpot_p75.push(percentile(&tpots, 0.75));
256 tpot_p95.push(percentile(&tpots, 0.95));
257 tpot_p99.push(percentile(&tpots, 0.99));
258 itl_p50.push(percentile(&itls, 0.50));
259 itl_p75.push(percentile(&itls, 0.75));
260 itl_p95.push(percentile(&itls, 0.95));
261 itl_p99.push(percentile(&itls, 0.99));
262 e2e_p50.push(percentile(&e2es, 0.50));
263 e2e_p75.push(percentile(&e2es, 0.75));
264 e2e_p95.push(percentile(&e2es, 0.95));
265 e2e_p99.push(percentile(&e2es, 0.99));
266
267 let total_in: u64 = success.iter().map(|r| r.input_tokens as u64).sum();
268 let total_out: u64 = success.iter().map(|r| r.output_tokens as u64).sum();
269 let dur = run.duration_s.max(f64::EPSILON);
270 output_thr.push(total_out as f64 / dur);
271 total_thr.push((total_in + total_out) as f64 / dur);
272 req_thr.push(success.len() as f64 / dur);
273
274 let good = success.iter().filter(|r| r.meets_slo(&slo)).count();
275 good_thr.push(good as f64 / dur);
276 }
277
278 let env_hash = env.hash();
279 BenchReport {
280 model,
281 backend,
282 scenario,
283 concurrency,
284 request_rate,
285 n_prompt,
286 n_gen,
287 n_repeats,
288 n_requests_per_run,
289 warmup_requests,
290 ttft_ms: MetricSet {
291 p50: ScalarStats::from_samples(&ttft_p50),
292 p75: ScalarStats::from_samples(&ttft_p75),
293 p95: ScalarStats::from_samples(&ttft_p95),
294 p99: ScalarStats::from_samples(&ttft_p99),
295 },
296 tpot_ms: MetricSet {
297 p50: ScalarStats::from_samples(&tpot_p50),
298 p75: ScalarStats::from_samples(&tpot_p75),
299 p95: ScalarStats::from_samples(&tpot_p95),
300 p99: ScalarStats::from_samples(&tpot_p99),
301 },
302 itl_ms: MetricSet {
303 p50: ScalarStats::from_samples(&itl_p50),
304 p75: ScalarStats::from_samples(&itl_p75),
305 p95: ScalarStats::from_samples(&itl_p95),
306 p99: ScalarStats::from_samples(&itl_p99),
307 },
308 e2e_ms: MetricSet {
309 p50: ScalarStats::from_samples(&e2e_p50),
310 p75: ScalarStats::from_samples(&e2e_p75),
311 p95: ScalarStats::from_samples(&e2e_p95),
312 p99: ScalarStats::from_samples(&e2e_p99),
313 },
314 output_throughput_tps: ScalarStats::from_samples(&output_thr),
315 total_throughput_tps: ScalarStats::from_samples(&total_thr),
316 request_throughput_rps: ScalarStats::from_samples(&req_thr),
317 goodput_rps: ScalarStats::from_samples(&good_thr),
318 slo,
319 completed_per_run,
320 errored_per_run,
321 env,
322 env_hash,
323 }
324}
325
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that two floating-point values differ by less than `1e-9`.
    macro_rules! assert_close {
        ($actual:expr, $expected:expr) => {
            assert!(($actual - $expected).abs() < 1e-9)
        };
    }

    /// Builds a request record with no inter-token latency samples.
    fn req(success: bool, ttft: f64, e2e: f64, in_tok: u32, out_tok: u32) -> RequestRecord {
        RequestRecord {
            success,
            ttft_ms: ttft,
            e2e_ms: e2e,
            input_tokens: in_tok,
            output_tokens: out_tok,
            itl_ms: Vec::new(),
        }
    }

    /// Wraps `records` into a run that lasted `duration_s` seconds.
    fn make_run(records: Vec<RequestRecord>, duration_s: f64) -> RunRecord {
        RunRecord {
            records,
            duration_s,
        }
    }

    /// Runs `compute_metrics` with the fixed "test"/"cpu" settings shared by
    /// the aggregation tests (10 prompt tokens, 10 generated, no warm-up,
    /// default SLO and environment).
    fn report_for(
        scenario: Scenario,
        concurrency: Option<u32>,
        request_rate: Option<f64>,
        runs: Vec<RunRecord>,
    ) -> BenchReport {
        compute_metrics(
            String::from("test"),
            String::from("cpu"),
            scenario,
            concurrency,
            request_rate,
            10,
            10,
            0,
            Slo::default(),
            runs,
            Env::default(),
        )
    }

    #[test]
    fn tpot_undefined_for_short_response() {
        // A single output token leaves no decode step to average over.
        let one_token = req(true, 100.0, 100.0, 5, 1);
        assert_eq!(one_token.tpot_ms(), None);

        // Two tokens: (200 - 100) / (2 - 1) = 100 ms.
        let two_tokens = req(true, 100.0, 200.0, 5, 2);
        assert_eq!(two_tokens.tpot_ms(), Some(100.0));
    }

    #[test]
    fn slo_short_response_treated_as_tpot_ok() {
        // TPOT is undefined for one token, so only TTFT and e2e are checked.
        let one_token = req(true, 100.0, 200.0, 5, 1);
        assert!(one_token.meets_slo(&Slo::default()));
    }

    #[test]
    fn slo_failure_modes() {
        let slo = Slo::default();
        let cases = [
            // TTFT of 1000 ms exceeds the 500 ms default.
            (req(true, 1000.0, 1100.0, 5, 10), false),
            // End-to-end of 40 000 ms exceeds the 30 000 ms default.
            (req(true, 100.0, 40_000.0, 5, 10), false),
            // A failed request never meets the SLO.
            (req(false, 100.0, 200.0, 5, 10), false),
            // Everything within bounds.
            (req(true, 100.0, 200.0, 5, 10), true),
        ];
        for (record, expected) in cases.iter() {
            assert_eq!(record.meets_slo(&slo), *expected);
        }
    }

    #[test]
    fn aggregate_three_repeats() {
        // Four successful requests over ten seconds, repeated identically.
        let uniform_run = || {
            let records = [
                (100.0, 200.0),
                (120.0, 240.0),
                (140.0, 280.0),
                (160.0, 320.0),
            ]
            .iter()
            .map(|&(ttft, e2e)| req(true, ttft, e2e, 10, 10))
            .collect();
            make_run(records, 10.0)
        };
        let runs: Vec<RunRecord> = (0..3).map(|_| uniform_run()).collect();

        let report = report_for(Scenario::ClosedLoop, Some(4), None, runs);

        assert_eq!(report.n_repeats, 3);
        assert_eq!(report.n_requests_per_run, 4);
        // Identical runs produce identical per-run medians.
        assert_eq!(report.ttft_ms.p50.stddev, 0.0);
        assert_close!(report.ttft_ms.p50.mean, 130.0);
        // 4 requests x 10 output tokens / 10 s.
        assert_close!(report.output_throughput_tps.mean, 4.0);
        // 4 requests / 10 s.
        assert_close!(report.request_throughput_rps.mean, 0.4);
        // Every request is within the default SLO.
        assert_close!(report.goodput_rps.mean, 0.4);
        assert!(report.env_hash.as_str().starts_with("sha256:"));
    }

    #[test]
    fn goodput_excludes_slo_violators() {
        let records = vec![
            // Within the SLO.
            req(true, 100.0, 200.0, 10, 10),
            // TTFT too high.
            req(true, 1000.0, 1100.0, 10, 10),
            // End-to-end too high.
            req(true, 100.0, 40_000.0, 10, 10),
            // Failed outright.
            req(false, 100.0, 200.0, 10, 10),
        ];
        let runs = vec![make_run(records, 10.0)];

        let report = report_for(Scenario::OpenLoop, None, Some(10.0), runs);

        // Three successes / 10 s.
        assert_close!(report.request_throughput_rps.mean, 0.3);
        // Only the first request meets the SLO: 1 / 10 s.
        assert_close!(report.goodput_rps.mean, 0.1);
    }

    #[test]
    fn json_round_trip() {
        let run = make_run(
            vec![
                req(true, 100.0, 200.0, 10, 10),
                req(true, 120.0, 240.0, 10, 10),
            ],
            5.0,
        );
        let report = compute_metrics(
            String::from("qwen3:0.6b"),
            String::from("metal"),
            Scenario::ClosedLoop,
            Some(2),
            None,
            256,
            128,
            10,
            Slo::default(),
            vec![run; 3],
            Env::default(),
        );

        let encoded = serde_json::to_string_pretty(&report).unwrap();
        let decoded: BenchReport = serde_json::from_str(&encoded).unwrap();

        assert_eq!(decoded.model, "qwen3:0.6b");
        assert_eq!(decoded.backend, "metal");
        assert_eq!(decoded.n_repeats, 3);
        assert_eq!(decoded.concurrency, Some(2));
        assert_eq!(decoded.request_rate, None);
    }
}