ferrum_bench_core/
lib.rs

//! ferrum-bench-core — canonical schema, metric aggregation, and
//! variance reporting for ferrum's `bench` and `bench-serve` commands.
//!
//! Locked by `docs/bench/PLAYBOOK.md` § 7. Do not invent variants;
//! producers and consumers (bench, bench-serve, compare-commits,
//! visualizer, dashboards) all build against the types here.
//!
//! # Quick map
//!
//! - [`BenchReport`] — top-level: one bench cell, aggregated across `n_repeats`
//! - [`Scenario`] — closed-loop / open-loop / shared-prefix / cli
//! - [`MetricSet`] — p50/p75/p95/p99 of one latency metric
//! - [`ScalarStats`] — `{mean, stddev, ci95_hw}` ([`stats`] module)
//! - [`Env`] + [`EnvHash`] — apples-to-apples cell identity ([`env`] module)
//! - [`ProfileEvent`] — locked structured profile JSONL envelope ([`profile`] module)
//! - [`compute_metrics`] — the one aggregator both bench CLIs call
//! - [`arrivals`] module — Poisson inter-arrival times for open-loop
//!
//! # Determinism notes
//!
//! - JSON keys are emitted in struct field-declaration order; field
//!   order is part of the locked schema and should not change.
//! - `BTreeMap` (not `HashMap`) for any dynamic key-value bag.
//! - CI95 fields are suppressed when `n_repeats < 3` (degenerate).
25
26pub mod arrivals;
27pub mod env;
28pub mod profile;
29pub mod report;
30pub mod stats;
31pub mod trace;
32
33pub use env::{Env, EnvHash};
34pub use profile::{
35    configure_global_profile, flush_global_profile, global_profile, parse_profile_event_value,
36    parse_profile_jsonl_str, profile_fields_from_json, ProfileEvent, ProfileJsonlWriter,
37    ProfileMetadata, ProfileSinkConfig,
38};
39pub use stats::{ci95_half_width, percentile, student_t_975, PercentileStats, ScalarStats};
40
41use serde::{Deserialize, Serialize};
42
43/// Locked enum of bench scenarios — see `docs/bench/PLAYBOOK.md` § 2.
44#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
45#[serde(rename_all = "snake_case")]
46pub enum Scenario {
47    /// `--concurrency K` — K workers in tight send→wait loop. Headline: throughput.
48    ClosedLoop,
49    /// `--request-rate R` — Poisson arrivals. Headline: goodput.
50    OpenLoop,
51    /// 1024-token shared prefix, burst arrival. Headline: cache hit rate.
52    SharedPrefix,
53    /// `ferrum bench` single-user batch=1. Headline: TTFT + TPOT.
54    Cli,
55}
56
57/// SLO thresholds applied when computing goodput. All in milliseconds.
58///
59/// A request is "good" iff `ttft ≤ ttft_p99_ms` AND `tpot ≤ tpot_p99_ms`
60/// AND `e2e ≤ e2e_p99_ms`. The `_p99_` naming is convention only — the
61/// comparison is per-request, not against the distribution.
62#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
63pub struct Slo {
64    pub ttft_p99_ms: f64,
65    pub tpot_p99_ms: f64,
66    pub e2e_p99_ms: f64,
67}
68
69impl Default for Slo {
70    fn default() -> Self {
71        // Production defaults from PLAYBOOK § 4.B.
72        Self {
73            ttft_p99_ms: 500.0,
74            tpot_p99_ms: 50.0,
75            e2e_p99_ms: 30_000.0,
76        }
77    }
78}
79
80/// Four percentile points for a single latency metric. Each point is a
81/// `ScalarStats` aggregate across `n_repeats` runs.
82#[derive(Debug, Clone, Serialize, Deserialize)]
83pub struct MetricSet {
84    pub p50: PercentileStats,
85    pub p75: PercentileStats,
86    pub p95: PercentileStats,
87    pub p99: PercentileStats,
88}
89
90/// One bench cell — `n_repeats` independent runs aggregated.
91///
92/// Field order matters for `env_hash` determinism — do not reorder.
93#[derive(Debug, Clone, Serialize, Deserialize)]
94pub struct BenchReport {
95    pub model: String,
96    pub backend: String,
97    pub scenario: Scenario,
98
99    /// Set iff `scenario` is `ClosedLoop` (or `SharedPrefix` closed variant).
100    #[serde(default, skip_serializing_if = "Option::is_none")]
101    pub concurrency: Option<u32>,
102    /// Set iff `scenario` is `OpenLoop` (or `SharedPrefix` open variant).
103    #[serde(default, skip_serializing_if = "Option::is_none")]
104    pub request_rate: Option<f64>,
105
106    pub n_prompt: u32,
107    pub n_gen: u32,
108    pub n_repeats: u32,
109    pub n_requests_per_run: u32,
110    pub warmup_requests: u32,
111
112    pub ttft_ms: MetricSet,
113    pub tpot_ms: MetricSet,
114    pub itl_ms: MetricSet,
115    pub e2e_ms: MetricSet,
116
117    pub output_throughput_tps: ScalarStats,
118    pub total_throughput_tps: ScalarStats,
119    pub request_throughput_rps: ScalarStats,
120    pub goodput_rps: ScalarStats,
121
122    pub slo: Slo,
123
124    pub completed_per_run: Vec<u32>,
125    pub errored_per_run: Vec<u32>,
126
127    pub env: Env,
128    pub env_hash: EnvHash,
129}
130
/// Measurements of a single request — the raw input that
/// [`compute_metrics`] aggregates.
#[derive(Clone, Debug)]
pub struct RequestRecord {
    /// Whether the request succeeded. Failed requests are counted as
    /// errored and excluded from latency and throughput aggregation.
    pub success: bool,
    /// Time to first token, in milliseconds.
    pub ttft_ms: f64,
    /// End-to-end latency, in milliseconds.
    pub e2e_ms: f64,
    /// Number of input tokens.
    pub input_tokens: u32,
    /// Number of output tokens.
    pub output_tokens: u32,
    /// Inter-token latencies of this request's decode steps, so the
    /// expected length is `output_tokens - 1`. Left empty when ITL was
    /// not measured.
    pub itl_ms: Vec<f64>,
}
143
144impl RequestRecord {
145    /// Per-request TPOT in ms, or `None` if `output_tokens < 2`.
146    pub fn tpot_ms(&self) -> Option<f64> {
147        if self.output_tokens < 2 {
148            return None;
149        }
150        Some((self.e2e_ms - self.ttft_ms) / (self.output_tokens - 1) as f64)
151    }
152
153    /// True if all three SLO thresholds are met (TPOT is treated as met
154    /// when undefined — single-token responses don't have meaningful TPOT).
155    pub fn meets_slo(&self, slo: &Slo) -> bool {
156        if !self.success {
157            return false;
158        }
159        let ttft_ok = self.ttft_ms <= slo.ttft_p99_ms;
160        let e2e_ok = self.e2e_ms <= slo.e2e_p99_ms;
161        let tpot_ok = self.tpot_ms().map(|t| t <= slo.tpot_p99_ms).unwrap_or(true);
162        ttft_ok && e2e_ok && tpot_ok
163    }
164}
165
166/// One independent run of the bench workload.
167#[derive(Debug, Clone)]
168pub struct RunRecord {
169    pub records: Vec<RequestRecord>,
170    /// Wall-clock duration of the run, in seconds. Used as the denominator
171    /// for throughput / goodput.
172    pub duration_s: f64,
173}
174
175impl RunRecord {
176    pub fn n_completed(&self) -> u32 {
177        self.records.iter().filter(|r| r.success).count() as u32
178    }
179    pub fn n_errored(&self) -> u32 {
180        self.records.iter().filter(|r| !r.success).count() as u32
181    }
182}
183
184/// Aggregate `n_repeats` independent runs into one [`BenchReport`].
185///
186/// The aggregation is two-level: within each run we compute the
187/// per-request percentile distribution (p50/p75/p95/p99); across runs
188/// we compute the mean + sample stddev + Student-t 95% CI half-width
189/// of those per-run percentile values.
190///
191/// # Panics
192///
193/// Panics if `runs.is_empty()`.
194#[allow(clippy::too_many_arguments)]
195pub fn compute_metrics(
196    model: String,
197    backend: String,
198    scenario: Scenario,
199    concurrency: Option<u32>,
200    request_rate: Option<f64>,
201    n_prompt: u32,
202    n_gen: u32,
203    warmup_requests: u32,
204    slo: Slo,
205    runs: Vec<RunRecord>,
206    env: Env,
207) -> BenchReport {
208    assert!(!runs.is_empty(), "compute_metrics: n_repeats must be ≥ 1");
209    let n_repeats = runs.len() as u32;
210    let n_requests_per_run = runs[0].records.len() as u32;
211
212    let mut ttft_p50 = Vec::with_capacity(runs.len());
213    let mut ttft_p75 = Vec::with_capacity(runs.len());
214    let mut ttft_p95 = Vec::with_capacity(runs.len());
215    let mut ttft_p99 = Vec::with_capacity(runs.len());
216    let mut tpot_p50 = Vec::with_capacity(runs.len());
217    let mut tpot_p75 = Vec::with_capacity(runs.len());
218    let mut tpot_p95 = Vec::with_capacity(runs.len());
219    let mut tpot_p99 = Vec::with_capacity(runs.len());
220    let mut itl_p50 = Vec::with_capacity(runs.len());
221    let mut itl_p75 = Vec::with_capacity(runs.len());
222    let mut itl_p95 = Vec::with_capacity(runs.len());
223    let mut itl_p99 = Vec::with_capacity(runs.len());
224    let mut e2e_p50 = Vec::with_capacity(runs.len());
225    let mut e2e_p75 = Vec::with_capacity(runs.len());
226    let mut e2e_p95 = Vec::with_capacity(runs.len());
227    let mut e2e_p99 = Vec::with_capacity(runs.len());
228
229    let mut output_thr = Vec::with_capacity(runs.len());
230    let mut total_thr = Vec::with_capacity(runs.len());
231    let mut req_thr = Vec::with_capacity(runs.len());
232    let mut good_thr = Vec::with_capacity(runs.len());
233
234    let mut completed_per_run = Vec::with_capacity(runs.len());
235    let mut errored_per_run = Vec::with_capacity(runs.len());
236
237    for run in &runs {
238        let success: Vec<&RequestRecord> = run.records.iter().filter(|r| r.success).collect();
239        completed_per_run.push(success.len() as u32);
240        errored_per_run.push((run.records.len() - success.len()) as u32);
241
242        let ttfts: Vec<f64> = success.iter().map(|r| r.ttft_ms).collect();
243        let tpots: Vec<f64> = success.iter().filter_map(|r| r.tpot_ms()).collect();
244        let e2es: Vec<f64> = success.iter().map(|r| r.e2e_ms).collect();
245        let itls: Vec<f64> = success
246            .iter()
247            .flat_map(|r| r.itl_ms.iter().copied())
248            .collect();
249
250        ttft_p50.push(percentile(&ttfts, 0.50));
251        ttft_p75.push(percentile(&ttfts, 0.75));
252        ttft_p95.push(percentile(&ttfts, 0.95));
253        ttft_p99.push(percentile(&ttfts, 0.99));
254        tpot_p50.push(percentile(&tpots, 0.50));
255        tpot_p75.push(percentile(&tpots, 0.75));
256        tpot_p95.push(percentile(&tpots, 0.95));
257        tpot_p99.push(percentile(&tpots, 0.99));
258        itl_p50.push(percentile(&itls, 0.50));
259        itl_p75.push(percentile(&itls, 0.75));
260        itl_p95.push(percentile(&itls, 0.95));
261        itl_p99.push(percentile(&itls, 0.99));
262        e2e_p50.push(percentile(&e2es, 0.50));
263        e2e_p75.push(percentile(&e2es, 0.75));
264        e2e_p95.push(percentile(&e2es, 0.95));
265        e2e_p99.push(percentile(&e2es, 0.99));
266
267        let total_in: u64 = success.iter().map(|r| r.input_tokens as u64).sum();
268        let total_out: u64 = success.iter().map(|r| r.output_tokens as u64).sum();
269        let dur = run.duration_s.max(f64::EPSILON);
270        output_thr.push(total_out as f64 / dur);
271        total_thr.push((total_in + total_out) as f64 / dur);
272        req_thr.push(success.len() as f64 / dur);
273
274        let good = success.iter().filter(|r| r.meets_slo(&slo)).count();
275        good_thr.push(good as f64 / dur);
276    }
277
278    let env_hash = env.hash();
279    BenchReport {
280        model,
281        backend,
282        scenario,
283        concurrency,
284        request_rate,
285        n_prompt,
286        n_gen,
287        n_repeats,
288        n_requests_per_run,
289        warmup_requests,
290        ttft_ms: MetricSet {
291            p50: ScalarStats::from_samples(&ttft_p50),
292            p75: ScalarStats::from_samples(&ttft_p75),
293            p95: ScalarStats::from_samples(&ttft_p95),
294            p99: ScalarStats::from_samples(&ttft_p99),
295        },
296        tpot_ms: MetricSet {
297            p50: ScalarStats::from_samples(&tpot_p50),
298            p75: ScalarStats::from_samples(&tpot_p75),
299            p95: ScalarStats::from_samples(&tpot_p95),
300            p99: ScalarStats::from_samples(&tpot_p99),
301        },
302        itl_ms: MetricSet {
303            p50: ScalarStats::from_samples(&itl_p50),
304            p75: ScalarStats::from_samples(&itl_p75),
305            p95: ScalarStats::from_samples(&itl_p95),
306            p99: ScalarStats::from_samples(&itl_p99),
307        },
308        e2e_ms: MetricSet {
309            p50: ScalarStats::from_samples(&e2e_p50),
310            p75: ScalarStats::from_samples(&e2e_p75),
311            p95: ScalarStats::from_samples(&e2e_p95),
312            p99: ScalarStats::from_samples(&e2e_p99),
313        },
314        output_throughput_tps: ScalarStats::from_samples(&output_thr),
315        total_throughput_tps: ScalarStats::from_samples(&total_thr),
316        request_throughput_rps: ScalarStats::from_samples(&req_thr),
317        goodput_rps: ScalarStats::from_samples(&good_thr),
318        slo,
319        completed_per_run,
320        errored_per_run,
321        env,
322        env_hash,
323    }
324}
325
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a request record with no ITL samples.
    fn req(success: bool, ttft: f64, e2e: f64, in_tok: u32, out_tok: u32) -> RequestRecord {
        RequestRecord {
            success,
            ttft_ms: ttft,
            e2e_ms: e2e,
            input_tokens: in_tok,
            output_tokens: out_tok,
            itl_ms: vec![],
        }
    }

    /// Wraps `records` into a run of the given wall-clock duration.
    fn make_run(records: Vec<RequestRecord>, duration_s: f64) -> RunRecord {
        RunRecord {
            records,
            duration_s,
        }
    }

    #[test]
    fn tpot_undefined_for_short_response() {
        // Zero tokens: no decode interval at all.
        let r = req(true, 100.0, 100.0, 5, 0);
        assert_eq!(r.tpot_ms(), None);
        let r = req(true, 100.0, 100.0, 5, 1);
        assert_eq!(r.tpot_ms(), None);
        let r = req(true, 100.0, 200.0, 5, 2);
        assert_eq!(r.tpot_ms(), Some(100.0));
    }

    #[test]
    fn slo_short_response_treated_as_tpot_ok() {
        let slo = Slo::default();
        // 1-token response: TPOT N/A, must not fail SLO on TPOT.
        let r = req(true, 100.0, 200.0, 5, 1);
        assert!(r.meets_slo(&slo));
    }

    #[test]
    fn slo_failure_modes() {
        let slo = Slo::default();
        // TTFT too high.
        assert!(!req(true, 1000.0, 1100.0, 5, 10).meets_slo(&slo));
        // E2E too high.
        assert!(!req(true, 100.0, 40_000.0, 5, 10).meets_slo(&slo));
        // TPOT too high: (1100 - 100) / 9 ≈ 111 ms > 50 ms, while TTFT and
        // E2E are both within their thresholds.
        assert!(!req(true, 100.0, 1100.0, 5, 10).meets_slo(&slo));
        // Errored.
        assert!(!req(false, 100.0, 200.0, 5, 10).meets_slo(&slo));
        // Good.
        assert!(req(true, 100.0, 200.0, 5, 10).meets_slo(&slo));
    }

    #[test]
    fn aggregate_three_repeats() {
        // Three identical runs of 4 requests each. All meet SLO.
        let mk_run = || {
            make_run(
                vec![
                    req(true, 100.0, 200.0, 10, 10),
                    req(true, 120.0, 240.0, 10, 10),
                    req(true, 140.0, 280.0, 10, 10),
                    req(true, 160.0, 320.0, 10, 10),
                ],
                10.0,
            )
        };
        let report = compute_metrics(
            "test".into(),
            "cpu".into(),
            Scenario::ClosedLoop,
            Some(4),
            None,
            10,
            10,
            0,
            Slo::default(),
            vec![mk_run(), mk_run(), mk_run()],
            Env::default(),
        );
        assert_eq!(report.n_repeats, 3);
        assert_eq!(report.n_requests_per_run, 4);
        // Every request succeeded in every run.
        assert_eq!(report.completed_per_run, vec![4, 4, 4]);
        assert_eq!(report.errored_per_run, vec![0, 0, 0]);
        // All three runs identical → stddev = 0, ci95 = 0.
        assert_eq!(report.ttft_ms.p50.stddev, 0.0);
        // Mean p50 of [100, 120, 140, 160] = 130 (linear interp at q=0.5 of 4 elems).
        assert!((report.ttft_ms.p50.mean - 130.0).abs() < 1e-9);
        // Output throughput: 40 tokens / 10s = 4 tps.
        assert!((report.output_throughput_tps.mean - 4.0).abs() < 1e-9);
        // Request throughput: 4 req / 10s = 0.4 rps.
        assert!((report.request_throughput_rps.mean - 0.4).abs() < 1e-9);
        // Goodput: all 4 meet SLO → 0.4 rps.
        assert!((report.goodput_rps.mean - 0.4).abs() < 1e-9);
        // env_hash format check.
        assert!(report.env_hash.as_str().starts_with("sha256:"));
    }

    #[test]
    fn goodput_excludes_slo_violators() {
        let run = make_run(
            vec![
                req(true, 100.0, 200.0, 10, 10),    // good
                req(true, 1000.0, 1100.0, 10, 10),  // TTFT violator
                req(true, 100.0, 40_000.0, 10, 10), // E2E violator
                req(false, 100.0, 200.0, 10, 10),   // errored
            ],
            10.0,
        );
        let report = compute_metrics(
            "test".into(),
            "cpu".into(),
            Scenario::OpenLoop,
            None,
            Some(10.0),
            10,
            10,
            0,
            Slo::default(),
            vec![run],
            Env::default(),
        );
        // Per-run bookkeeping: 3 succeeded, 1 errored.
        assert_eq!(report.completed_per_run, vec![3]);
        assert_eq!(report.errored_per_run, vec![1]);
        // Request throughput: 3 successful / 10s = 0.3
        assert!((report.request_throughput_rps.mean - 0.3).abs() < 1e-9);
        // Goodput: 1 of 4 = 0.1 (errored excluded; both SLO violators excluded)
        assert!((report.goodput_rps.mean - 0.1).abs() < 1e-9);
    }

    #[test]
    fn json_round_trip() {
        let run = make_run(
            vec![
                req(true, 100.0, 200.0, 10, 10),
                req(true, 120.0, 240.0, 10, 10),
            ],
            5.0,
        );
        let report = compute_metrics(
            "qwen3:0.6b".into(),
            "metal".into(),
            Scenario::ClosedLoop,
            Some(2),
            None,
            256,
            128,
            10,
            Slo::default(),
            vec![run.clone(), run.clone(), run],
            Env::default(),
        );
        let json = serde_json::to_string_pretty(&report).unwrap();
        let parsed: BenchReport = serde_json::from_str(&json).unwrap();
        assert_eq!(parsed.model, "qwen3:0.6b");
        assert_eq!(parsed.backend, "metal");
        assert_eq!(parsed.scenario, Scenario::ClosedLoop);
        assert_eq!(parsed.n_repeats, 3);
        assert_eq!(parsed.concurrency, Some(2));
        assert_eq!(parsed.request_rate, None);
    }
}
ferrum_bench_core/lib.rs

ferrum_bench_core/
lib.rs