rlx_voxtral_tts/
bench.rs

// RLX — versatile ML compiler + runtime.
// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 3.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

//! Stage timing for native TTS (LM prefill / decode, acoustic, codec).
17
use rlx_runtime::Device;
19
20/// Per-stage milliseconds from one [`super::backbone::NativeTtsEngine::synthesize_profiled`] run.
21#[derive(Debug, Clone, Copy)]
22pub struct VoxtralTtsBenchReport {
23    pub device: Device,
24    pub eager_lm: bool,
25    pub eager_acoustic: bool,
26    /// Token + voice embedding assembly (host).
27    pub embed_ms: f64,
28    /// First LM forward (prompt prefill).
29    pub lm_prefill_ms: f64,
30    /// Sum of per-frame LM decode forwards.
31    pub lm_decode_ms: f64,
32    /// Sum of per-frame acoustic (flow-matching) work.
33    pub acoustic_ms: f64,
34    /// Codec decode of all frames.
35    pub codec_ms: f64,
36    /// `lm_prefill + lm_decode + acoustic + codec` (excludes embed).
37    pub synthesis_ms: f64,
38    pub audio_frames: usize,
39    pub prompt_tokens: usize,
40    pub pcm_samples: usize,
41    /// Flow-matching Euler steps per frame (from model config).
42    pub euler_steps_per_frame: usize,
43    /// Compiled acoustic velocity `run()` count (= frames × euler × 2 CFG passes).
44    pub acoustic_velocity_runs: u64,
45}
46
47impl VoxtralTtsBenchReport {
48    pub fn sample_rate_hz() -> u32 {
49        24_000
50    }
51
52    pub fn audio_duration_ms(&self) -> f64 {
53        if self.pcm_samples == 0 {
54            return 0.0;
55        }
56        self.pcm_samples as f64 / Self::sample_rate_hz() as f64 * 1000.0
57    }
58
59    pub fn rtf(&self) -> f64 {
60        let dur = self.audio_duration_ms();
61        if dur <= 0.0 {
62            return 0.0;
63        }
64        self.synthesis_ms / dur
65    }
66
67    pub fn lm_total_ms(&self) -> f64 {
68        self.lm_prefill_ms + self.lm_decode_ms
69    }
70
71    pub fn stage_share(&self, ms: f64) -> f64 {
72        let total = self.synthesis_ms;
73        if total <= 0.0 {
74            0.0
75        } else {
76            100.0 * ms / total
77        }
78    }
79
80    pub fn label(&self) -> String {
81        format!(
82            "lm={} acoustic={}",
83            if self.eager_lm { "eager" } else { "compiled" },
84            if self.eager_acoustic {
85                "eager"
86            } else {
87                "compiled"
88            }
89        )
90    }
91
92    pub fn print_line(&self) {
93        let dur = self.audio_duration_ms();
94        let rtf = self.rtf();
95        let lm = self.lm_total_ms();
96        println!(
97            "config={} device={:?} frames={} euler={} rtf={:.3} synthesis_ms={:.2} \
98             lm_prefill_ms={:.2} lm_decode_ms={:.2} acoustic_ms={:.2} codec_ms={:.2} \
99             lm_share={:.0}% acoustic_share={:.0}% codec_share={:.0}% \
100             lm_ms_per_frame={:.3} acoustic_ms_per_frame={:.3} velocity_runs={}",
101            self.label(),
102            self.device,
103            self.audio_frames,
104            self.euler_steps_per_frame,
105            rtf,
106            self.synthesis_ms,
107            self.lm_prefill_ms,
108            self.lm_decode_ms,
109            self.acoustic_ms,
110            self.codec_ms,
111            self.stage_share(lm),
112            self.stage_share(self.acoustic_ms),
113            self.stage_share(self.codec_ms),
114            if self.audio_frames > 0 {
115                lm / self.audio_frames as f64
116            } else {
117                0.0
118            },
119            if self.audio_frames > 0 {
120                self.acoustic_ms / self.audio_frames as f64
121            } else {
122                0.0
123            },
124            self.acoustic_velocity_runs,
125        );
126        println!(
127            "  audio_ms={:.2} prompt_tokens={} pcm_samples={} embed_ms={:.2}",
128            dur, self.prompt_tokens, self.pcm_samples, self.embed_ms,
129        );
130    }
131}
rlx_voxtral_tts/bench.rs

rlx_voxtral_tts/
bench.rs