Skip to main content

rlx_voxtral/
audio.rs

1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! Log-mel frontend — Whisper-style features at 16 kHz via HF preprocessor.
17
18use crate::config::VoxtralAudioConfig;
19use anyhow::{Context, Result, bail, ensure};
20pub use rlx_whisper::{
21    N_SAMPLES, SAMPLE_RATE, SpeechSegment, load_wav_mono_f32, parse_wav_mono_f32,
22    pcm_segments_by_vad,
23};
24use std::path::{Path, PathBuf};
25use std::process::Command;
26
/// Number of mel frames in one default 30 s chunk.
///
/// Mirrors `nb_max_frames = 3000` from `preprocessor_config.json`.
pub const N_FRAMES: usize = 3000;
29
/// Mel features stored as one flat `f32` buffer together with their dimensions.
///
/// NOTE(review): the element order of `data` (mel-major vs. frame-major) is not
/// established in this file — confirm against the consumer before indexing.
#[derive(Debug, Clone)]
pub struct MelSpectrogram {
    /// Number of mel bins.
    pub n_mels: usize,
    /// Number of time frames.
    pub n_frames: usize,
    /// Flattened feature values; expected to hold `n_mels * n_frames` entries.
    pub data: Vec<f32>,
}
36
37fn script_path() -> PathBuf {
38    PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("scripts/mel_preprocess.py")
39}
40
/// Name or path of the Python interpreter used to run the preprocessor.
///
/// Reads the `RLX_VOXTRAL_PYTHON` environment variable and falls back to
/// `"python3"` when it is unset or not valid Unicode.
fn python_bin() -> String {
    match std::env::var("RLX_VOXTRAL_PYTHON") {
        Ok(interpreter) => interpreter,
        Err(_) => String::from("python3"),
    }
}
44
45/// Build mel + transcription prompt ids using the HF Whisper frontend (requires Python + transformers).
46pub fn pcm_to_mel_and_prompt(
47    model_dir: &Path,
48    wav: Option<&Path>,
49    language: Option<&str>,
50) -> Result<(MelSpectrogram, Vec<u32>)> {
51    let script = script_path();
52    ensure!(
53        script.is_file(),
54        "missing mel preprocessor script at {}",
55        script.display()
56    );
57    let mut cmd = Command::new(python_bin());
58    cmd.arg(&script)
59        .arg("--model-dir")
60        .arg(model_dir)
61        .arg("--json");
62    if let Some(lang) = language {
63        cmd.arg("--language").arg(lang);
64    }
65    if let Some(wav) = wav {
66        cmd.arg("--wav").arg(wav);
67    }
68    let out = cmd
69        .output()
70        .with_context(|| format!("run {}", script.display()))?;
71    if !out.status.success() {
72        bail!(
73            "mel preprocess failed:\n{}",
74            String::from_utf8_lossy(&out.stderr)
75        );
76    }
77    let payload: serde_json::Value =
78        serde_json::from_slice(&out.stdout).context("parse mel preprocess json")?;
79    let n_mels = payload["n_mels"].as_u64().context("n_mels")? as usize;
80    let n_frames = payload["n_frames"].as_u64().context("n_frames")? as usize;
81    let mel: Vec<f32> = payload["mel"]
82        .as_array()
83        .context("mel")?
84        .iter()
85        .map(|v| v.as_f64().unwrap_or(0.0) as f32)
86        .collect();
87    let tokens: Vec<u32> = payload["tokens"]
88        .as_array()
89        .context("tokens")?
90        .iter()
91        .map(|v| v.as_u64().unwrap_or(0) as u32)
92        .collect();
93    Ok((
94        MelSpectrogram {
95            n_mels,
96            n_frames,
97            data: mel,
98        },
99        tokens,
100    ))
101}
102
103pub fn pcm_to_mel(_cfg: &VoxtralAudioConfig, _pcm: &[f32]) -> Result<MelSpectrogram> {
104    bail!("use pcm_to_mel_and_prompt(model_dir, wav, language) for HF-compatible mel features")
105}
106
107pub fn mel_from_flat(n_mels: usize, n_frames: usize, data: Vec<f32>) -> Result<MelSpectrogram> {
108    ensure!(
109        data.len() == n_mels * n_frames,
110        "mel flat len {} != {n_mels}×{n_frames}",
111        data.len()
112    );
113    Ok(MelSpectrogram {
114        n_mels,
115        n_frames,
116        data,
117    })
118}