1use crate::config::VoxtralAudioConfig;
19use anyhow::{Context, Result, bail, ensure};
20pub use rlx_whisper::{
21 N_SAMPLES, SAMPLE_RATE, SpeechSegment, load_wav_mono_f32, parse_wav_mono_f32,
22 pcm_segments_by_vad,
23};
24use std::path::{Path, PathBuf};
25use std::process::Command;
26
/// Fixed frame count used by this module.
///
/// NOTE(review): presumably the number of mel frames in one fixed-length
/// input window (compare the re-exported `N_SAMPLES`) — confirm against the
/// encoder before relying on this.
pub const N_FRAMES: usize = 3000;
29
/// A mel spectrogram held as one flat `f32` buffer plus its two dimensions.
///
/// `mel_from_flat` only builds a value when `data.len() == n_mels * n_frames`.
/// The element order inside `data` is not established in this file —
/// TODO confirm against the consumer before indexing into it.
#[derive(Clone, Debug)]
pub struct MelSpectrogram {
    /// First dimension of the spectrogram (mel bins, per the field name).
    pub n_mels: usize,
    /// Second dimension of the spectrogram (frames, per the field name).
    pub n_frames: usize,
    /// Flattened spectrogram values.
    pub data: Vec<f32>,
}
36
37fn script_path() -> PathBuf {
38 PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("scripts/mel_preprocess.py")
39}
40
/// Name or path of the Python interpreter used to run the preprocessor.
///
/// Returns the value of the `RLX_VOXTRAL_PYTHON` environment variable when it
/// is set to a non-blank, valid-Unicode string; otherwise returns `"python3"`.
fn python_bin() -> String {
    match std::env::var("RLX_VOXTRAL_PYTHON") {
        // A blank override would make the caller try to spawn "" and fail with
        // an opaque OS error, so treat it the same as an unset variable.
        Ok(bin) if !bin.trim().is_empty() => bin,
        _ => "python3".into(),
    }
}
44
45pub fn pcm_to_mel_and_prompt(
47 model_dir: &Path,
48 wav: Option<&Path>,
49 language: Option<&str>,
50) -> Result<(MelSpectrogram, Vec<u32>)> {
51 let script = script_path();
52 ensure!(
53 script.is_file(),
54 "missing mel preprocessor script at {}",
55 script.display()
56 );
57 let mut cmd = Command::new(python_bin());
58 cmd.arg(&script)
59 .arg("--model-dir")
60 .arg(model_dir)
61 .arg("--json");
62 if let Some(lang) = language {
63 cmd.arg("--language").arg(lang);
64 }
65 if let Some(wav) = wav {
66 cmd.arg("--wav").arg(wav);
67 }
68 let out = cmd
69 .output()
70 .with_context(|| format!("run {}", script.display()))?;
71 if !out.status.success() {
72 bail!(
73 "mel preprocess failed:\n{}",
74 String::from_utf8_lossy(&out.stderr)
75 );
76 }
77 let payload: serde_json::Value =
78 serde_json::from_slice(&out.stdout).context("parse mel preprocess json")?;
79 let n_mels = payload["n_mels"].as_u64().context("n_mels")? as usize;
80 let n_frames = payload["n_frames"].as_u64().context("n_frames")? as usize;
81 let mel: Vec<f32> = payload["mel"]
82 .as_array()
83 .context("mel")?
84 .iter()
85 .map(|v| v.as_f64().unwrap_or(0.0) as f32)
86 .collect();
87 let tokens: Vec<u32> = payload["tokens"]
88 .as_array()
89 .context("tokens")?
90 .iter()
91 .map(|v| v.as_u64().unwrap_or(0) as u32)
92 .collect();
93 Ok((
94 MelSpectrogram {
95 n_mels,
96 n_frames,
97 data: mel,
98 },
99 tokens,
100 ))
101}
102
103pub fn pcm_to_mel(_cfg: &VoxtralAudioConfig, _pcm: &[f32]) -> Result<MelSpectrogram> {
104 bail!("use pcm_to_mel_and_prompt(model_dir, wav, language) for HF-compatible mel features")
105}
106
107pub fn mel_from_flat(n_mels: usize, n_frames: usize, data: Vec<f32>) -> Result<MelSpectrogram> {
108 ensure!(
109 data.len() == n_mels * n_frames,
110 "mel flat len {} != {n_mels}×{n_frames}",
111 data.len()
112 );
113 Ok(MelSpectrogram {
114 n_mels,
115 n_frames,
116 data,
117 })
118}