Skip to main content

rlx_whisper/
audio.rs

1// RLX — versatile ML compiler + runtime.
2// Copyright (C) 2026 Eugene Hauptmann, Nataliya Kosmyna.
3//
4// This program is free software: you can redistribute it and/or modify
5// it under the terms of the GNU General Public License as published by
6// the Free Software Foundation, version 3.
7//
8// This program is distributed in the hope that it will be useful,
9// but WITHOUT ANY WARRANTY; without even the implied warranty of
10// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11// GNU General Public License for more details.
12//
13// You should have received a copy of the GNU General Public License
14// along with this program. If not, see <https://www.gnu.org/licenses/>.
15
16//! Minimal audio helpers for Whisper.
17//!
18//! These are intentionally lightweight (no external WAV/STFT deps). They are
19//! sufficient for compiling/running the CLI, and can be upgraded later with a
20//! higher-fidelity mel frontend.
21
22use crate::config::WhisperConfig;
23use anyhow::{Result, anyhow, bail};
24use std::fs;
25use std::path::Path;
26
/// Sample rate, in Hz, that WAV input must have; the WAV parser in this module
/// rejects files recorded at any other rate.
pub const SAMPLE_RATE: usize = 16_000;
/// Number of samples in a 30-second window at [`SAMPLE_RATE`].
pub const N_SAMPLES: usize = SAMPLE_RATE * 30;
/// Number of frames in every spectrogram returned by `pcm_to_mel`.
pub const N_FRAMES: usize = 3_000;
30
/// A mel spectrogram stored as one flat buffer plus its dimensions.
#[derive(Debug, Clone)]
pub struct MelSpectrogram {
    /// Number of mel bins (rows).
    pub n_mels: usize,
    /// Number of time frames (columns).
    pub n_frames: usize,
    /// Row-major `[1, n_mels, n_frames]` as f32.
    ///
    /// `pcm_to_mel` allocates exactly `n_mels * n_frames` elements here.
    pub data: Vec<f32>,
}
38
/// A span of PCM samples, expressed as sample indices.
///
/// `pcm_segments_by_vad` emits `start: 0, end: pcm.len()` for a whole buffer, so
/// the span is half-open: `start` is included, `end` is one past the last sample.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SpeechSegment {
    /// Index of the first sample in the segment.
    pub start: usize,
    /// Index one past the last sample in the segment.
    pub end: usize,
}
44
/// Settings for an energy-based voice-activity detector.
///
/// NOTE(review): nothing visible in this file reads these fields yet
/// (`pcm_segments_by_vad` ignores its `EnergyVad` argument), so the field
/// descriptions below are inferred from the names — confirm before relying on them.
///
/// `Default` yields `threshold == 0.0` and `min_len_samples == 0`.
#[derive(Debug, Clone, Default)]
pub struct EnergyVad {
    /// Presumably the energy level above which audio counts as speech — TODO confirm.
    pub threshold: f32,
    /// Presumably the shortest segment, in samples, worth reporting — TODO confirm.
    pub min_len_samples: usize,
}
50
51pub fn pcm_segments_by_vad(_vad: &EnergyVad, pcm: &[f32]) -> Vec<SpeechSegment> {
52    if pcm.is_empty() {
53        return Vec::new();
54    }
55    vec![SpeechSegment {
56        start: 0,
57        end: pcm.len(),
58    }]
59}
60
61pub fn pcm_to_mel(cfg: &WhisperConfig, _pcm: &[f32]) -> MelSpectrogram {
62    let n_mels = cfg.num_mel_bins;
63    let n_frames = N_FRAMES;
64    MelSpectrogram {
65        n_mels,
66        n_frames,
67        data: vec![0.0f32; n_mels * n_frames],
68    }
69}
70
71pub fn load_wav_mono_f32(path: &Path) -> Result<Vec<f32>> {
72    let bytes = fs::read(path).map_err(|e| anyhow!("read wav {path:?}: {e}"))?;
73    parse_wav_mono_f32(&bytes)
74}
75
76pub fn parse_wav_mono_f32(bytes: &[u8]) -> Result<Vec<f32>> {
77    // Very small RIFF/WAVE PCM parser (16-bit mono).
78    if bytes.len() < 44 {
79        bail!("wav too small");
80    }
81    if &bytes[0..4] != b"RIFF" || &bytes[8..12] != b"WAVE" {
82        bail!("not a RIFF/WAVE file");
83    }
84    let mut off = 12usize;
85    let mut fmt: Option<(u16, u16, u32, u16)> = None; // (audio_format, channels, sample_rate, bits_per_sample)
86    let mut data_chunk: Option<&[u8]> = None;
87    while off + 8 <= bytes.len() {
88        let tag = &bytes[off..off + 4];
89        let len = u32::from_le_bytes(bytes[off + 4..off + 8].try_into().unwrap()) as usize;
90        off += 8;
91        if off + len > bytes.len() {
92            break;
93        }
94        match tag {
95            b"fmt " => {
96                if len < 16 {
97                    bail!("wav fmt chunk too small");
98                }
99                let audio_format = u16::from_le_bytes(bytes[off..off + 2].try_into().unwrap());
100                let channels = u16::from_le_bytes(bytes[off + 2..off + 4].try_into().unwrap());
101                let sample_rate = u32::from_le_bytes(bytes[off + 4..off + 8].try_into().unwrap());
102                let bits_per_sample =
103                    u16::from_le_bytes(bytes[off + 14..off + 16].try_into().unwrap());
104                fmt = Some((audio_format, channels, sample_rate, bits_per_sample));
105            }
106            b"data" => {
107                data_chunk = Some(&bytes[off..off + len]);
108            }
109            _ => {}
110        }
111        off += (len + 1) & !1; // word-align
112        if fmt.is_some() && data_chunk.is_some() {
113            break;
114        }
115    }
116    let (audio_format, channels, sr, bps) = fmt.ok_or_else(|| anyhow!("wav missing fmt chunk"))?;
117    if audio_format != 1 {
118        bail!("wav: only PCM supported (format={audio_format})");
119    }
120    if channels != 1 {
121        bail!("wav: expected mono, got {channels} channels");
122    }
123    if sr as usize != SAMPLE_RATE {
124        bail!("wav: expected {SAMPLE_RATE} Hz, got {sr}");
125    }
126    if bps != 16 {
127        bail!("wav: expected 16-bit PCM, got {bps}");
128    }
129    let data = data_chunk.ok_or_else(|| anyhow!("wav missing data chunk"))?;
130    if data.len() % 2 != 0 {
131        bail!("wav data chunk not aligned");
132    }
133    let mut out = Vec::with_capacity(data.len() / 2);
134    for i in (0..data.len()).step_by(2) {
135        let s = i16::from_le_bytes([data[i], data[i + 1]]) as f32 / 32768.0;
136        out.push(s);
137    }
138    Ok(out)
139}