1use crate::config::WhisperConfig;
23use anyhow::{Result, anyhow, bail};
24use std::fs;
25use std::path::Path;
26
/// Sample rate, in Hz, that `parse_wav_mono_f32` requires of its input.
pub const SAMPLE_RATE: usize = 16_000;

/// Number of samples in 30 seconds of audio at [`SAMPLE_RATE`].
pub const N_SAMPLES: usize = SAMPLE_RATE * 30;

/// Number of frames in every spectrogram produced by `pcm_to_mel`.
pub const N_FRAMES: usize = 3_000;
30
/// A mel spectrogram stored as one flat buffer of `f32` values.
#[derive(Clone, Debug)]
pub struct MelSpectrogram {
    /// Number of mel bins.
    pub n_mels: usize,
    /// Number of frames.
    pub n_frames: usize,
    /// Flat value buffer; `pcm_to_mel` allocates `n_mels * n_frames` elements.
    /// NOTE(review): the element ordering (bin-major vs. frame-major) is not
    /// established by the code visible here — confirm against the consumer.
    pub data: Vec<f32>,
}
38
/// A span of PCM audio, expressed as a pair of sample indices.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct SpeechSegment {
    /// Index of the first sample of the segment.
    pub start: usize,
    /// End index of the segment. `pcm_segments_by_vad` sets this to the buffer
    /// length, which suggests it is exclusive — confirm against consumers.
    pub end: usize,
}
44
/// Parameters for an energy-based voice activity detector.
///
/// `Default` yields a threshold of `0.0` and a minimum length of `0`.
/// NOTE(review): no code visible in this file reads these fields yet, so their
/// exact semantics below are assumptions drawn from the names — confirm.
#[derive(Clone, Debug, Default)]
pub struct EnergyVad {
    /// Presumably the energy level above which audio is treated as speech.
    pub threshold: f32,
    /// Presumably the shortest segment, in samples, that should be reported.
    pub min_len_samples: usize,
}
50
51pub fn pcm_segments_by_vad(_vad: &EnergyVad, pcm: &[f32]) -> Vec<SpeechSegment> {
52 if pcm.is_empty() {
53 return Vec::new();
54 }
55 vec![SpeechSegment {
56 start: 0,
57 end: pcm.len(),
58 }]
59}
60
61pub fn pcm_to_mel(cfg: &WhisperConfig, _pcm: &[f32]) -> MelSpectrogram {
62 let n_mels = cfg.num_mel_bins;
63 let n_frames = N_FRAMES;
64 MelSpectrogram {
65 n_mels,
66 n_frames,
67 data: vec![0.0f32; n_mels * n_frames],
68 }
69}
70
71pub fn load_wav_mono_f32(path: &Path) -> Result<Vec<f32>> {
72 let bytes = fs::read(path).map_err(|e| anyhow!("read wav {path:?}: {e}"))?;
73 parse_wav_mono_f32(&bytes)
74}
75
76pub fn parse_wav_mono_f32(bytes: &[u8]) -> Result<Vec<f32>> {
77 if bytes.len() < 44 {
79 bail!("wav too small");
80 }
81 if &bytes[0..4] != b"RIFF" || &bytes[8..12] != b"WAVE" {
82 bail!("not a RIFF/WAVE file");
83 }
84 let mut off = 12usize;
85 let mut fmt: Option<(u16, u16, u32, u16)> = None; let mut data_chunk: Option<&[u8]> = None;
87 while off + 8 <= bytes.len() {
88 let tag = &bytes[off..off + 4];
89 let len = u32::from_le_bytes(bytes[off + 4..off + 8].try_into().unwrap()) as usize;
90 off += 8;
91 if off + len > bytes.len() {
92 break;
93 }
94 match tag {
95 b"fmt " => {
96 if len < 16 {
97 bail!("wav fmt chunk too small");
98 }
99 let audio_format = u16::from_le_bytes(bytes[off..off + 2].try_into().unwrap());
100 let channels = u16::from_le_bytes(bytes[off + 2..off + 4].try_into().unwrap());
101 let sample_rate = u32::from_le_bytes(bytes[off + 4..off + 8].try_into().unwrap());
102 let bits_per_sample =
103 u16::from_le_bytes(bytes[off + 14..off + 16].try_into().unwrap());
104 fmt = Some((audio_format, channels, sample_rate, bits_per_sample));
105 }
106 b"data" => {
107 data_chunk = Some(&bytes[off..off + len]);
108 }
109 _ => {}
110 }
111 off += (len + 1) & !1; if fmt.is_some() && data_chunk.is_some() {
113 break;
114 }
115 }
116 let (audio_format, channels, sr, bps) = fmt.ok_or_else(|| anyhow!("wav missing fmt chunk"))?;
117 if audio_format != 1 {
118 bail!("wav: only PCM supported (format={audio_format})");
119 }
120 if channels != 1 {
121 bail!("wav: expected mono, got {channels} channels");
122 }
123 if sr as usize != SAMPLE_RATE {
124 bail!("wav: expected {SAMPLE_RATE} Hz, got {sr}");
125 }
126 if bps != 16 {
127 bail!("wav: expected 16-bit PCM, got {bps}");
128 }
129 let data = data_chunk.ok_or_else(|| anyhow!("wav missing data chunk"))?;
130 if data.len() % 2 != 0 {
131 bail!("wav data chunk not aligned");
132 }
133 let mut out = Vec::with_capacity(data.len() / 2);
134 for i in (0..data.len()).step_by(2) {
135 let s = i16::from_le_bytes([data[i], data[i + 1]]) as f32 / 32768.0;
136 out.push(s);
137 }
138 Ok(out)
139}