#![cfg(any(feature = "stt", feature = "stt-candle"))]
use std::path::Path;
use super::{Result, SttError, TranscribeConfig};
/// Reads the audio file at `path` and decodes it to PCM bytes.
///
/// The file is loaded asynchronously in one read, then handed to
/// `decode_ogg_opus` on tokio's blocking pool so the CPU-bound decode does not
/// run on an async worker. The output sample rate is taken from
/// `cfg.target_sample_rate`.
///
/// # Errors
///
/// * The file-read error, converted through `?`, when the file cannot be read.
/// * [`SttError::Decode`] with a `decode join:` message when the blocking task
///   cannot be joined.
/// * Whatever error `decode_ogg_opus` returns, passed through unchanged.
pub(crate) async fn decode_to_pcm_mono(path: &Path, cfg: &TranscribeConfig) -> Result<Vec<u8>> {
    let target_rate = cfg.target_sample_rate;
    let bytes = tokio::fs::read(path).await?;

    let joined =
        tokio::task::spawn_blocking(move || decode_ogg_opus(&bytes, target_rate)).await;

    match joined {
        // The task ran to completion: forward its own success or failure as-is.
        Ok(decoded) => decoded,
        Err(e) => Err(SttError::Decode(format!("decode join: {e}"))),
    }
}
/// Decodes an in-memory Ogg/Opus stream into mono signed 16-bit little-endian PCM.
///
/// When `target_rate` is 8000, 12000, 16000 or 24000 the Opus decoder is asked
/// to produce that rate directly. Any other value is decoded at 48 kHz and then
/// linearly resampled to `target_rate`. Stereo streams are down-mixed to mono by
/// averaging the two channels. The leading pre-skip samples declared in the
/// `OpusHead` header are discarded, as RFC 7845 requires of decoders.
///
/// # Errors
///
/// * [`SttError::UnsupportedFormat`] when the data does not start with `OggS`,
///   when the first packet is not an `OpusHead` of at least 19 bytes, or when
///   the header declares more than two channels.
/// * [`SttError::Decode`] when the header declares zero channels, when an Ogg
///   packet cannot be read, or when the Opus decoder cannot be created or fails
///   on a packet.
fn decode_ogg_opus(bytes: &[u8], target_rate: u32) -> Result<Vec<u8>> {
    if !bytes.starts_with(b"OggS") {
        return Err(SttError::UnsupportedFormat(
            "expected ogg container (WA/TG voice notes); got something else".into(),
        ));
    }

    // `Cursor<&[u8]>` is already `Read + Seek`, so the input is parsed in place
    // instead of being copied into a fresh `Vec` first.
    let mut reader = ogg::PacketReader::new(std::io::Cursor::new(bytes));

    let head = reader
        .read_packet_expected()
        .map_err(|e| SttError::Decode(format!("ogg OpusHead: {e}")))?;
    if !head.data.starts_with(b"OpusHead") || head.data.len() < 19 {
        return Err(SttError::UnsupportedFormat(
            "ogg stream is not opus (missing OpusHead)".into(),
        ));
    }

    // OpusHead layout (RFC 7845 §5.1): byte 9 is the channel count and bytes
    // 10..12 hold the pre-skip as a little-endian u16 counted in 48 kHz samples.
    // The length check above guarantees these indices exist.
    let (decoder_channels, channel_count) = match head.data[9] {
        0 => return Err(SttError::Decode("OpusHead reports 0 channels".into())),
        1 => (opus_wave::Channels::Mono, 1usize),
        2 => (opus_wave::Channels::Stereo, 2usize),
        // More than two channels means a multistream layout, which a plain
        // mono/stereo decoder cannot decode; fail clearly instead of feeding it
        // packets it would misinterpret.
        n => {
            return Err(SttError::UnsupportedFormat(format!(
                "opus stream has {n} channels; only mono and stereo are supported"
            )))
        }
    };
    let pre_skip_48k = usize::from(u16::from_le_bytes([head.data[10], head.data[11]]));

    // The second packet is the OpusTags comment header; it must be present but
    // its contents are not needed.
    reader
        .read_packet_expected()
        .map_err(|e| SttError::Decode(format!("ogg OpusTags: {e}")))?;

    // Decode straight at the target rate when Opus supports it; otherwise fall
    // back to 48 kHz and resample afterwards.
    let (decoder_sr, decoder_rate_hz) = match target_rate {
        8000 => (opus_wave::SampleRate::Hz8000, 8000u32),
        12000 => (opus_wave::SampleRate::Hz12000, 12000),
        16000 => (opus_wave::SampleRate::Hz16000, 16000),
        24000 => (opus_wave::SampleRate::Hz24000, 24000),
        _ => (opus_wave::SampleRate::Hz48000, 48000),
    };

    let mut decoder = opus_wave::OpusDecoder::new(decoder_sr, decoder_channels)
        .map_err(|e| SttError::Decode(format!("opus decoder init: {e:?}")))?;

    // Room for 120 ms of audio per channel, the longest duration a single Opus
    // packet can carry.
    let max_frame_samples = (decoder_rate_hz as usize / 1000) * 120;
    let mut buf = vec![0.0f32; max_frame_samples * channel_count];
    let mut mono = Vec::<f32>::new();

    while let Some(packet) = reader
        .read_packet()
        .map_err(|e| SttError::Decode(format!("ogg packet: {e}")))?
    {
        let n = decoder
            .decode_float(
                Some(&packet.data),
                &mut buf,
                max_frame_samples as i32,
                false,
            )
            .map_err(|e| SttError::Decode(format!("opus decode: {e:?}")))?;
        // `n` counts samples per channel; `buf` holds them interleaved.
        let n = n as usize;
        if channel_count == 1 {
            mono.extend_from_slice(&buf[..n]);
        } else {
            mono.extend(
                buf[..n * channel_count]
                    .chunks_exact(channel_count)
                    .map(|frame| frame.iter().sum::<f32>() / channel_count as f32),
            );
        }
    }

    // Pre-skip is expressed at 48 kHz; rescale it to the rate we decoded at and
    // drop that many leading samples (decoder warm-up, not real audio).
    let pre_skip = pre_skip_48k * decoder_rate_hz as usize / 48_000;
    mono.drain(..pre_skip.min(mono.len()));

    let resampled = if decoder_rate_hz == target_rate {
        mono
    } else {
        resample_linear(&mono, decoder_rate_hz, target_rate)
    };
    f32_mono_to_s16le_bytes(&resampled)
}
/// Resamples `input` from `from_hz` to `to_hz` using linear interpolation.
///
/// The output holds `floor(input.len() * to_hz / from_hz)` samples. Each output
/// sample is interpolated between the two nearest input samples; positions at
/// the tail reuse the last input sample as their right-hand neighbour. No
/// low-pass filtering is applied.
///
/// Returns a copy of `input` when the rates are equal or `input` is empty, and
/// an empty vector when exactly one of the rates is zero (no meaningful ratio
/// exists in that case).
fn resample_linear(input: &[f32], from_hz: u32, to_hz: u32) -> Vec<f32> {
    if from_hz == to_hz || input.is_empty() {
        return input.to_vec();
    }
    // A zero source rate would make the ratio 0 and the output length infinite
    // (a capacity-overflow panic); a zero target rate yields no samples at all.
    if from_hz == 0 || to_hz == 0 {
        return Vec::new();
    }

    // Distance, in input samples, between two consecutive output samples.
    let step = f64::from(from_hz) / f64::from(to_hz);
    let out_len = (input.len() as f64 / step).floor() as usize;
    let last_idx = input.len() - 1;

    (0..out_len)
        .map(|i| {
            let src = i as f64 * step;
            // Clamp so floating-point rounding at extreme ratios can never push
            // the left neighbour past the end of `input`.
            let i0 = (src.floor() as usize).min(last_idx);
            let i1 = (i0 + 1).min(last_idx);
            let frac = (src - i0 as f64) as f32;
            let s0 = input[i0];
            let s1 = input[i1];
            s0 + (s1 - s0) * frac
        })
        .collect()
}
/// Converts `f32` samples into signed 16-bit little-endian PCM bytes.
///
/// Each sample is multiplied by 32767, clamped to the `i16` range and truncated
/// toward zero, so values outside `[-1.0, 1.0]` saturate instead of wrapping.
/// The output is exactly two bytes per input sample.
///
/// This function always returns `Ok`.
fn f32_mono_to_s16le_bytes(samples: &[f32]) -> Result<Vec<u8>> {
    let pcm: Vec<u8> = samples
        .iter()
        .flat_map(|&sample| {
            let scaled = (sample * 32767.0).clamp(-32768.0, 32767.0);
            (scaled as i16).to_le_bytes()
        })
        .collect();
    Ok(pcm)
}
/// Converts signed 16-bit little-endian PCM bytes into `f32` samples.
///
/// Every pair of bytes becomes one sample, divided by `i16::MAX` so that
/// full-scale positive input maps to `1.0`. A trailing odd byte, if present,
/// is ignored.
pub(crate) fn pcm_s16_to_f32(pcm: &[u8]) -> Vec<f32> {
    let full_scale = f32::from(i16::MAX);
    pcm.chunks_exact(2)
        .map(|pair| f32::from(i16::from_le_bytes([pair[0], pair[1]])) / full_scale)
        .collect()
}