use crate::{CodecError, CodecResult};
use super::packet::OpusBandwidth;
use super::silk_decoder::{
decode_silk_frame, SilkBandwidth, SilkChannelState, MAX_LPC_ORDER, MAX_SUBFRAMES,
};
use super::silk_range::SilkRangeDecoder;
/// Number of line-spectral-frequency values produced by `SilkDecoder::initialize_lsf`.
const LSF_COUNT: usize = 16;
/// SILK-layer decoder producing interleaved `f32` PCM at the caller's sample rate.
///
/// Keeps one [`SilkChannelState`] per channel plus the most recent output frame of
/// each channel, which packet-loss concealment replays with a decaying gain.
#[derive(Debug)]
pub struct SilkDecoder {
    /// Output sample rate in Hz requested by the caller.
    sample_rate: u32,
    /// Number of interleaved output channels.
    channels: usize,
    /// Internal SILK bandwidth derived from the Opus bandwidth at construction.
    bandwidth: SilkBandwidth,
    /// Per-channel SILK decoding state.
    channel_state: Vec<SilkChannelState>,
    /// Last resampled frame of each channel, reused by packet-loss concealment.
    last_frame: Vec<Vec<f32>>,
    /// Number of consecutive concealed packets since the last decoded one.
    consecutive_losses: usize,
    /// Per-channel resampler phase; the code shown here only initialises and
    /// resets it to zero.
    resample_pos: Vec<f64>,
}

impl SilkDecoder {
    /// Creates a decoder for `channels` interleaved channels at `sample_rate` Hz.
    ///
    /// `bandwidth` is translated to an internal SILK bandwidth by `map_bandwidth`.
    pub fn new(sample_rate: u32, channels: usize, bandwidth: OpusBandwidth) -> Self {
        Self {
            sample_rate,
            channels,
            bandwidth: map_bandwidth(bandwidth),
            channel_state: (0..channels).map(|_| SilkChannelState::new()).collect(),
            last_frame: vec![Vec::new(); channels],
            consecutive_losses: 0,
            resample_pos: vec![0.0; channels],
        }
    }

    /// Returns `LSF_COUNT` evenly spaced, strictly increasing values inside `(0, π)`.
    fn initialize_lsf() -> Vec<f32> {
        (0..LSF_COUNT)
            .map(|i| (i as f32 + 0.5) * std::f32::consts::PI / (LSF_COUNT as f32 + 1.0))
            .collect()
    }

    /// Checks that a buffer of `output_len` samples can hold `frame_size`
    /// samples for every channel.
    ///
    /// The required size is computed with `checked_mul`, so an overflowing
    /// request is rejected instead of wrapping or panicking.
    fn ensure_output_capacity(&self, output_len: usize, frame_size: usize) -> CodecResult<()> {
        match frame_size.checked_mul(self.channels) {
            Some(needed) if output_len >= needed => Ok(()),
            _ => Err(CodecError::InvalidData(
                "Output buffer too small".to_string(),
            )),
        }
    }

    /// Decodes one SILK payload into `output` as interleaved samples.
    ///
    /// An empty `data` slice is treated as a lost packet and is concealed from
    /// the previously decoded frame.
    ///
    /// # Errors
    ///
    /// Returns `CodecError::InvalidData` when `output` holds fewer than
    /// `frame_size * channels` samples or the output sample rate is zero, and
    /// propagates any error raised by the range decoder or the frame decoder.
    pub fn decode(
        &mut self,
        data: &[u8],
        output: &mut [f32],
        frame_size: usize,
    ) -> CodecResult<()> {
        self.ensure_output_capacity(output.len(), frame_size)?;
        if data.is_empty() {
            return self.decode_plc(output, frame_size);
        }
        let mut dec = SilkRangeDecoder::new(data)?;
        self.consecutive_losses = 0;
        self.decode_into(&mut dec, output, frame_size)
    }

    /// Decodes one SILK payload from an already constructed range decoder.
    ///
    /// # Errors
    ///
    /// Same conditions as [`SilkDecoder::decode`], except that no packet-loss
    /// concealment path exists here.
    pub fn decode_with(
        &mut self,
        dec: &mut SilkRangeDecoder,
        output: &mut [f32],
        frame_size: usize,
    ) -> CodecResult<()> {
        self.ensure_output_capacity(output.len(), frame_size)?;
        self.consecutive_losses = 0;
        self.decode_into(dec, output, frame_size)
    }

    /// Shared decode path: reads the header flags, decodes every SILK frame of
    /// every channel, then resamples to the output rate and interleaves.
    fn decode_into(
        &mut self,
        dec: &mut SilkRangeDecoder,
        output: &mut [f32],
        frame_size: usize,
    ) -> CodecResult<()> {
        // A zero output rate would otherwise divide by zero just below.
        if self.sample_rate == 0 {
            return Err(CodecError::InvalidData(
                "Output sample rate must be non-zero".to_string(),
            ));
        }

        let channels = self.channels;
        let sample_rate = self.sample_rate;
        let bandwidth = self.bandwidth;
        let stereo = channels == 2;
        let internal_rate = bandwidth.hz();
        let internal_total =
            ((frame_size as u64) * u64::from(internal_rate) / u64::from(sample_rate)) as usize;

        // Split the packet into 20 ms frames plus at most one trailing 10 ms
        // frame; anything shorter than 10 ms is still decoded as one 10 ms frame.
        // Each entry is the number of subframes of that SILK frame.
        let unit_20ms = bandwidth.khz() * 20;
        let unit_10ms = bandwidth.khz() * 10;
        let mut frame_subframes: Vec<usize> = Vec::new();
        let mut remaining = internal_total.max(unit_10ms);
        while remaining >= unit_20ms {
            frame_subframes.push(MAX_SUBFRAMES);
            remaining -= unit_20ms;
        }
        if remaining > 0 {
            frame_subframes.push(2);
        }

        // Header: per channel, one VAD bit per frame followed by one LBRR bit
        // (the LBRR bit is read to stay in sync and then ignored).
        let mut vad_flags = vec![vec![true; frame_subframes.len()]; channels];
        for ch_flags in &mut vad_flags {
            for slot in ch_flags.iter_mut() {
                *slot = dec.decode_bit_logp(1)?;
            }
            let _lbrr = dec.decode_bit_logp(1)?;
        }

        let mut internal_pcm: Vec<Vec<f32>> = vec![Vec::with_capacity(internal_total); channels];
        // Scratch buffers holding only the frame currently being decoded.
        let mut frame_pcm: Vec<Vec<f32>> = vec![Vec::new(); channels];

        for (frame_idx, &subframes) in frame_subframes.iter().enumerate() {
            let stereo_pred = if stereo {
                Some(decode_stereo_weights(dec)?)
            } else {
                None
            };

            for (ch, (state, frame)) in self
                .channel_state
                .iter_mut()
                .zip(frame_pcm.iter_mut())
                .enumerate()
            {
                let is_side = stereo && ch == 1;
                let vad = vad_flags[ch][frame_idx];
                let result = decode_silk_frame(dec, bandwidth, state, subframes, is_side, vad)?;
                frame.clear();
                frame.extend_from_slice(&result.samples);
            }

            // Unmix this frame only. Running the prediction over the whole
            // accumulated buffer would re-mix frames that were already turned
            // into left/right by an earlier iteration.
            if let Some((w0, w1)) = stereo_pred {
                apply_stereo_prediction(&mut frame_pcm, w0, w1);
            }

            for (accumulated, frame) in internal_pcm.iter_mut().zip(&frame_pcm) {
                accumulated.extend_from_slice(frame);
            }
        }

        for (ch, (pcm, last)) in internal_pcm
            .iter()
            .zip(self.last_frame.iter_mut())
            .enumerate()
        {
            let resampled = resample_linear(pcm, internal_rate, sample_rate, frame_size);
            for (i, &s) in resampled.iter().enumerate().take(frame_size) {
                output[i * channels + ch] = s;
            }
            // Remember the frame so a following lost packet can be concealed.
            *last = resampled;
        }
        Ok(())
    }

    /// Conceals a lost packet by replaying the previous frame of each channel,
    /// scaled by `0.92` raised to the number of consecutive losses.
    ///
    /// Channels without a previous frame are filled with silence. The caller is
    /// expected to have validated the size of `output`.
    fn decode_plc(&mut self, output: &mut [f32], frame_size: usize) -> CodecResult<()> {
        self.consecutive_losses = self.consecutive_losses.saturating_add(1);
        // Saturate instead of wrapping: a wrapped (negative) exponent would
        // amplify the replayed frame rather than fade it out.
        let exponent = self.consecutive_losses.min(i32::MAX as usize) as i32;
        let attenuation = 0.92_f32.powi(exponent);
        let channels = self.channels;
        for (ch, prev) in self.last_frame.iter().enumerate() {
            for i in 0..frame_size {
                output[i * channels + ch] = if prev.is_empty() {
                    0.0
                } else {
                    prev[i % prev.len()] * attenuation
                };
            }
        }
        Ok(())
    }

    /// Returns the decoder to its freshly constructed state: channel states are
    /// reset, remembered frames are dropped, and the loss counter and resampler
    /// phases are zeroed.
    pub fn reset(&mut self) {
        for st in &mut self.channel_state {
            st.reset();
        }
        for f in &mut self.last_frame {
            f.clear();
        }
        self.consecutive_losses = 0;
        for p in &mut self.resample_pos {
            *p = 0.0;
        }
        // The result is discarded on purpose.
        // NOTE(review): presumably this call only keeps `initialize_lsf` from
        // being reported as dead code outside of tests — confirm before removing.
        let _ = Self::initialize_lsf();
    }

    /// Output sample rate in Hz.
    #[must_use]
    pub const fn sample_rate(&self) -> u32 {
        self.sample_rate
    }

    /// Number of interleaved output channels.
    #[must_use]
    pub const fn channels(&self) -> usize {
        self.channels
    }
}
/// Translates an Opus bandwidth into the SILK bandwidth used internally.
///
/// Narrowband and mediumband map to their SILK counterparts; every other
/// Opus bandwidth is decoded as SILK wideband.
fn map_bandwidth(bw: OpusBandwidth) -> SilkBandwidth {
    if matches!(bw, OpusBandwidth::Narrowband) {
        return SilkBandwidth::Narrowband;
    }
    if matches!(bw, OpusBandwidth::Mediumband) {
        return SilkBandwidth::Mediumband;
    }
    SilkBandwidth::Wideband
}
/// Reads the stereo prediction symbols from `dec` and returns the two
/// prediction weights as floats (table value divided by 8192, i.e. Q13).
///
/// Six symbols are consumed in a fixed order: the joint index, two
/// (3-way, 5-way) pairs, and finally the mid-only symbol, which is read and
/// discarded.
///
/// NOTE(review): the index arithmetic below, including the `% 16` wrap into the
/// weight table, should be checked against RFC 6716 §4.2.7.1 — confirm.
///
/// # Errors
///
/// Propagates any error returned by the range decoder.
fn decode_stereo_weights(dec: &mut SilkRangeDecoder) -> CodecResult<(f32, f32)> {
    use super::silk_tables as tables;

    // The read order below defines the bitstream layout and must not change.
    let joint = dec.decode_icdf(&tables::STEREO_PRED_JOINT_ICDF, 8)? as i32;
    let first_coarse = dec.decode_icdf(&tables::UNIFORM3_ICDF, 8)? as i32;
    let first_fine = dec.decode_icdf(&tables::UNIFORM5_ICDF, 8)? as i32;
    let second_coarse = dec.decode_icdf(&tables::UNIFORM3_ICDF, 8)? as i32;
    let second_fine = dec.decode_icdf(&tables::UNIFORM5_ICDF, 8)? as i32;

    let first_combined = first_fine * 3 + first_coarse;
    let second_combined = second_fine * 3 + second_coarse;

    let w0_slot = ((joint % 5) * 5 + (first_combined % 5)) as usize;
    let w1_slot = ((joint / 5) * 5 + (second_combined % 5)) as usize;
    let w0_q13 = i32::from(tables::STEREO_PRED_QUANT_Q13[w0_slot % 16]);
    let w1_q13 = i32::from(tables::STEREO_PRED_QUANT_Q13[w1_slot % 16]);

    // Consumed only to keep the range decoder in step with the bitstream.
    let _mid_only = dec.decode_icdf(&tables::STEREO_ONLY_CODE_MID_ICDF, 8)?;

    let to_float = |q13: i32| q13 as f32 / 8192.0;
    Ok((to_float(w0_q13), to_float(w1_q13)))
}
/// Converts a mid/side channel pair in `pcm` to left/right in place.
///
/// `pcm[0]` is read as mid and `pcm[1]` as side. Only the samples present in
/// both channels are processed; any other number of channels leaves `pcm`
/// untouched. The `w1` term is multiplied by zero, so for finite samples only
/// `w0` influences the result.
fn apply_stereo_prediction(pcm: &mut [Vec<f32>], w0: f32, w1: f32) {
    if let [mid_channel, side_channel] = pcm {
        // `zip` stops at the shorter channel, matching the common length.
        for (first, second) in mid_channel.iter_mut().zip(side_channel.iter_mut()) {
            let mid = *first;
            let side = *second;
            let predicted = side + w0 * mid + w1 * mid * 0.0;
            *first = mid + predicted;
            *second = mid - predicted;
        }
    }
}
/// Resamples `input` from `in_rate` to `out_rate` by linear interpolation and
/// returns exactly `out_len` samples.
///
/// Every call starts at source position zero. Reads past the end of `input`
/// repeat its last sample. The result is all zeros when `input` is empty, or
/// when `out_rate` is zero while `in_rate` is not (no meaningful ratio exists).
/// Equal rates copy the input through unchanged.
fn resample_linear(input: &[f32], in_rate: u32, out_rate: u32, out_len: usize) -> Vec<f32> {
    let mut out = vec![0.0f32; out_len];
    if input.is_empty() {
        return out;
    }
    let last = input.len() - 1;

    if in_rate == out_rate {
        for (i, slot) in out.iter_mut().enumerate() {
            *slot = input[i.min(last)];
        }
        return out;
    }

    // A zero output rate makes the ratio infinite, which used to yield NaN
    // samples and an index overflow; return silence instead.
    if out_rate == 0 {
        return out;
    }

    let ratio = f64::from(in_rate) / f64::from(out_rate);
    for (i, slot) in out.iter_mut().enumerate() {
        let src = (i as f64) * ratio;
        let idx = src.floor() as usize;
        let frac = (src - idx as f64) as f32;
        let a = input[idx.min(last)];
        // Saturating so that a huge source index cannot overflow.
        let b = input[idx.saturating_add(1).min(last)];
        *slot = a + (b - a) * frac;
    }
    out
}
// Public re-export of the SILK decoder's maximum LPC order under a SILK-prefixed name.
pub use super::silk_decoder::MAX_LPC_ORDER as SILK_MAX_LPC_ORDER;
// Compile-time check: the build fails if the maximum LPC order is ever not 16.
// NOTE(review): presumably this keeps it in step with `LSF_COUNT` — confirm.
const _: () = assert!(MAX_LPC_ORDER == 16);
/// Result of voice-activity detection for one frame.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum VadDecision {
    /// Speech was detected, or the hang-over period is still running.
    Active,
    /// No speech was detected and no hang-over frames remain.
    Inactive,
}

/// Energy-based voice-activity detector working on four band energies.
///
/// Each frame is split twice into a smoothed (low) part and its residual
/// (high) part, giving four energies. Smoothed signal energies are compared
/// against a tracked noise floor, and a hang-over counter keeps the decision
/// active for a few frames after speech stops.
#[derive(Clone, Debug)]
pub struct VoiceActivityDetector {
    /// Sample rate in Hz, used to derive the smoothing coefficients.
    sample_rate: u32,
    /// Exponentially averaged energy of each band.
    signal_energy: [f32; 4],
    /// Tracked noise floor of each band.
    noise_floor: [f32; 4],
    /// Averaging weight applied to new band energies.
    signal_ema: f32,
    /// Averaging weight applied when updating the noise floor.
    noise_ema: f32,
    /// Remaining hang-over frames.
    hangover_counter: u32,
    /// Hang-over length loaded whenever speech is detected.
    hangover_max: u32,
    /// Per-band signal-to-noise threshold in dB.
    threshold_db: f32,
    /// Band energies of the previous non-empty frame, used for the flux measure.
    prev_band_energy: [f32; 4],
    /// Number of frames passed to `process` since construction or reset.
    frame_count: u64,
}

impl VoiceActivityDetector {
    /// Creates a detector for audio sampled at `sample_rate` Hz with the
    /// default tuning (8 hang-over frames, 12 dB threshold).
    #[must_use]
    pub fn new(sample_rate: u32) -> Self {
        let initial_energy = [1e-6f32; 4];
        Self {
            sample_rate,
            signal_energy: initial_energy,
            noise_floor: initial_energy,
            signal_ema: 0.3,
            noise_ema: 0.02,
            hangover_counter: 0,
            hangover_max: 8,
            threshold_db: 12.0,
            prev_band_energy: [0.0f32; 4],
            frame_count: 0,
        }
    }

    /// Sets how many frames stay active after speech is last detected.
    pub fn set_hangover(&mut self, frames: u32) {
        self.hangover_max = frames;
    }

    /// Sets the per-band SNR threshold, clamped to the range 0–40 dB.
    pub fn set_threshold_db(&mut self, db: f32) {
        self.threshold_db = db.clamp(0.0, 40.0);
    }

    /// Classifies one frame of samples and updates the internal trackers.
    ///
    /// An empty frame only advances the frame counter and is reported as
    /// inactive. Otherwise the frame counts as speech when at least two bands
    /// reach the SNR threshold, or one band does while the band energies
    /// changed noticeably since the previous frame.
    pub fn process(&mut self, samples: &[f32]) -> VadDecision {
        self.frame_count += 1;
        if samples.is_empty() {
            return VadDecision::Inactive;
        }

        let band_energy = self.compute_band_energy(samples);
        self.update_signal_energy(&band_energy);

        // Total absolute change of the band energies since the last frame.
        let flux = band_energy
            .iter()
            .zip(&self.prev_band_energy)
            .map(|(&now, &before)| (now - before).abs())
            .sum::<f32>();
        self.prev_band_energy = band_energy;

        if self.is_likely_noise(&band_energy) {
            let weight = self.noise_ema;
            for ((floor, &energy), &signal) in self
                .noise_floor
                .iter_mut()
                .zip(&band_energy)
                .zip(&self.signal_energy)
            {
                let blended = *floor * (1.0 - weight) + energy * weight;
                // The noise floor never exceeds the tracked signal energy.
                *floor = blended.min(signal);
            }
        }

        let threshold = self.threshold_db;
        let voice_bands = self
            .signal_energy
            .iter()
            .zip(&self.noise_floor)
            .filter(|&(&signal, &floor)| {
                let noise = floor.max(1e-10);
                10.0 * (signal / noise).log10() >= threshold
            })
            .count();

        let flux_boost = flux > 0.01;
        let speech_active = voice_bands >= 2 || (voice_bands >= 1 && flux_boost);
        if speech_active {
            self.hangover_counter = self.hangover_max;
            return VadDecision::Active;
        }

        // No speech: spend one hang-over frame if any are left.
        match self.hangover_counter.checked_sub(1) {
            Some(left) => {
                self.hangover_counter = left;
                VadDecision::Active
            }
            None => VadDecision::Inactive,
        }
    }

    /// Returns the mean energy of the four bands: the low/high split with a
    /// 500 Hz corner followed by the low/high split with a 3000 Hz corner.
    fn compute_band_energy(&self, samples: &[f32]) -> [f32; 4] {
        let (band0, band1) = self.split_energy(samples, 500.0f32);
        let (band2, band3) = self.split_energy(samples, 3000.0f32);
        let n = samples.len() as f32;
        let inv_n = if n > 0.0 { 1.0 / n } else { 1.0 };
        [band0 * inv_n, band1 * inv_n, band2 * inv_n, band3 * inv_n]
    }

    /// Runs a one-pole smoother with the given corner frequency over `samples`
    /// and returns the summed squares of the smoothed signal and of the residual.
    fn split_energy(&self, samples: &[f32], corner_hz: f32) -> (f32, f32) {
        let alpha = {
            let fc = corner_hz / self.sample_rate as f32;
            (-2.0 * std::f32::consts::PI * fc).exp()
        };
        let mut low = 0.0f32;
        let mut low_energy = 0.0f32;
        let mut high_energy = 0.0f32;
        for &s in samples {
            low = low * alpha + s * (1.0 - alpha);
            let high = s - low;
            low_energy += low * low;
            high_energy += high * high;
        }
        (low_energy, high_energy)
    }

    /// Folds the new band energies into the running signal energies.
    fn update_signal_energy(&mut self, band_energy: &[f32; 4]) {
        let weight = self.signal_ema;
        for (tracked, &fresh) in self.signal_energy.iter_mut().zip(band_energy) {
            *tracked = *tracked * (1.0 - weight) + fresh * weight;
        }
    }

    /// A frame is treated as noise when its total energy is below half of the
    /// total running signal energy.
    fn is_likely_noise(&self, band_energy: &[f32; 4]) -> bool {
        let frame_total = band_energy.iter().sum::<f32>();
        let tracked_total = self.signal_energy.iter().sum::<f32>();
        frame_total < tracked_total * 0.5
    }

    /// Clears the energy trackers, hang-over counter and frame counter; the
    /// configured hang-over length, threshold and averaging weights are kept.
    pub fn reset(&mut self) {
        let initial_energy = [1e-6f32; 4];
        self.signal_energy = initial_energy;
        self.noise_floor = initial_energy;
        self.prev_band_energy = [0.0f32; 4];
        self.hangover_counter = 0;
        self.frame_count = 0;
    }

    /// Remaining hang-over frames.
    #[must_use]
    pub const fn hangover_counter(&self) -> u32 {
        self.hangover_counter
    }

    /// Number of frames processed since construction or the last reset.
    #[must_use]
    pub const fn frame_count(&self) -> u64 {
        self.frame_count
    }
}
/// Minimal SILK encoder front end: runs voice-activity detection on each frame
/// and emits a one-byte activity marker, optionally suppressing silent frames
/// (DTX).
#[derive(Debug)]
pub struct SilkEncoder {
    /// Input sample rate in Hz.
    sample_rate: u32,
    /// Number of interleaved input channels.
    channels: usize,
    /// Requested Opus bandwidth; stored but not read by the code shown here.
    #[allow(dead_code)]
    bandwidth: OpusBandwidth,
    /// Voice-activity detector fed with one channel of every frame.
    vad: VoiceActivityDetector,
    /// Decision returned by the most recent `run_vad` call.
    last_vad: VadDecision,
    /// When `true`, consecutive inactive frames after the first produce no output.
    pub dtx_enabled: bool,
    /// Number of consecutive inactive frames seen by `encode`.
    inactive_frame_count: u32,
}

impl SilkEncoder {
    /// Creates an encoder with DTX disabled and a fresh voice-activity detector.
    pub fn new(sample_rate: u32, channels: usize, bandwidth: OpusBandwidth) -> Self {
        Self {
            sample_rate,
            channels,
            bandwidth,
            vad: VoiceActivityDetector::new(sample_rate),
            last_vad: VadDecision::Inactive,
            dtx_enabled: false,
            inactive_frame_count: 0,
        }
    }

    /// Runs voice-activity detection on up to `frame_size` samples of the first
    /// channel of `input` and remembers the decision.
    ///
    /// `input` is interleaved; shorter input is processed as far as it goes.
    /// An encoder configured with zero channels has no samples to analyse and
    /// is handled as an empty frame instead of dividing by zero.
    #[must_use]
    pub fn run_vad(&mut self, input: &[f32], frame_size: usize) -> VadDecision {
        let mono: Vec<f32> = match self.channels {
            0 => Vec::new(),
            1 => input[..frame_size.min(input.len())].to_vec(),
            ch => (0..frame_size.min(input.len() / ch))
                .map(|i| input[i * ch])
                .collect(),
        };
        self.last_vad = self.vad.process(&mono);
        self.last_vad
    }

    /// Decision produced by the most recent `run_vad` call.
    #[must_use]
    pub const fn last_vad_decision(&self) -> VadDecision {
        self.last_vad
    }

    /// Encodes one frame and returns the number of bytes written to `output`.
    ///
    /// Writes a single byte, `0x01` for an active frame and `0x00` for an
    /// inactive one. With DTX enabled, every inactive frame after the first of
    /// a run is suppressed and `0` is returned.
    ///
    /// # Errors
    ///
    /// Returns `CodecError::InvalidData` when `output` is empty.
    pub fn encode(
        &mut self,
        input: &[f32],
        output: &mut [u8],
        frame_size: usize,
    ) -> CodecResult<usize> {
        if output.is_empty() {
            return Err(CodecError::InvalidData("Output buffer empty".to_string()));
        }
        let vad = self.run_vad(input, frame_size);
        if vad == VadDecision::Inactive {
            // Saturate so an extremely long silence can neither panic nor wrap
            // back to zero and briefly disable DTX.
            self.inactive_frame_count = self.inactive_frame_count.saturating_add(1);
            if self.dtx_enabled && self.inactive_frame_count > 1 {
                return Ok(0);
            }
        } else {
            self.inactive_frame_count = 0;
        }
        output[0] = if vad == VadDecision::Active {
            0x01
        } else {
            0x00
        };
        Ok(1)
    }

    /// Resets the detector, the remembered decision and the inactive-frame
    /// counter; the DTX setting is left unchanged.
    pub fn reset(&mut self) {
        self.vad.reset();
        self.last_vad = VadDecision::Inactive;
        self.inactive_frame_count = 0;
    }

    /// Read-only access to the voice-activity detector.
    #[must_use]
    pub const fn vad(&self) -> &VoiceActivityDetector {
        &self.vad
    }

    /// Input sample rate in Hz.
    #[must_use]
    pub const fn sample_rate(&self) -> u32 {
        self.sample_rate
    }

    /// Number of interleaved input channels.
    #[must_use]
    pub const fn channels(&self) -> usize {
        self.channels
    }
}
// Unit tests for the SILK decoder, the linear resampler, the SILK encoder
// front end and the voice-activity detector defined in this file.
#[cfg(test)]
mod tests {
use super::*;
// The constructor stores the requested sample rate and channel count.
#[test]
fn test_silk_decoder_creation() {
let decoder = SilkDecoder::new(48000, 2, OpusBandwidth::Wideband);
assert_eq!(decoder.sample_rate(), 48000);
assert_eq!(decoder.channels(), 2);
}
// Packet-loss concealment succeeds on a fresh decoder with no previous frame.
#[test]
fn test_silk_decoder_plc() {
let mut decoder = SilkDecoder::new(48000, 1, OpusBandwidth::Wideband);
let mut output = vec![0.0f32; 480];
let result = decoder.decode_plc(&mut output, 480);
assert!(result.is_ok());
}
// The initial LSF vector has LSF_COUNT entries in strictly increasing order.
#[test]
fn test_lsf_initialization() {
let lsf = SilkDecoder::initialize_lsf();
assert_eq!(lsf.len(), LSF_COUNT);
for i in 1..lsf.len() {
assert!(lsf[i] > lsf[i - 1]);
}
}
// Decoding 40 deterministic synthetic bytes (mono, 16 kHz, narrowband) must
// succeed and yield finite samples with magnitude at most 4.0.
#[test]
fn test_silk_decode_real_packet_finite_output() {
let mut decoder = SilkDecoder::new(16000, 1, OpusBandwidth::Narrowband);
let data: Vec<u8> = (0u8..40)
.map(|i| i.wrapping_mul(37).wrapping_add(11))
.collect();
let mut output = vec![0.0f32; 320];
let result = decoder.decode(&data, &mut output, 320);
assert!(
result.is_ok(),
"SILK decode should not error on valid input"
);
for &s in &output {
assert!(s.is_finite(), "every SILK output sample must be finite");
assert!(s.abs() <= 4.0, "SILK output must be bounded");
}
}
// Decoding with a 48 kHz output rate fills a 960-sample frame with finite values.
#[test]
fn test_silk_decode_resamples_to_output_rate() {
let mut decoder = SilkDecoder::new(48000, 1, OpusBandwidth::Wideband);
let data: Vec<u8> = (0u8..60)
.map(|i| i.wrapping_mul(53).wrapping_add(7))
.collect();
let mut output = vec![0.0f32; 960];
decoder.decode(&data, &mut output, 960).expect("decode");
assert!(output.iter().all(|s| s.is_finite()));
}
// Stereo decoding into an interleaved buffer of 2 * 320 samples stays finite.
#[test]
fn test_silk_decode_stereo_interleaved() {
let mut decoder = SilkDecoder::new(16000, 2, OpusBandwidth::Wideband);
let data: Vec<u8> = (0u8..80)
.map(|i| i.wrapping_mul(29).wrapping_add(3))
.collect();
let mut output = vec![0.0f32; 320 * 2];
decoder
.decode(&data, &mut output, 320)
.expect("stereo decode");
assert!(output.iter().all(|s| s.is_finite()));
}
// Decoding the same payload four times, resetting, and decoding again must
// still succeed with finite output.
#[test]
fn test_silk_decode_then_reset_is_stable() {
let mut decoder = SilkDecoder::new(16000, 1, OpusBandwidth::Wideband);
let data: Vec<u8> = (0u8..48)
.map(|i| i.wrapping_mul(91).wrapping_add(5))
.collect();
let mut output = vec![0.0f32; 320];
for _ in 0..4 {
decoder.decode(&data, &mut output, 320).expect("decode");
}
decoder.reset();
decoder
.decode(&data, &mut output, 320)
.expect("decode after reset");
assert!(output.iter().all(|s| s.is_finite()));
}
// The resampler returns exactly the requested length, and equal input and
// output rates reproduce the input.
#[test]
fn test_silk_resample_linear_exact_length() {
let input: Vec<f32> = (0..160).map(|i| (i as f32 / 160.0).sin()).collect();
let out = resample_linear(&input, 16000, 48000, 480);
assert_eq!(out.len(), 480);
assert!(out.iter().all(|s| s.is_finite()));
let same = resample_linear(&input, 16000, 16000, 160);
for (a, b) in same.iter().zip(input.iter()) {
assert!((a - b).abs() < 1e-6);
}
}
// The encoder constructor stores the requested sample rate and channel count.
#[test]
fn test_silk_encoder_creation() {
let encoder = SilkEncoder::new(48000, 2, OpusBandwidth::Wideband);
assert_eq!(encoder.sample_rate(), 48000);
assert_eq!(encoder.channels(), 2);
}
// Encoding a 440 Hz sine frame succeeds and writes at least one byte.
#[test]
fn test_silk_encoder_encode_active() {
let mut encoder = SilkEncoder::new(16000, 1, OpusBandwidth::Wideband);
let freq = 440.0f32;
let sr = 16000.0f32;
let input: Vec<f32> = (0..320)
.map(|i| (2.0 * std::f32::consts::PI * freq * i as f32 / sr).sin() * 0.5)
.collect();
let mut output = vec![0u8; 1024];
let result = encoder.encode(&input, &mut output, 320);
assert!(result.is_ok());
assert!(
result.expect("encode should succeed") >= 1,
"Active frame must emit at least 1 byte"
);
}
// With DTX enabled, a silent frame that follows several silent frames
// produces zero bytes.
#[test]
fn test_silk_encoder_dtx_silence() {
let mut encoder = SilkEncoder::new(16000, 1, OpusBandwidth::Wideband);
encoder.dtx_enabled = true;
let silence = vec![0.0f32; 320];
let mut output = vec![0u8; 1024];
let _ = encoder.encode(&silence, &mut output, 320);
for _ in 0..5 {
let _ = encoder.encode(&silence, &mut output, 320);
}
let result = encoder.encode(&silence, &mut output, 320);
assert!(result.is_ok());
assert_eq!(
result.expect("encode should succeed"),
0,
"DTX must suppress silent frames"
);
}
// A new detector starts with zero processed frames and no hang-over.
#[test]
fn test_vad_creation() {
let vad = VoiceActivityDetector::new(16000);
assert_eq!(vad.frame_count(), 0);
assert_eq!(vad.hangover_counter(), 0);
}
// After twenty silent frames, another silent frame is classified as inactive.
#[test]
fn test_vad_silence_returns_inactive() {
let mut vad = VoiceActivityDetector::new(16000);
let silence = vec![0.0f32; 320];
for _ in 0..20 {
let _ = vad.process(&silence);
}
let decision = vad.process(&silence);
assert_eq!(
decision,
VadDecision::Inactive,
"Prolonged silence must be inactive"
);
}
// A 300 Hz sine at amplitude 0.8 following ten silent frames is classified as active.
#[test]
fn test_vad_sine_wave_returns_active() {
let mut vad = VoiceActivityDetector::new(16000);
let silence = vec![0.0f32; 320];
for _ in 0..10 {
let _ = vad.process(&silence);
}
let sine: Vec<f32> = (0..320)
.map(|i| (2.0 * std::f32::consts::PI * 300.0 * i as f32 / 16000.0).sin() * 0.8)
.collect();
let decision = vad.process(&sine);
assert_eq!(
decision,
VadDecision::Active,
"Loud sine wave must be active"
);
}
// Every processed frame increments the frame counter by one.
#[test]
fn test_vad_frame_count_increments() {
let mut vad = VoiceActivityDetector::new(16000);
let frame = vec![0.0f32; 160];
for i in 1..=5 {
vad.process(&frame);
assert_eq!(vad.frame_count(), i);
}
}
// An empty frame is classified as inactive.
#[test]
fn test_vad_empty_frame_returns_inactive() {
let mut vad = VoiceActivityDetector::new(16000);
let decision = vad.process(&[]);
assert_eq!(decision, VadDecision::Inactive);
}
// With a hang-over of four frames, the silent frame right after a loud tone
// is still reported as active.
#[test]
fn test_vad_hangover_maintains_active() {
let mut vad = VoiceActivityDetector::new(16000);
vad.set_hangover(4);
let silence = vec![0.0f32; 160];
for _ in 0..5 {
let _ = vad.process(&silence);
}
let tone: Vec<f32> = (0..160)
.map(|i| (2.0 * std::f32::consts::PI * 400.0 * i as f32 / 16000.0).sin() * 0.9)
.collect();
let d1 = vad.process(&tone);
let d2 = vad.process(&silence);
assert_eq!(d1, VadDecision::Active);
assert_eq!(d2, VadDecision::Active, "Hang-over should keep active flag");
}
// Resetting clears the frame counter and the hang-over counter.
#[test]
fn test_vad_reset_clears_state() {
let mut vad = VoiceActivityDetector::new(16000);
let frame = vec![0.5f32; 320];
for _ in 0..10 {
vad.process(&frame);
}
vad.reset();
assert_eq!(vad.frame_count(), 0);
assert_eq!(vad.hangover_counter(), 0);
}
// With a 20 dB threshold, a constant very low-level input stays inactive.
#[test]
fn test_vad_set_threshold() {
let mut vad = VoiceActivityDetector::new(16000);
vad.set_threshold_db(20.0);
let low: Vec<f32> = vec![0.0001f32; 320];
for _ in 0..5 {
let _ = vad.process(&low);
}
let d = vad.process(&low);
assert_eq!(d, VadDecision::Inactive);
}
// `run_vad` reports a sine after silence as active and records that decision.
#[test]
fn test_encoder_vad_method() {
let mut encoder = SilkEncoder::new(16000, 1, OpusBandwidth::Narrowband);
let sine: Vec<f32> = (0..320)
.map(|i| (2.0 * std::f32::consts::PI * 250.0 * i as f32 / 16000.0).sin() * 0.7)
.collect();
let silence = vec![0.0f32; 320];
for _ in 0..5 {
let _ = encoder.run_vad(&silence, 320);
}
let decision = encoder.run_vad(&sine, 320);
assert_eq!(decision, VadDecision::Active);
assert_eq!(encoder.last_vad_decision(), VadDecision::Active);
}
}