/// Coarse vocal-effort category.
///
/// The `Display` form is the lowercase variant name: `"whisper"`,
/// `"normal"` or `"shout"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum VocalEffort {
    /// Displayed as `"whisper"`.
    Whisper,
    /// Displayed as `"normal"`.
    Normal,
    /// Displayed as `"shout"`.
    Shout,
}

impl std::fmt::Display for VocalEffort {
    /// Writes the lowercase label of the variant straight to the formatter;
    /// width and padding flags are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            Self::Whisper => "whisper",
            Self::Normal => "normal",
            Self::Shout => "shout",
        };
        f.write_str(label)
    }
}
/// Outcome of [`estimate_vocal_effort`]: the chosen category together with
/// the feature values it was derived from.
#[derive(Debug, Clone)]
pub struct VocalEffortResult {
/// Category selected by the classifier.
pub effort: VocalEffort,
/// Relative margin between the two highest class scores, clamped to
/// `[0, 1]`; `0.0` for empty input.
pub confidence: f32,
/// Root-mean-square of the input samples.
pub rms_energy: f32,
/// Fraction of adjacent sample pairs whose sign differs (a sample equal to
/// zero counts as non-negative).
pub zcr: f32,
/// Value returned by `estimate_spectral_tilt`; the field name indicates
/// dB per octave.
pub spectral_tilt_db_octave: f32,
/// Value returned by `estimate_hnr`: `10 * log10(r / (1 - r))` for a
/// correlation peak `r`, or `0.0` when no estimate is made.
pub hnr_estimate: f32,
}
/// Classifies a block of audio samples as whisper, normal or shout.
///
/// Four features are measured with `compute_rms`, `compute_zcr`,
/// `estimate_spectral_tilt` and `estimate_hnr`. They are combined into a
/// whisper score and a shout score as fixed weighted sums; the normal score
/// is the mean of the complements of those two, clamped to `[0, 1]`.
///
/// Selection rule: whisper wins when its score is the maximum and strictly
/// above the shout score; otherwise shout wins when its score is the maximum
/// and strictly above the normal score; otherwise the result is normal.
///
/// `confidence` is `(best - runner_up) / best` over the three scores,
/// clamped to `[0, 1]`, or `0.0` when the best score does not exceed `1e-6`.
///
/// Empty input yields `VocalEffort::Normal` with every numeric field `0.0`.
#[must_use]
pub fn estimate_vocal_effort(samples: &[f32], sample_rate: f32) -> VocalEffortResult {
    if samples.is_empty() {
        return VocalEffortResult {
            effort: VocalEffort::Normal,
            confidence: 0.0,
            rms_energy: 0.0,
            zcr: 0.0,
            spectral_tilt_db_octave: 0.0,
            hnr_estimate: 0.0,
        };
    }

    // Feature extraction.
    let rms_energy = compute_rms(samples);
    let zcr = compute_zcr(samples);
    let spectral_tilt_db_octave = estimate_spectral_tilt(samples, sample_rate);
    let hnr_estimate = estimate_hnr(samples, sample_rate);

    // An RMS of 0.1 or more counts as fully loud.
    let loudness = (rms_energy / 0.1_f32).clamp(0.0, 1.0);

    // Whisper evidence: quiet, many zero crossings, low HNR, tilt below -3.
    let whisper_quiet = (1.0 - loudness).max(0.0);
    let whisper_zcr = (zcr / 0.4_f32).clamp(0.0, 1.0);
    let whisper_hnr = (1.0 - (hnr_estimate / 20.0_f32).clamp(0.0, 1.0)).max(0.0);
    let whisper_tilt = if spectral_tilt_db_octave < -3.0 {
        1.0_f32
    } else {
        0.3
    };
    let whisper_score =
        whisper_quiet * 0.35 + whisper_zcr * 0.30 + whisper_hnr * 0.20 + whisper_tilt * 0.15;

    // Shout evidence: loud, few zero crossings, high HNR, tilt above -1.
    let shout_loud = loudness;
    let shout_zcr = (1.0 - (zcr / 0.3_f32).clamp(0.0, 1.0)).max(0.0);
    let shout_hnr = ((hnr_estimate - 5.0) / 20.0_f32).clamp(0.0, 1.0);
    let shout_tilt = if spectral_tilt_db_octave > -1.0 {
        1.0_f32
    } else {
        0.2
    };
    let shout_score = shout_loud * 0.40 + shout_zcr * 0.25 + shout_hnr * 0.20 + shout_tilt * 0.15;

    // Normal is whatever is neither whisper-like nor shout-like.
    let normal_score = ((1.0 - whisper_score) * 0.5 + (1.0 - shout_score) * 0.5).clamp(0.0, 1.0);

    let top_score = whisper_score.max(shout_score).max(normal_score);
    let effort = if top_score == whisper_score && whisper_score > shout_score {
        VocalEffort::Whisper
    } else if top_score == shout_score && shout_score > normal_score {
        VocalEffort::Shout
    } else {
        VocalEffort::Normal
    };

    // Confidence is the relative gap between the two best scores.
    let mut ranked = [whisper_score, normal_score, shout_score];
    ranked.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
    let [best, runner_up, _] = ranked;
    let confidence = if best > 1e-6 {
        ((best - runner_up) / best).clamp(0.0, 1.0)
    } else {
        0.0
    };

    VocalEffortResult {
        effort,
        confidence,
        rms_energy,
        zcr,
        spectral_tilt_db_octave,
        hnr_estimate,
    }
}
/// Root-mean-square amplitude of `samples`; `0.0` for an empty slice.
fn compute_rms(samples: &[f32]) -> f32 {
    let count = samples.len();
    if count == 0 {
        return 0.0;
    }

    let mut sum_of_squares = 0.0_f32;
    for &sample in samples {
        sum_of_squares += sample * sample;
    }

    let mean_square = sum_of_squares / count as f32;
    mean_square.sqrt()
}
/// Zero-crossing rate: the fraction of adjacent sample pairs in which one
/// sample is non-negative (`>= 0.0`) and the other is negative.
///
/// Returns `0.0` when fewer than two samples are given.
fn compute_zcr(samples: &[f32]) -> f32 {
    if samples.len() < 2 {
        return 0.0;
    }

    let sign_changes = samples
        .windows(2)
        .filter(|pair| (pair[0] >= 0.0) != (pair[1] >= 0.0))
        .count();
    let pair_count = samples.len() - 1;

    sign_changes as f32 / pair_count as f32
}
/// Estimates the spectral tilt of `samples` between 1 kHz and 8 kHz, in dB
/// per octave.
///
/// The signal is cut into Hann-windowed frames of up to 2048 samples and the
/// power spectra of those frames are summed. The summed spectrum is reduced
/// to third-octave bands (mean power per FFT bin, so that the growing band
/// width does not bias the level), and a least-squares line is fitted to band
/// level in dB against band centre in octaves. The slope of that line is the
/// result: negative when power falls with frequency, close to `0.0` for a
/// flat spectrum.
///
/// The upper analysis limit is the lower of 8 kHz and the Nyquist frequency.
/// When the input holds at least one full frame, a trailing partial frame is
/// ignored; shorter inputs are analysed as a single zero-padded frame.
///
/// Values returned for inputs that cannot be analysed:
/// * `0.0` when fewer than two samples are given;
/// * `-6.0` when `sample_rate` is not a positive finite number, when fewer
///   than two third-octave bands fit between 1 kHz and the upper limit, or
///   when the input is shorter than one period of 1 kHz.
fn estimate_spectral_tilt(samples: &[f32], sample_rate: f32) -> f32 {
    // FFT size; must be a power of two for the radix-2 transform below.
    const FFT_LEN: usize = 2048;
    const HALF_LEN: usize = FFT_LEN / 2;
    const LOW_HZ: f32 = 1000.0;
    const HIGH_HZ: f32 = 8000.0;
    const BANDS_PER_OCTAVE: f32 = 3.0;
    // Returned when the tilt cannot be measured.
    const UNDETERMINED_TILT: f32 = -6.0;
    // Keeps `log10` finite for empty bands and digital silence.
    const POWER_FLOOR: f32 = 1e-20;

    // In-place iterative radix-2 FFT. `re.len()` must be a power of two and
    // `tw_re`/`tw_im` must hold cos/sin of `-2*pi*k/n` for `k < n/2`.
    fn fft_radix2(re: &mut [f32], im: &mut [f32], tw_re: &[f32], tw_im: &[f32]) {
        let n = re.len();

        // Bit-reversal permutation.
        let mut j = 0usize;
        for i in 1..n {
            let mut bit = n >> 1;
            while j & bit != 0 {
                j ^= bit;
                bit >>= 1;
            }
            j ^= bit;
            if i < j {
                re.swap(i, j);
                im.swap(i, j);
            }
        }

        // Butterfly passes over spans of 2, 4, ..., n.
        let mut span = 2usize;
        while span <= n {
            let half = span / 2;
            let stride = n / span;
            for start in (0..n).step_by(span) {
                for k in 0..half {
                    let (wr, wi) = (tw_re[k * stride], tw_im[k * stride]);
                    let (a, b) = (start + k, start + k + half);
                    let tr = re[b] * wr - im[b] * wi;
                    let ti = re[b] * wi + im[b] * wr;
                    re[b] = re[a] - tr;
                    im[b] = im[a] - ti;
                    re[a] += tr;
                    im[a] += ti;
                }
            }
            span *= 2;
        }
    }

    if samples.len() < 2 {
        return 0.0;
    }
    if !sample_rate.is_finite() || sample_rate <= 0.0 {
        return UNDETERMINED_TILT;
    }
    let top_hz = HIGH_HZ.min(sample_rate / 2.0);
    if top_hz <= LOW_HZ || (samples.len() as f32) < sample_rate / LOW_HZ {
        return UNDETERMINED_TILT;
    }
    // The small epsilon keeps an exact three-octave span from rounding down.
    let band_count = ((top_hz / LOW_HZ).log2() * BANDS_PER_OCTAVE + 1e-3) as usize;
    if band_count < 2 {
        return UNDETERMINED_TILT;
    }

    let frame_len = samples.len().min(FFT_LEN);
    let window: Vec<f32> = (0..frame_len)
        .map(|i| {
            let phase = 2.0 * std::f64::consts::PI * i as f64 / frame_len as f64;
            (0.5 - 0.5 * phase.cos()) as f32
        })
        .collect();
    let (tw_re, tw_im): (Vec<f32>, Vec<f32>) = (0..HALF_LEN)
        .map(|k| {
            let angle = -2.0 * std::f64::consts::PI * k as f64 / FFT_LEN as f64;
            (angle.cos() as f32, angle.sin() as f32)
        })
        .unzip();

    // Sum of the frame power spectra for bins 0..=Nyquist.
    let mut power = vec![0.0_f32; HALF_LEN + 1];
    let mut re = vec![0.0_f32; FFT_LEN];
    let mut im = vec![0.0_f32; FFT_LEN];
    for frame in samples.chunks_exact(frame_len) {
        for ((slot, &x), &w) in re.iter_mut().zip(frame).zip(&window) {
            *slot = x * w;
        }
        re[frame_len..].fill(0.0);
        im.fill(0.0);
        fft_radix2(&mut re, &mut im, &tw_re, &tw_im);
        for ((acc, &r), &i) in power.iter_mut().zip(&re).zip(&im) {
            *acc += r * r + i * i;
        }
    }

    // Band levels: x is the band centre in octaves above LOW_HZ, y is dB.
    let hz_per_bin = sample_rate / FFT_LEN as f32;
    let mut octaves: Vec<f32> = Vec::with_capacity(band_count);
    let mut levels_db: Vec<f32> = Vec::with_capacity(band_count);
    for band in 0..band_count {
        let lo_hz = LOW_HZ * (band as f32 / BANDS_PER_OCTAVE).exp2();
        let hi_hz = LOW_HZ * ((band + 1) as f32 / BANDS_PER_OCTAVE).exp2();
        let lo_bin = ((lo_hz / hz_per_bin).ceil() as usize).min(HALF_LEN);
        let hi_bin = ((hi_hz / hz_per_bin).ceil() as usize).min(HALF_LEN + 1);
        if hi_bin <= lo_bin {
            // Band narrower than one FFT bin: nothing to measure.
            continue;
        }
        let mean_power = power[lo_bin..hi_bin].iter().sum::<f32>() / (hi_bin - lo_bin) as f32;
        octaves.push((band as f32 + 0.5) / BANDS_PER_OCTAVE);
        levels_db.push(10.0 * mean_power.max(POWER_FLOOR).log10());
    }
    if octaves.len() < 2 {
        return UNDETERMINED_TILT;
    }

    // Ordinary least squares; x is already in octaves and y in dB, so the
    // slope needs no further unit conversion.
    let count = octaves.len() as f32;
    let mean_x = octaves.iter().sum::<f32>() / count;
    let mean_y = levels_db.iter().sum::<f32>() / count;
    let (num, den) = octaves.iter().zip(&levels_db).fold(
        (0.0_f32, 0.0_f32),
        |(num, den), (&x, &y)| {
            let dx = x - mean_x;
            (num + dx * (y - mean_y), den + dx * dx)
        },
    );
    if den < 1e-10 {
        return UNDETERMINED_TILT;
    }
    num / den
}
/// Estimates the harmonics-to-noise ratio of `samples` in dB.
///
/// For every lag that corresponds to a fundamental between 500 Hz and 60 Hz,
/// the signal is correlated with its delayed copy over the samples where the
/// two overlap, and the result is normalised by the energies of those two
/// overlapping segments. Normalising per lag keeps a perfectly periodic
/// signal at a correlation of 1 regardless of buffer length. The largest
/// correlation `r`, limited to `[0, 0.9999]`, is converted with
/// `10 * log10(r / (1 - r))`, which caps the result at roughly 40 dB.
///
/// Returns `0.0` when the input is empty, when its total energy is below
/// `1e-12`, when the longest lag does not fit inside the buffer, when the
/// lag range is empty, or when no lag correlates positively.
fn estimate_hnr(samples: &[f32], sample_rate: f32) -> f32 {
    if samples.is_empty() {
        return 0.0;
    }
    let energy: f32 = samples.iter().map(|&x| x * x).sum();
    if energy < 1e-12 {
        return 0.0;
    }

    // Lag 0 always correlates perfectly with itself, so the search must start
    // at 1 even when `sample_rate / 500` truncates to zero.
    let min_lag = ((sample_rate / 500.0) as usize).max(1);
    let max_lag = (sample_rate / 60.0) as usize;
    if max_lag >= samples.len() || min_lag >= max_lag {
        return 0.0;
    }

    let mut best_corr = 0.0_f64;
    for lag in min_lag..=max_lag {
        let head = &samples[..samples.len() - lag];
        let tail = &samples[lag..];
        // One pass gathers the cross term and both segment energies.
        let (cross, head_energy, tail_energy) = head.iter().zip(tail).fold(
            (0.0_f64, 0.0_f64, 0.0_f64),
            |(cross, head_energy, tail_energy), (&a, &b)| {
                let (a, b) = (f64::from(a), f64::from(b));
                (cross + a * b, head_energy + a * a, tail_energy + b * b)
            },
        );
        let norm = (head_energy * tail_energy).sqrt();
        // A silent segment (or NaN input) gives no usable correlation.
        if norm > 0.0 {
            best_corr = best_corr.max(cross / norm);
        }
    }

    let r = best_corr.clamp(0.0, 0.9999);
    if r < 1e-6 {
        return 0.0;
    }
    (10.0 * (r / (1.0 - r)).log10()) as f32
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::f32::consts::PI;

    /// `n` samples of a sine at `freq` Hz with peak amplitude `amp`, sampled
    /// at `sr` Hz.
    fn sine_wave(freq: f32, amp: f32, n: usize, sr: f32) -> Vec<f32> {
        let mut out = Vec::with_capacity(n);
        for i in 0..n {
            let phase = 2.0 * PI * freq * i as f32 / sr;
            out.push(amp * phase.sin());
        }
        out
    }

    /// `n` deterministic pseudo-random samples in `[-amp, amp]`, produced by
    /// a 64-bit LCG whose high and low halves are XOR-folded.
    fn white_noise_like(n: usize, amp: f32) -> Vec<f32> {
        let mut lcg = 12345u64;
        let mut out = Vec::with_capacity(n);
        for _ in 0..n {
            lcg = lcg.wrapping_mul(6364136223846793005).wrapping_add(1);
            let hi = (lcg >> 32) as u32;
            let lo = lcg as u32;
            let unit = (hi ^ lo) as f32 / u32::MAX as f32;
            out.push((unit * 2.0 - 1.0) * amp);
        }
        out
    }

    #[test]
    fn test_estimate_vocal_effort_silence() {
        let silence = [0.0_f32; 4096];
        let result = estimate_vocal_effort(&silence, 44100.0);
        assert!(result.rms_energy < 1e-6);
    }

    #[test]
    fn test_estimate_vocal_effort_shout_high_energy() {
        let loud_tone = sine_wave(200.0, 0.9, 8192, 44100.0);
        let result = estimate_vocal_effort(&loud_tone, 44100.0);
        assert!(
            result.rms_energy > 0.5,
            "RMS should be high: {}",
            result.rms_energy
        );
        assert!(
            matches!(result.effort, VocalEffort::Shout | VocalEffort::Normal),
            "High energy should classify as shout or normal"
        );
    }

    #[test]
    fn test_estimate_vocal_effort_whisper_noise() {
        let quiet_noise = white_noise_like(8192, 0.01);
        let result = estimate_vocal_effort(&quiet_noise, 44100.0);
        assert!(
            result.rms_energy < 0.05,
            "RMS should be low: {}",
            result.rms_energy
        );
        assert!(
            result.zcr > 0.0,
            "ZCR should be positive for noise: {}",
            result.zcr
        );
    }

    #[test]
    fn test_vocal_effort_result_fields() {
        let tone = sine_wave(440.0, 0.3, 4096, 44100.0);
        let result = estimate_vocal_effort(&tone, 44100.0);
        assert!((0.0..=1.0).contains(&result.confidence));
        assert!(result.rms_energy >= 0.0);
        assert!((0.0..=1.0).contains(&result.zcr));
    }

    #[test]
    fn test_vocal_effort_empty() {
        let result = estimate_vocal_effort(&[], 44100.0);
        assert_eq!(result.effort, VocalEffort::Normal);
        assert_eq!(result.confidence, 0.0);
    }

    #[test]
    fn test_display() {
        let expected = [
            (VocalEffort::Whisper, "whisper"),
            (VocalEffort::Normal, "normal"),
            (VocalEffort::Shout, "shout"),
        ];
        for (effort, label) in expected {
            assert_eq!(effort.to_string(), label);
        }
    }
}