use super::{dsp, AudioBuffer};
use crate::{Result, VoirsError};
use serde::{Deserialize, Serialize};
/// Acoustic features extracted from a voice recording.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VoiceFeatures {
    /// Estimated fundamental frequency (F0) in Hz.
    pub f0: f32,
    /// Estimated formant frequencies in Hz (count set by the extractor).
    pub formants: Vec<f32>,
    /// Mel-frequency cepstral coefficients.
    pub mfcc: Vec<f32>,
    /// Cycle-to-cycle frequency perturbation (jitter).
    pub jitter: f32,
    /// Cycle-to-cycle amplitude perturbation (shimmer).
    pub shimmer: f32,
    /// Harmonics-to-noise ratio.
    pub hnr: f32,
    /// Speech-rate measure; populated from the zero-crossing rate in
    /// `voice_feature_extraction` — presumably a proxy, verify semantics.
    pub speech_rate: f32,
}
/// Summary statistics describing the technical quality of an audio buffer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioQualityMetrics {
    /// Root-mean-square level in dBFS.
    pub rms_db: f32,
    /// Peak sample level in dBFS.
    pub peak_db: f32,
    /// Peak-to-RMS ratio.
    pub crest_factor: f32,
    /// Estimated signal-to-noise ratio.
    pub snr: f32,
    /// True if any samples are clipped.
    pub has_clipping: bool,
    /// Number of clipped samples detected.
    pub clipped_samples: usize,
}
/// Process audio for podcast delivery: high-pass at 80 Hz to remove rumble,
/// normalize to 0.8, then pull the peak back to -6 dBFS of headroom.
///
/// # Errors
/// Propagates any error from the DSP filter, normalization, or gain stages.
pub fn podcast_quality(audio: &AudioBuffer) -> Result<AudioBuffer> {
    // The filter already returns a new buffer, so no upfront clone is needed.
    let mut processed = dsp::highpass_filter(audio, 80.0)?;
    processed.normalize(0.8)?;
    // Keep at least 6 dB of headroom below full scale.
    let peak = processed.peak_db();
    if peak > -6.0 {
        let gain_reduction = -6.0 - peak;
        processed.apply_gain(gain_reduction)?;
    }
    Ok(processed)
}
/// Simulate telephone-quality audio by band-limiting to the classic
/// narrowband voice channel (roughly 300 Hz – 3.4 kHz).
///
/// # Errors
/// Propagates any error from the band-pass filter.
pub fn telephone_quality(audio: &AudioBuffer) -> Result<AudioBuffer> {
    // The filter returns a fresh buffer, so the intermediate clone was dead weight.
    dsp::bandpass_filter(audio, 300.0, 3400.0)
}
/// Extract a bundle of voice features from `audio`.
///
/// Pitch-related measures (F0, jitter, shimmer, HNR) all share the same
/// 80–400 Hz search band; MFCCs use 13 coefficients over 26 mel filters
/// with a 512-sample frame.
///
/// # Errors
/// Currently always returns `Ok`; the `Result` signature leaves room for
/// fallible extraction in the future.
pub fn voice_feature_extraction(audio: &AudioBuffer) -> Result<VoiceFeatures> {
    // Shared F0 search band for all pitch-based measures.
    const F0_MIN_HZ: f32 = 80.0;
    const F0_MAX_HZ: f32 = 400.0;

    Ok(VoiceFeatures {
        f0: audio.detect_pitch_yin(F0_MIN_HZ, F0_MAX_HZ, 0.15),
        formants: audio.estimate_formants(3),
        mfcc: audio.mfcc(13, 26, 512),
        jitter: audio.calculate_jitter(F0_MIN_HZ, F0_MAX_HZ),
        shimmer: audio.calculate_shimmer(F0_MIN_HZ, F0_MAX_HZ),
        hnr: audio.calculate_hnr(F0_MIN_HZ, F0_MAX_HZ),
        // NOTE(review): zero-crossing rate is used as a speech-rate proxy —
        // confirm this is the intended meaning of `speech_rate`.
        speech_rate: audio.zero_crossing_rate(),
    })
}
/// Compute a snapshot of technical quality metrics for `audio`.
///
/// # Errors
/// Currently always returns `Ok`; the `Result` signature mirrors the other
/// analysis entry points in this module.
pub fn analyze_quality(audio: &AudioBuffer) -> Result<AudioQualityMetrics> {
    let rms_db = audio.rms_db();
    let peak_db = audio.peak_db();
    let crest_factor = audio.crest_factor();
    let snr = audio.signal_to_noise_ratio();
    let has_clipping = audio.has_clipping();
    let clipped_samples = audio.count_clipped_samples();

    Ok(AudioQualityMetrics {
        rms_db,
        peak_db,
        crest_factor,
        snr,
        has_clipping,
        clipped_samples,
    })
}
/// Process audio to broadcast loudness targets: high-pass at 50 Hz,
/// normalize to 0.95, and limit the peak to -1 dBFS.
///
/// # Errors
/// Propagates any error from the DSP filter, normalization, or gain stages.
pub fn broadcast_quality(audio: &AudioBuffer) -> Result<AudioBuffer> {
    // The filter already returns a new buffer, so no upfront clone is needed.
    let mut processed = dsp::highpass_filter(audio, 50.0)?;
    processed.normalize(0.95)?;
    // Broadcast true-peak style ceiling: keep 1 dB of headroom.
    let peak = processed.peak_db();
    if peak > -1.0 {
        let gain_reduction = -1.0 - peak;
        processed.apply_gain(gain_reduction)?;
    }
    Ok(processed)
}
/// Prepare audio for low-bitrate encoding: trim inaudible lows below 60 Hz,
/// band-limit the top end, and normalize to full scale.
///
/// The low-pass cutoff is the lesser of 16 kHz and the Nyquist frequency,
/// with a 10% guard band so the filter transition stays below Nyquist.
///
/// # Errors
/// Propagates any error from the filters or normalization.
pub fn low_bitrate_optimize(audio: &AudioBuffer) -> Result<AudioBuffer> {
    // The filter already returns a new buffer, so no upfront clone is needed.
    let mut processed = dsp::highpass_filter(audio, 60.0)?;
    let nyquist = (processed.sample_rate() as f32) / 2.0;
    let cutoff = nyquist.min(16000.0) * 0.9;
    processed = dsp::lowpass_filter(&processed, cutoff)?;
    processed.normalize(1.0)?;
    Ok(processed)
}
/// Remove detected silent regions from `audio`, concatenating the
/// remaining (non-silent) segments into a new buffer.
///
/// `threshold_db` is the level below which a region counts as silence;
/// `min_duration` (seconds) is the shortest region that gets removed.
///
/// # Errors
/// Currently always returns `Ok`; the signature matches the module's other
/// processing entry points.
pub fn remove_silence(
    audio: &AudioBuffer,
    threshold_db: f32,
    min_duration: f32,
) -> Result<AudioBuffer> {
    let silence_regions = audio.detect_silence(threshold_db, min_duration);
    if silence_regions.is_empty() {
        return Ok(audio.clone());
    }
    let sample_rate = audio.sample_rate();
    let total_samples = audio.len();
    // NOTE(review): output is rebuilt with `AudioBuffer::mono`, which assumes
    // a single-channel buffer — confirm callers never pass multichannel audio.
    let all_samples = audio.samples();
    let mut output_samples = Vec::with_capacity(total_samples);
    let mut current_pos = 0usize;
    for (start_time, end_time) in silence_regions {
        // Clamp to the buffer length: float rounding can push a region
        // boundary one sample past the end, which would panic on slicing.
        let start_sample = ((start_time * sample_rate as f32) as usize).min(total_samples);
        let end_sample = ((end_time * sample_rate as f32) as usize).min(total_samples);
        if start_sample > current_pos {
            // Keep the audible span between the previous region and this one.
            output_samples.extend_from_slice(&all_samples[current_pos..start_sample]);
        }
        // Never move backwards if regions overlap or arrive unsorted.
        current_pos = current_pos.max(end_sample);
    }
    // Keep whatever audible tail follows the final silent region.
    if current_pos < total_samples {
        output_samples.extend_from_slice(&all_samples[current_pos..total_samples]);
    }
    Ok(AudioBuffer::mono(output_samples, sample_rate))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One second of a constant 0.5 DC level at 44.1 kHz — a convenient
    /// non-silent, non-clipping fixture shared by the smoke tests.
    fn dc_buffer() -> AudioBuffer {
        AudioBuffer::mono(vec![0.5f32; 44100], 44100)
    }

    #[test]
    fn test_voice_features_extraction() {
        // One second of a 200 Hz sine at half amplitude.
        let sample_rate = 44100;
        let frequency = 200.0;
        let samples: Vec<f32> = (0..44100usize)
            .map(|i| {
                let t = i as f32 / sample_rate as f32;
                0.5 * (2.0 * std::f32::consts::PI * frequency * t).sin()
            })
            .collect();
        let audio = AudioBuffer::mono(samples, sample_rate);

        let features = voice_feature_extraction(&audio).unwrap();

        // Pitch detection should land within 10 Hz of the generated tone.
        assert!((features.f0 - 200.0).abs() < 10.0);
        assert_eq!(features.mfcc.len(), 13);
        assert_eq!(features.formants.len(), 3);
    }

    #[test]
    fn test_quality_analysis() {
        let metrics = analyze_quality(&dc_buffer()).unwrap();
        // A half-scale DC signal sits below 0 dBFS and never clips.
        assert!(metrics.rms_db < 0.0);
        assert!(metrics.peak_db < 0.0);
        assert!(!metrics.has_clipping);
        assert_eq!(metrics.clipped_samples, 0);
    }

    #[test]
    fn test_podcast_quality() {
        assert!(podcast_quality(&dc_buffer()).is_ok());
    }

    #[test]
    fn test_telephone_quality() {
        assert!(telephone_quality(&dc_buffer()).is_ok());
    }

    #[test]
    fn test_broadcast_quality() {
        assert!(broadcast_quality(&dc_buffer()).is_ok());
    }

    #[test]
    fn test_low_bitrate_optimize() {
        assert!(low_bitrate_optimize(&dc_buffer()).is_ok());
    }

    #[test]
    fn test_remove_silence() {
        // Signal (0.5 s) — silence (1 s) — signal (0.5 s): the silent middle
        // exceeds the 0.5 s minimum and should be removed.
        let mut samples = vec![0.5f32; 22050];
        samples.extend(vec![0.0f32; 44100]);
        samples.extend(vec![0.5f32; 22050]);
        let audio = AudioBuffer::mono(samples, 44100);

        let trimmed = remove_silence(&audio, -50.0, 0.5).unwrap();
        assert!(trimmed.len() < audio.len());
    }
}