// stratum_dsp/preprocessing/silence.rs

//! Silence detection and trimming utilities
//!
//! Detects silent regions in audio and trims leading/trailing silence.
//!
//! Algorithm:
//! 1. Frame audio into overlapping chunks
//! 2. Compute RMS energy per frame
//! 3. Mark frames at or below the threshold as silent
//! 4. Group consecutive silent frames into regions, ignoring interior runs
//!    shorter than the minimum duration (500 ms by default)
//! 5. Trim leading/trailing silence
//!
//! # Example
//!
//! ```no_run
//! use stratum_dsp::preprocessing::silence::{detect_and_trim, SilenceDetector};
//!
//! let samples = vec![0.0f32; 44100 * 5]; // 5 seconds
//! let detector = SilenceDetector::default();
//! let (trimmed, silence_map) = detect_and_trim(&samples, 44100, detector)?;
//! # Ok::<(), stratum_dsp::AnalysisError>(())
//! ```
22
23use crate::error::AnalysisError;
24
/// Silence detection configuration.
///
/// All fields are public, so a configuration can be built with struct-update
/// syntax on top of [`Default`], e.g.
/// `SilenceDetector { threshold_db: -60.0, ..Default::default() }`.
#[derive(Debug, Clone, PartialEq)]
pub struct SilenceDetector {
    /// Threshold in dB (default: -40.0).
    ///
    /// The value is converted to a linear amplitude (`10^(dB / 20)`); a frame
    /// whose RMS is at or below that amplitude is considered silent.
    pub threshold_db: f32,

    /// Minimum duration in milliseconds for interior silence (default: 500).
    ///
    /// A run of silent frames that lies strictly inside the audio (it neither
    /// starts at the first frame nor reaches the end) is reported only when it
    /// lasts at least this long; shorter interior runs are ignored.
    pub min_duration_ms: u32,

    /// Frame size for analysis, in samples (default: 2048).
    ///
    /// Consecutive frames overlap by half a frame. Must be greater than zero.
    pub frame_size: usize,
}
39
40impl Default for SilenceDetector {
41    fn default() -> Self {
42        Self {
43            threshold_db: -40.0,
44            min_duration_ms: 500,
45            frame_size: 2048,
46        }
47    }
48}
49
/// Silence region information.
///
/// Describes a half-open span of samples, `start_sample..end_sample`.
#[derive(Debug, Clone, PartialEq)]
pub struct SilenceRegion {
    /// Start sample index (inclusive)
    pub start_sample: usize,
    /// End sample index (exclusive)
    pub end_sample: usize,
    /// Duration of the region in seconds
    pub duration_seconds: f32,
}
60
61/// Detect and trim silence from audio
62///
63/// This function detects silent regions in audio by analyzing RMS energy
64/// per frame, then trims leading and trailing silence. It also returns
65/// a map of all silence regions found.
66///
67/// # Arguments
68///
69/// * `samples` - Audio samples (mono, normalized to [-1.0, 1.0])
70/// * `sample_rate` - Sample rate in Hz
71/// * `detector` - Silence detection configuration
72///
73/// # Returns
74///
75/// Tuple of:
76/// - `Vec<f32>`: Trimmed samples (leading/trailing silence removed)
77/// - `Vec<(usize, usize)>`: Silence map as (start_sample, end_sample) pairs
78///
79/// # Errors
80///
81/// Returns `AnalysisError` if parameters are invalid or processing fails
82///
83/// # Example
84///
85/// ```no_run
86/// use stratum_dsp::preprocessing::silence::{detect_and_trim, SilenceDetector};
87///
88/// let mut samples = vec![0.0f32; 44100 * 5];
89/// // Add some audio in the middle
90/// for i in 22050..66150 {
91///     samples[i] = 0.5;
92/// }
93///
94/// let detector = SilenceDetector::default();
95/// let (trimmed, silence_map) = detect_and_trim(&samples, 44100, detector)?;
96///
97/// println!("Trimmed from {} to {} samples", samples.len(), trimmed.len());
98/// println!("Found {} silence regions", silence_map.len());
99/// # Ok::<(), stratum_dsp::AnalysisError>(())
100/// ```
101pub fn detect_and_trim(
102    samples: &[f32],
103    sample_rate: u32,
104    detector: SilenceDetector,
105) -> Result<(Vec<f32>, Vec<(usize, usize)>), AnalysisError> {
106    // Validate inputs
107    if samples.is_empty() {
108        return Ok((Vec::new(), Vec::new()));
109    }
110    
111    if sample_rate == 0 {
112        return Err(AnalysisError::InvalidInput("Sample rate must be > 0".to_string()));
113    }
114    
115    if detector.frame_size == 0 {
116        return Err(AnalysisError::InvalidInput("Frame size must be > 0".to_string()));
117    }
118    
119    if detector.frame_size > samples.len() {
120        log::warn!("Frame size ({}) larger than audio length ({}), treating as single frame",
121                   detector.frame_size, samples.len());
122    }
123    
124    log::debug!("Detecting silence: {} samples at {} Hz, threshold={:.1} dB, min_duration={} ms",
125                samples.len(), sample_rate, detector.threshold_db, detector.min_duration_ms);
126    
127    // Convert threshold from dB to linear RMS
128    let threshold_linear = 10.0_f32.powf(detector.threshold_db / 20.0);
129    
130    // Step 1: Frame audio and compute RMS per frame
131    let hop_size = detector.frame_size / 2; // 50% overlap for smoother detection
132    let num_frames = if samples.len() >= detector.frame_size {
133        (samples.len() - detector.frame_size) / hop_size + 1
134    } else {
135        1 // At least one frame
136    };
137    
138    let mut frame_rms = Vec::with_capacity(num_frames);
139    let mut frame_starts = Vec::with_capacity(num_frames);
140    
141    for i in 0..num_frames {
142        let start = i * hop_size;
143        let end = (start + detector.frame_size).min(samples.len());
144        
145        // Compute RMS
146        let sum_sq: f32 = samples[start..end]
147            .iter()
148            .map(|&x| x * x)
149            .sum();
150        
151        let rms = if end > start {
152            (sum_sq / (end - start) as f32).sqrt()
153        } else {
154            0.0
155        };
156        
157        frame_rms.push(rms);
158        frame_starts.push(start);
159    }
160    
161    // Step 2: Mark frames as silent or not
162    let mut frame_is_silent = Vec::with_capacity(num_frames);
163    for &rms in &frame_rms {
164        frame_is_silent.push(rms <= threshold_linear);
165    }
166    
167    // Step 3: Merge consecutive silent frames
168    // Convert min_duration_ms to frames
169    let min_duration_samples = (detector.min_duration_ms as f32 / 1000.0 * sample_rate as f32) as usize;
170    let min_duration_frames = (min_duration_samples + hop_size - 1) / hop_size; // Round up
171    
172    // Find silence regions
173    let mut silence_regions: Vec<SilenceRegion> = Vec::new();
174    let mut in_silence = false;
175    let mut silence_start_frame = 0;
176    
177    for (frame_idx, &is_silent) in frame_is_silent.iter().enumerate() {
178        if is_silent && !in_silence {
179            // Start of silence
180            in_silence = true;
181            silence_start_frame = frame_idx;
182        } else if !is_silent && in_silence {
183            // End of silence
184            in_silence = false;
185            let silence_end_frame = frame_idx;
186            let silence_duration_frames = silence_end_frame - silence_start_frame;
187            
188            // Only include if duration >= min_duration (or if it's leading/trailing)
189            if silence_duration_frames >= min_duration_frames || 
190               silence_start_frame == 0 || 
191               silence_end_frame == num_frames {
192                let start_sample = frame_starts[silence_start_frame];
193                let end_sample = if silence_end_frame < frame_starts.len() {
194                    frame_starts[silence_end_frame]
195                } else {
196                    samples.len()
197                };
198                
199                silence_regions.push(SilenceRegion {
200                    start_sample,
201                    end_sample,
202                    duration_seconds: (end_sample - start_sample) as f32 / sample_rate as f32,
203                });
204            }
205        }
206    }
207    
208    // Handle trailing silence
209    if in_silence {
210        let silence_duration_frames = num_frames - silence_start_frame;
211        if silence_duration_frames >= min_duration_frames || silence_start_frame == 0 {
212            let start_sample = frame_starts[silence_start_frame];
213            silence_regions.push(SilenceRegion {
214                start_sample,
215                end_sample: samples.len(),
216                duration_seconds: (samples.len() - start_sample) as f32 / sample_rate as f32,
217            });
218        }
219    }
220    
221    // Step 4: Trim leading and trailing silence
222    let trim_start = if let Some(first_region) = silence_regions.first() {
223        if first_region.start_sample == 0 {
224            first_region.end_sample
225        } else {
226            0
227        }
228    } else {
229        0
230    };
231    
232    let trim_end = if let Some(last_region) = silence_regions.last() {
233        if last_region.end_sample == samples.len() {
234            last_region.start_sample
235        } else {
236            samples.len()
237        }
238    } else {
239        samples.len()
240    };
241    
242    // Ensure trim_end > trim_start
243    let trim_start = trim_start.min(trim_end);
244    let trim_end = trim_end.max(trim_start);
245    
246    // Extract trimmed samples
247    let trimmed = if trim_start < trim_end && trim_end <= samples.len() {
248        samples[trim_start..trim_end].to_vec()
249    } else {
250        Vec::new()
251    };
252    
253    // Convert silence regions to (start, end) pairs for return
254    let silence_map: Vec<(usize, usize)> = silence_regions
255        .iter()
256        .map(|r| (r.start_sample, r.end_sample))
257        .collect();
258    
259    log::debug!("Silence detection: trimmed from {} to {} samples, found {} silence regions",
260                samples.len(), trimmed.len(), silence_map.len());
261    
262    Ok((trimmed, silence_map))
263}
264
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample rate shared by all tests.
    const SAMPLE_RATE: u32 = 44100;

    /// Generate test audio: digital silence everywhere except a sine burst in
    /// `audio_start..audio_end` (clamped to `total_samples`).
    fn generate_test_audio_with_silence(
        total_samples: usize,
        audio_start: usize,
        audio_end: usize,
        amplitude: f32,
    ) -> Vec<f32> {
        let mut samples = vec![0.0f32; total_samples];
        let audio_end = audio_end.min(total_samples);
        for (i, sample) in samples
            .iter_mut()
            .enumerate()
            .take(audio_end)
            .skip(audio_start)
        {
            *sample = amplitude * (i as f32 / 1000.0).sin(); // Simple sine wave
        }
        samples
    }

    #[test]
    fn test_detect_and_trim_leading_trailing() {
        // 1 s of silence, 1 s of audio, 1 s of silence
        let total_samples = 44100 * 3;
        let samples = generate_test_audio_with_silence(total_samples, 44100, 44100 * 2, 0.5);

        let detector = SilenceDetector::default();
        let (trimmed, silence_map) = detect_and_trim(&samples, SAMPLE_RATE, detector).unwrap();

        // Should trim leading and trailing silence
        assert!(trimmed.len() < samples.len(), "Should trim some silence");
        assert!(!trimmed.is_empty(), "Should keep audio content");

        // Exactly the leading and the trailing region should be reported
        assert_eq!(silence_map.len(), 2, "Unexpected silence regions: {:?}", silence_map);
        let (lead_start, lead_end) = silence_map[0];
        let (tail_start, tail_end) = silence_map[1];
        assert_eq!(lead_start, 0, "Leading region should start at sample 0");
        assert_eq!(tail_end, samples.len(), "Trailing region should reach the end");

        // The kept audio is exactly what lies between the two regions
        assert_eq!(trimmed.len(), tail_start - lead_end);
    }

    #[test]
    fn test_detect_and_trim_all_silent() {
        // All silent audio
        let samples = vec![0.0f32; 44100];

        let detector = SilenceDetector::default();
        let (trimmed, silence_map) = detect_and_trim(&samples, SAMPLE_RATE, detector).unwrap();

        // Everything is silence: nothing is kept and one region spans the input
        assert!(trimmed.is_empty(), "All silent audio should be trimmed");
        assert_eq!(silence_map, vec![(0, samples.len())]);
    }

    #[test]
    fn test_detect_and_trim_no_silence() {
        // Continuous sine: every analysis frame carries energy
        let samples: Vec<f32> = (0..44100)
            .map(|i| 0.5 * (i as f32 / 1000.0).sin())
            .collect();

        let detector = SilenceDetector {
            threshold_db: -60.0, // Very low threshold
            ..Default::default()
        };
        let (trimmed, silence_map) = detect_and_trim(&samples, SAMPLE_RATE, detector).unwrap();

        assert!(silence_map.is_empty(), "No silence expected: {:?}", silence_map);
        assert_eq!(trimmed.len(), samples.len(),
                   "Should keep all audio when no silence detected");
    }

    #[test]
    fn test_detect_and_trim_invalid_parameters() {
        let samples = vec![0.5f32; 44100];
        let detector = SilenceDetector::default();

        // Zero sample rate
        let result = detect_and_trim(&samples, 0, detector.clone());
        assert!(result.is_err());

        // Zero frame size
        let bad_detector = SilenceDetector {
            frame_size: 0,
            ..detector
        };
        let result = detect_and_trim(&samples, SAMPLE_RATE, bad_detector);
        assert!(result.is_err());
    }

    #[test]
    fn test_detect_and_trim_empty_samples() {
        let samples: Vec<f32> = Vec::new();
        let detector = SilenceDetector::default();
        let (trimmed, silence_map) = detect_and_trim(&samples, SAMPLE_RATE, detector).unwrap();

        assert!(trimmed.is_empty());
        assert!(silence_map.is_empty());
    }

    #[test]
    fn test_detect_and_trim_threshold_sensitivity() {
        // Create audio with varying levels
        let mut samples = vec![0.0f32; 44100 * 2];
        // Quiet section: 0.01 is about -40 dB
        samples[..22050].fill(0.01);
        // Much louder section
        samples[22050..44100].fill(0.5);

        // Low threshold should detect less silence
        let detector_low = SilenceDetector {
            threshold_db: -60.0,
            ..Default::default()
        };
        let (_, silence_map_low) = detect_and_trim(&samples, SAMPLE_RATE, detector_low).unwrap();

        // High threshold should detect more silence
        let detector_high = SilenceDetector {
            threshold_db: -20.0,
            ..Default::default()
        };
        let (_, silence_map_high) = detect_and_trim(&samples, SAMPLE_RATE, detector_high).unwrap();

        // Higher threshold should detect more silence regions (or longer ones)
        let total_silence = |map: &[(usize, usize)]| -> usize {
            map.iter().map(|&(start, end)| end - start).sum()
        };

        assert!(total_silence(&silence_map_high) >= total_silence(&silence_map_low),
                "Higher threshold should detect more silence");
    }

    #[test]
    fn test_detect_and_trim_min_duration() {
        // Loud audio with one short interior gap
        let mut samples = vec![0.5f32; 44100 * 2];
        // 5000 samples is ~113 ms, well below the 500 ms minimum
        samples[10000..15000].fill(0.0);

        let detector = SilenceDetector {
            min_duration_ms: 500, // 500ms minimum
            ..Default::default()
        };
        let (trimmed, silence_map) = detect_and_trim(&samples, SAMPLE_RATE, detector).unwrap();

        // The gap is interior and too short, so it must not be reported,
        // and nothing may be trimmed.
        assert!(silence_map.is_empty(),
                "Short interior silence should be filtered out: {:?}", silence_map);
        assert_eq!(trimmed.len(), samples.len());
    }
}
420