voirs_sdk/audio/utilities.rs
1//! Audio buffer utility methods for common operations.
2//!
3//! This module provides convenient utility methods for AudioBuffer manipulation,
4//! including concatenation, mixing, normalization, and effects.
5
6use super::AudioBuffer;
7use crate::error::Result;
8use crate::VoirsError;
9
10/// Audio buffer utilities
11impl AudioBuffer {
12 /// Concatenate multiple audio buffers
13 ///
14 /// Buffers must have the same sample rate and channel configuration.
15 ///
16 /// # Examples
17 ///
18 /// ```no_run
19 /// use voirs_sdk::audio::AudioBuffer;
20 ///
21 /// let buf1 = AudioBuffer::mono(vec![1.0, 2.0, 3.0], 22050);
22 /// let buf2 = AudioBuffer::mono(vec![4.0, 5.0, 6.0], 22050);
23 /// let buf3 = AudioBuffer::mono(vec![7.0, 8.0, 9.0], 22050);
24 ///
25 /// let concatenated = AudioBuffer::concatenate(&[buf1, buf2, buf3]).unwrap();
26 /// assert_eq!(concatenated.len(), 9);
27 /// ```
28 pub fn concatenate(buffers: &[AudioBuffer]) -> Result<Self> {
29 if buffers.is_empty() {
30 return Err(VoirsError::AudioError {
31 buffer_info: None,
32 message: "Cannot concatenate empty buffer list".to_string(),
33 });
34 }
35
36 let first = &buffers[0];
37 let sample_rate = first.sample_rate;
38 let channels = first.channels;
39
40 // Validate all buffers have same format
41 for buffer in buffers.iter().skip(1) {
42 if buffer.sample_rate != sample_rate {
43 return Err(VoirsError::AudioError {
44 buffer_info: None,
45 message: format!(
46 "Sample rate mismatch: {} != {}",
47 buffer.sample_rate, sample_rate
48 ),
49 });
50 }
51 if buffer.channels != channels {
52 return Err(VoirsError::AudioError {
53 buffer_info: None,
54 message: format!(
55 "Channel count mismatch: {} != {}",
56 buffer.channels, channels
57 ),
58 });
59 }
60 }
61
62 // Concatenate samples
63 let total_samples: usize = buffers.iter().map(|b| b.samples.len()).sum();
64 let mut samples = Vec::with_capacity(total_samples);
65
66 for buffer in buffers {
67 samples.extend_from_slice(&buffer.samples);
68 }
69
70 Ok(Self::new(samples, sample_rate, channels))
71 }
72
73 /// Pad buffer with silence
74 ///
75 /// Adds silence before and/or after the audio.
76 ///
77 /// # Arguments
78 ///
79 /// * `before_seconds` - Silence duration before audio
80 /// * `after_seconds` - Silence duration after audio
81 ///
82 /// # Examples
83 ///
84 /// ```no_run
85 /// use voirs_sdk::audio::AudioBuffer;
86 ///
87 /// let mut buffer = AudioBuffer::mono(vec![1.0; 100], 22050);
88 /// buffer.pad(0.1, 0.2); // Add 0.1s before, 0.2s after
89 /// ```
90 pub fn pad(&mut self, before_seconds: f32, after_seconds: f32) {
91 let before_samples =
92 (before_seconds * self.sample_rate as f32 * self.channels as f32) as usize;
93 let after_samples =
94 (after_seconds * self.sample_rate as f32 * self.channels as f32) as usize;
95
96 let mut new_samples =
97 Vec::with_capacity(before_samples + self.samples.len() + after_samples);
98 new_samples.resize(before_samples, 0.0);
99 new_samples.extend_from_slice(&self.samples);
100 new_samples.resize(new_samples.len() + after_samples, 0.0);
101
102 self.samples = new_samples;
103 self.update_metadata();
104 }
105
106 /// Check if buffer contains clipping (samples outside [-1.0, 1.0])
107 ///
108 /// # Examples
109 ///
110 /// ```no_run
111 /// use voirs_sdk::audio::AudioBuffer;
112 ///
113 /// let buffer = AudioBuffer::mono(vec![0.5, 1.5, 0.3], 22050);
114 /// assert!(buffer.has_clipping());
115 /// ```
116 pub fn has_clipping(&self) -> bool {
117 self.samples.iter().any(|&s| s.abs() > 1.0)
118 }
119
120 /// Count number of clipped samples
121 ///
122 /// # Examples
123 ///
124 /// ```no_run
125 /// use voirs_sdk::audio::AudioBuffer;
126 ///
127 /// let buffer = AudioBuffer::mono(vec![0.5, 1.5, -1.2, 0.3], 22050);
128 /// assert_eq!(buffer.count_clipped_samples(), 2);
129 /// ```
130 pub fn count_clipped_samples(&self) -> usize {
131 self.samples.iter().filter(|&&s| s.abs() > 1.0).count()
132 }
133
134 /// Get RMS (Root Mean Square) level in dB
135 ///
136 /// Returns -∞ for silence.
137 ///
138 /// # Examples
139 ///
140 /// ```no_run
141 /// use voirs_sdk::audio::AudioBuffer;
142 ///
143 /// let buffer = AudioBuffer::mono(vec![0.5; 1000], 22050);
144 /// let rms_db = buffer.rms_db();
145 /// ```
146 pub fn rms_db(&self) -> f32 {
147 if self.metadata.rms_amplitude > 0.0 {
148 20.0 * self.metadata.rms_amplitude.log10()
149 } else {
150 f32::NEG_INFINITY
151 }
152 }
153
154 /// Get peak level in dB
155 ///
156 /// # Examples
157 ///
158 /// ```no_run
159 /// use voirs_sdk::audio::AudioBuffer;
160 ///
161 /// let buffer = AudioBuffer::mono(vec![0.5; 1000], 22050);
162 /// let peak_db = buffer.peak_db();
163 /// ```
164 pub fn peak_db(&self) -> f32 {
165 if self.metadata.peak_amplitude > 0.0 {
166 20.0 * self.metadata.peak_amplitude.log10()
167 } else {
168 f32::NEG_INFINITY
169 }
170 }
171
172 /// Calculate zero-crossing rate (ZCR)
173 ///
174 /// ZCR is the rate at which the signal changes sign, useful for
175 /// voice activity detection and audio classification.
176 ///
177 /// # Returns
178 ///
179 /// The zero-crossing rate as a fraction of the total samples.
180 ///
181 /// # Examples
182 ///
183 /// ```no_run
184 /// use voirs_sdk::audio::AudioBuffer;
185 ///
186 /// let buffer = AudioBuffer::mono(vec![0.5, -0.3, 0.2, -0.1, 0.4], 22050);
187 /// let zcr = buffer.zero_crossing_rate();
188 /// println!("Zero-crossing rate: {:.4}", zcr);
189 /// ```
190 pub fn zero_crossing_rate(&self) -> f32 {
191 if self.samples.len() < 2 {
192 return 0.0;
193 }
194
195 let mut crossings = 0;
196 for i in 1..self.samples.len() {
197 if (self.samples[i] >= 0.0 && self.samples[i - 1] < 0.0)
198 || (self.samples[i] < 0.0 && self.samples[i - 1] >= 0.0)
199 {
200 crossings += 1;
201 }
202 }
203
204 crossings as f32 / (self.samples.len() - 1) as f32
205 }
206
207 /// Calculate spectral centroid
208 ///
209 /// The spectral centroid is the "center of mass" of the spectrum,
210 /// indicating where the majority of the signal's energy is concentrated.
211 /// Higher values indicate brighter sounds.
212 ///
213 /// # Returns
214 ///
215 /// The spectral centroid in Hz, or 0.0 if calculation fails.
216 ///
217 /// # Examples
218 ///
219 /// ```no_run
220 /// use voirs_sdk::audio::AudioBuffer;
221 ///
222 /// let buffer = AudioBuffer::mono(vec![0.5; 1024], 22050);
223 /// let centroid = buffer.spectral_centroid();
224 /// println!("Spectral centroid: {:.2} Hz", centroid);
225 /// ```
226 pub fn spectral_centroid(&self) -> f32 {
227 // Need at least enough samples for meaningful analysis
228 if self.samples.len() < 64 {
229 return 0.0;
230 }
231
232 // Use next power of 2 for FFT efficiency
233 let fft_size = self.samples.len().next_power_of_two().min(2048);
234 let samples_to_analyze = self.samples.len().min(fft_size);
235
236 // Prepare complex buffer for FFT (as f64 for scirs2-fft)
237 let input: Vec<f64> = self.samples[..samples_to_analyze]
238 .iter()
239 .map(|&s| s as f64)
240 .chain(std::iter::repeat(0.0))
241 .take(fft_size)
242 .collect();
243
244 // Perform FFT
245 let spectrum = match scirs2_fft::fft(&input, Some(fft_size)) {
246 Ok(result) => result,
247 Err(_) => return 0.0,
248 };
249
250 // Calculate magnitude spectrum
251 let magnitudes: Vec<f32> = spectrum
252 .iter()
253 .take(fft_size / 2)
254 .map(|c| c.norm() as f32)
255 .collect();
256
257 // Calculate weighted sum for centroid
258 let mut weighted_sum = 0.0;
259 let mut magnitude_sum = 0.0;
260
261 for (i, &mag) in magnitudes.iter().enumerate() {
262 let freq = i as f32 * self.sample_rate as f32 / fft_size as f32;
263 weighted_sum += freq * mag;
264 magnitude_sum += mag;
265 }
266
267 if magnitude_sum > 0.0 {
268 weighted_sum / magnitude_sum
269 } else {
270 0.0
271 }
272 }
273
274 /// Calculate spectral rolloff
275 ///
276 /// The spectral rolloff is the frequency below which a specified percentage
277 /// (typically 85%) of the total spectral energy is contained.
278 ///
279 /// # Arguments
280 ///
281 /// * `threshold` - The energy threshold (0.0 to 1.0), typically 0.85
282 ///
283 /// # Returns
284 ///
285 /// The rolloff frequency in Hz, or 0.0 if calculation fails.
286 ///
287 /// # Examples
288 ///
289 /// ```no_run
290 /// use voirs_sdk::audio::AudioBuffer;
291 ///
292 /// let buffer = AudioBuffer::mono(vec![0.5; 1024], 22050);
293 /// let rolloff = buffer.spectral_rolloff(0.85);
294 /// println!("Spectral rolloff: {:.2} Hz", rolloff);
295 /// ```
296 pub fn spectral_rolloff(&self, threshold: f32) -> f32 {
297 if self.samples.len() < 64 {
298 return 0.0;
299 }
300
301 let fft_size = self.samples.len().next_power_of_two().min(2048);
302 let samples_to_analyze = self.samples.len().min(fft_size);
303
304 let input: Vec<f64> = self.samples[..samples_to_analyze]
305 .iter()
306 .map(|&s| s as f64)
307 .chain(std::iter::repeat(0.0))
308 .take(fft_size)
309 .collect();
310
311 let spectrum = match scirs2_fft::fft(&input, Some(fft_size)) {
312 Ok(result) => result,
313 Err(_) => return 0.0,
314 };
315
316 let magnitudes: Vec<f32> = spectrum
317 .iter()
318 .take(fft_size / 2)
319 .map(|c| c.norm() as f32)
320 .collect();
321
322 // Calculate total energy
323 let total_energy: f32 = magnitudes.iter().sum();
324 if total_energy == 0.0 {
325 return 0.0;
326 }
327
328 // Find rolloff frequency
329 let target_energy = total_energy * threshold;
330 let mut cumulative_energy = 0.0;
331
332 for (i, &mag) in magnitudes.iter().enumerate() {
333 cumulative_energy += mag;
334 if cumulative_energy >= target_energy {
335 return i as f32 * self.sample_rate as f32 / fft_size as f32;
336 }
337 }
338
339 // If we didn't reach threshold, return nyquist frequency
340 self.sample_rate as f32 / 2.0
341 }
342
343 /// Calculate signal-to-noise ratio (SNR) in dB
344 ///
345 /// Estimates SNR by comparing signal power to noise floor power.
346 /// Uses a simple heuristic: assumes the quietest 10% of frames represent noise.
347 ///
348 /// # Returns
349 ///
350 /// SNR in dB, or 0.0 if calculation fails.
351 ///
352 /// # Examples
353 ///
354 /// ```no_run
355 /// use voirs_sdk::audio::AudioBuffer;
356 ///
357 /// let buffer = AudioBuffer::mono(vec![0.5; 1000], 22050);
358 /// let snr = buffer.signal_to_noise_ratio();
359 /// println!("SNR: {:.2} dB", snr);
360 /// ```
361 pub fn signal_to_noise_ratio(&self) -> f32 {
362 if self.samples.len() < 100 {
363 return 0.0;
364 }
365
366 // Split audio into frames
367 let frame_size = 512;
368 let hop_size = frame_size / 2;
369 let num_frames = (self.samples.len() - frame_size) / hop_size + 1;
370
371 if num_frames < 10 {
372 return 0.0;
373 }
374
375 // Calculate RMS for each frame
376 let mut frame_rms: Vec<f32> = Vec::with_capacity(num_frames);
377
378 for frame_idx in 0..num_frames {
379 let start = frame_idx * hop_size;
380 let end = (start + frame_size).min(self.samples.len());
381
382 let rms: f32 =
383 self.samples[start..end].iter().map(|&s| s * s).sum::<f32>() / (end - start) as f32;
384
385 frame_rms.push(rms.sqrt());
386 }
387
388 // Sort to find noise floor (bottom 10%), treating NaN as max value
389 frame_rms.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
390 let noise_threshold_idx = (frame_rms.len() as f32 * 0.1) as usize;
391 let noise_floor = frame_rms[..noise_threshold_idx.max(1)].iter().sum::<f32>()
392 / noise_threshold_idx.max(1) as f32;
393
394 // Signal power is mean of all frames
395 let signal_power = frame_rms.iter().sum::<f32>() / frame_rms.len() as f32;
396
397 if noise_floor > 0.0 && signal_power > 0.0 {
398 20.0 * (signal_power / noise_floor).log10()
399 } else {
400 0.0
401 }
402 }
403
404 /// Calculate crest factor (peak-to-RMS ratio) in dB
405 ///
406 /// Crest factor indicates the dynamic range of the audio.
407 /// Higher values indicate more dynamic content with prominent peaks.
408 ///
409 /// # Returns
410 ///
411 /// Crest factor in dB.
412 ///
413 /// # Examples
414 ///
415 /// ```no_run
416 /// use voirs_sdk::audio::AudioBuffer;
417 ///
418 /// let buffer = AudioBuffer::mono(vec![0.5; 1000], 22050);
419 /// let crest = buffer.crest_factor();
420 /// println!("Crest factor: {:.2} dB", crest);
421 /// ```
422 pub fn crest_factor(&self) -> f32 {
423 let peak = self.metadata.peak_amplitude;
424 let rms = self.metadata.rms_amplitude;
425
426 if rms > 0.0 {
427 20.0 * (peak / rms).log10()
428 } else {
429 f32::INFINITY
430 }
431 }
432
433 /// Detect silence segments in the audio
434 ///
435 /// Returns a list of (start_time, end_time) tuples in seconds representing
436 /// detected silence segments.
437 ///
438 /// # Arguments
439 ///
440 /// * `threshold_db` - Silence threshold in dB (e.g., -40.0)
441 /// * `min_duration` - Minimum silence duration in seconds
442 ///
443 /// # Returns
444 ///
445 /// Vector of (start, end) time pairs in seconds.
446 ///
447 /// # Examples
448 ///
449 /// ```no_run
450 /// use voirs_sdk::audio::AudioBuffer;
451 ///
452 /// let buffer = AudioBuffer::mono(vec![0.5; 1000], 22050);
453 /// let silences = buffer.detect_silence(-40.0, 0.1);
454 /// for (start, end) in silences {
455 /// println!("Silence: {:.2}s to {:.2}s", start, end);
456 /// }
457 /// ```
458 pub fn detect_silence(&self, threshold_db: f32, min_duration: f32) -> Vec<(f32, f32)> {
459 let threshold_amplitude = 10.0_f32.powf(threshold_db / 20.0);
460 let min_samples = (min_duration * self.sample_rate as f32) as usize;
461
462 let frame_size = 512;
463 let hop_size = frame_size / 2;
464
465 let mut silence_segments = Vec::new();
466 let mut silence_start: Option<usize> = None;
467
468 let mut frame_idx = 0;
469 while frame_idx * hop_size < self.samples.len() {
470 let start = frame_idx * hop_size;
471 let end = (start + frame_size).min(self.samples.len());
472
473 // Calculate frame RMS
474 let rms = (self.samples[start..end].iter().map(|&s| s * s).sum::<f32>()
475 / (end - start) as f32)
476 .sqrt();
477
478 if rms < threshold_amplitude {
479 // Silent frame
480 if silence_start.is_none() {
481 silence_start = Some(start);
482 }
483 } else {
484 // Non-silent frame
485 if let Some(start_sample) = silence_start {
486 let duration_samples = start - start_sample;
487 if duration_samples >= min_samples {
488 let start_time = start_sample as f32 / self.sample_rate as f32;
489 let end_time = start as f32 / self.sample_rate as f32;
490 silence_segments.push((start_time, end_time));
491 }
492 silence_start = None;
493 }
494 }
495
496 frame_idx += 1;
497 }
498
499 // Handle trailing silence
500 if let Some(start_sample) = silence_start {
501 let duration_samples = self.samples.len() - start_sample;
502 if duration_samples >= min_samples {
503 let start_time = start_sample as f32 / self.sample_rate as f32;
504 let end_time = self.samples.len() as f32 / self.sample_rate as f32;
505 silence_segments.push((start_time, end_time));
506 }
507 }
508
509 silence_segments
510 }
511
    /// Calculate Mel-Frequency Cepstral Coefficients (MFCCs)
    ///
    /// MFCCs are widely used in speech and audio processing for feature extraction.
    /// They represent the short-term power spectrum of a sound on a mel scale.
    ///
    /// Pipeline: Hamming window -> FFT -> power spectrum -> mel filterbank ->
    /// log -> DCT-II.
    ///
    /// # Arguments
    ///
    /// * `num_coeffs` - Number of MFCC coefficients to extract (typically 13)
    /// * `num_filters` - Number of mel-scale filters (typically 26-40)
    /// * `fft_size` - FFT size (power of 2, typically 512-2048)
    ///
    /// # Returns
    ///
    /// Vector of MFCC coefficients, or empty vector if calculation fails
    /// (buffer shorter than `fft_size`, `fft_size` not a power of two, or an
    /// FFT backend error).
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use voirs_sdk::audio::AudioBuffer;
    ///
    /// let buffer = AudioBuffer::mono(vec![0.5; 1024], 22050);
    /// let mfccs = buffer.mfcc(13, 26, 512);
    /// println!("MFCC coefficients: {:?}", mfccs);
    /// ```
    pub fn mfcc(&self, num_coeffs: usize, num_filters: usize, fft_size: usize) -> Vec<f32> {
        if self.samples.len() < fft_size || !fft_size.is_power_of_two() {
            return Vec::new();
        }

        // Prepare input with windowing (Hamming window) to reduce spectral leakage.
        let window: Vec<f64> = (0..fft_size)
            .map(|i| {
                0.54 - 0.46 * (2.0 * std::f64::consts::PI * i as f64 / (fft_size - 1) as f64).cos()
            })
            .collect();

        // The length guard above makes `samples_to_analyze == fft_size` here,
        // so the zero-pad chain below is effectively a no-op kept for symmetry
        // with the other spectral helpers in this file.
        let samples_to_analyze = self.samples.len().min(fft_size);
        let input: Vec<f64> = self.samples[..samples_to_analyze]
            .iter()
            .enumerate()
            .map(|(i, &s)| s as f64 * window[i])
            .chain(std::iter::repeat(0.0))
            .take(fft_size)
            .collect();

        // Perform FFT; any backend failure aborts with an empty result.
        let spectrum = match scirs2_fft::fft(&input, Some(fft_size)) {
            Ok(result) => result,
            Err(_) => return Vec::new(),
        };

        // Power spectrum over the non-negative frequencies (fft_size/2 + 1 bins).
        let power_spectrum: Vec<f32> = spectrum
            .iter()
            .take(fft_size / 2 + 1)
            .map(|c| (c.norm() * c.norm()) as f32)
            .collect();

        // Create mel filterbank (triangular filters evenly spaced on the mel scale).
        let mel_filters =
            Self::create_mel_filterbank(num_filters, fft_size, self.sample_rate as f32);

        // Apply mel filterbank: each filter accumulates the power in its band.
        let mut mel_energies = vec![0.0f32; num_filters];
        for (filter_idx, filter) in mel_filters.iter().enumerate() {
            for (bin_idx, &power) in power_spectrum.iter().enumerate() {
                if bin_idx < filter.len() {
                    mel_energies[filter_idx] += power * filter[bin_idx];
                }
            }
        }

        // Apply log and handle zeros (floored so silence never produces -inf).
        let log_mel: Vec<f32> = mel_energies
            .iter()
            .map(|&e| if e > 1e-10 { e.ln() } else { -23.0 }) // ln(1e-10) ≈ -23
            .collect();

        // Apply DCT (Discrete Cosine Transform) Type-II to decorrelate the
        // log-mel energies into cepstral coefficients.
        let mut mfcc_coeffs = vec![0.0f32; num_coeffs];
        // Note: We need the index i for DCT calculation, not for direct indexing
        #[allow(clippy::needless_range_loop)]
        for i in 0..num_coeffs {
            let mut sum = 0.0;
            for (k, &log_e) in log_mel.iter().enumerate() {
                sum += log_e
                    * ((std::f32::consts::PI * i as f32 * (k as f32 + 0.5)) / num_filters as f32)
                        .cos();
            }
            mfcc_coeffs[i] = sum * (2.0 / num_filters as f32).sqrt();
        }

        mfcc_coeffs
    }
606
    /// Create mel-scale filterbank
    ///
    /// Helper function for MFCC calculation. Builds `num_filters` triangular
    /// filters, each of length `fft_size / 2 + 1`, whose edges and centers are
    /// evenly spaced on the mel scale between 0 Hz and Nyquist.
    fn create_mel_filterbank(
        num_filters: usize,
        fft_size: usize,
        sample_rate: f32,
    ) -> Vec<Vec<f32>> {
        // Mel scale conversion functions (mel = 2595 * log10(1 + hz/700)).
        let hz_to_mel = |hz: f32| 2595.0 * (1.0 + hz / 700.0).log10();
        let mel_to_hz = |mel: f32| 700.0 * (10.0_f32.powf(mel / 2595.0) - 1.0);

        let nyquist = sample_rate / 2.0;
        let mel_min = hz_to_mel(0.0);
        let mel_max = hz_to_mel(nyquist);

        // num_filters + 2 mel-spaced frequency points: filter `i` spans three
        // consecutive points (left edge, center/peak, right edge).
        let mel_points: Vec<f32> = (0..=num_filters + 1)
            .map(|i| mel_min + (mel_max - mel_min) * i as f32 / (num_filters + 1) as f32)
            .map(mel_to_hz)
            .collect();

        // Convert each edge frequency to its FFT bin index. With freq <= Nyquist
        // this stays <= fft_size / 2, inside the filter vectors below.
        let bin_points: Vec<usize> = mel_points
            .iter()
            .map(|&freq| ((fft_size + 1) as f32 * freq / sample_rate).floor() as usize)
            .collect();

        // Create triangular filters: ramp up from the left edge to 1.0 at the
        // center bin, then back down to 0.0 at the right edge.
        let mut filterbank = Vec::with_capacity(num_filters);
        for i in 0..num_filters {
            let mut filter = vec![0.0f32; fft_size / 2 + 1];

            let left = bin_points[i];
            let center = bin_points[i + 1];
            let right = bin_points[i + 2];

            // Rising slope (degenerate left == center leaves the bins at 0.0)
            // Note: We need k for arithmetic, not just indexing
            #[allow(clippy::needless_range_loop)]
            for k in left..center {
                if center != left {
                    filter[k] = (k - left) as f32 / (center - left) as f32;
                }
            }

            // Falling slope (degenerate center == right leaves the bins at 0.0)
            // Note: We need k for arithmetic, not just indexing
            #[allow(clippy::needless_range_loop)]
            for k in center..right {
                if right != center {
                    filter[k] = (right - k) as f32 / (right - center) as f32;
                }
            }

            filterbank.push(filter);
        }

        filterbank
    }
667
668 /// Detect pitch using autocorrelation
669 ///
670 /// Uses the autocorrelation method to estimate the fundamental frequency (F0)
671 /// of the audio signal. This is more robust than simple zero-crossing rate.
672 ///
673 /// # Arguments
674 ///
675 /// * `min_freq` - Minimum expected frequency in Hz (e.g., 80 for male voice)
676 /// * `max_freq` - Maximum expected frequency in Hz (e.g., 400 for female voice)
677 ///
678 /// # Returns
679 ///
680 /// Estimated pitch in Hz, or 0.0 if no pitch detected.
681 ///
682 /// # Examples
683 ///
684 /// ```no_run
685 /// use voirs_sdk::audio::AudioBuffer;
686 ///
687 /// let buffer = AudioBuffer::mono(vec![0.5; 1024], 22050);
688 /// let pitch = buffer.detect_pitch_autocorr(80.0, 400.0);
689 /// println!("Detected pitch: {:.2} Hz", pitch);
690 /// ```
691 pub fn detect_pitch_autocorr(&self, min_freq: f32, max_freq: f32) -> f32 {
692 if self.samples.len() < 1024 {
693 return 0.0;
694 }
695
696 let sample_rate = self.sample_rate as f32;
697 let min_lag = (sample_rate / max_freq) as usize;
698 let max_lag = (sample_rate / min_freq) as usize;
699
700 // Use a reasonable window for analysis
701 let window_size = (max_lag * 3).min(self.samples.len());
702 let samples = &self.samples[..window_size];
703
704 // Compute autocorrelation using normalized method
705 let mut max_corr = 0.0;
706 let mut best_lag = 0;
707
708 for lag in min_lag..=max_lag.min(window_size / 2) {
709 let mut sum = 0.0;
710 let mut energy1 = 0.0;
711 let mut energy2 = 0.0;
712
713 for i in 0..(window_size - lag) {
714 let s1 = samples[i];
715 let s2 = samples[i + lag];
716 sum += s1 * s2;
717 energy1 += s1 * s1;
718 energy2 += s2 * s2;
719 }
720
721 // Normalized autocorrelation
722 let corr = if energy1 > 0.0 && energy2 > 0.0 {
723 sum / (energy1 * energy2).sqrt()
724 } else {
725 0.0
726 };
727
728 if corr > max_corr {
729 max_corr = corr;
730 best_lag = lag;
731 }
732 }
733
734 // Confidence threshold for pitch detection
735 if max_corr > 0.5 && best_lag > 0 {
736 sample_rate / best_lag as f32
737 } else {
738 0.0
739 }
740 }
741
742 /// Calculate spectral flux
743 ///
744 /// Spectral flux measures the rate of change in the power spectrum,
745 /// useful for detecting onsets and transients in audio.
746 ///
747 /// # Arguments
748 ///
749 /// * `prev_buffer` - Previous audio buffer for comparison (optional)
750 /// * `fft_size` - FFT size for spectral analysis
751 ///
752 /// # Returns
753 ///
754 /// Spectral flux value, or 0.0 if calculation fails.
755 ///
756 /// # Examples
757 ///
758 /// ```no_run
759 /// use voirs_sdk::audio::AudioBuffer;
760 ///
761 /// let buffer1 = AudioBuffer::mono(vec![0.5; 1024], 22050);
762 /// let buffer2 = AudioBuffer::mono(vec![0.6; 1024], 22050);
763 /// let flux = buffer2.spectral_flux(Some(&buffer1), 512);
764 /// println!("Spectral flux: {:.4}", flux);
765 /// ```
766 pub fn spectral_flux(&self, prev_buffer: Option<&AudioBuffer>, fft_size: usize) -> f32 {
767 if self.samples.len() < fft_size || !fft_size.is_power_of_two() {
768 return 0.0;
769 }
770
771 // Get current spectrum
772 let current_spectrum = self.get_magnitude_spectrum(fft_size);
773
774 // If no previous buffer, return 0
775 let prev_spectrum = match prev_buffer {
776 Some(buf) => buf.get_magnitude_spectrum(fft_size),
777 None => return 0.0,
778 };
779
780 if current_spectrum.is_empty() || prev_spectrum.is_empty() {
781 return 0.0;
782 }
783
784 // Calculate flux as sum of squared differences (positive only)
785 let mut flux = 0.0;
786 let len = current_spectrum.len().min(prev_spectrum.len());
787
788 for i in 0..len {
789 let diff = current_spectrum[i] - prev_spectrum[i];
790 if diff > 0.0 {
791 flux += diff * diff;
792 }
793 }
794
795 flux.sqrt()
796 }
797
798 /// Get magnitude spectrum (helper for spectral flux)
799 fn get_magnitude_spectrum(&self, fft_size: usize) -> Vec<f32> {
800 let samples_to_analyze = self.samples.len().min(fft_size);
801 let input: Vec<f64> = self.samples[..samples_to_analyze]
802 .iter()
803 .map(|&s| s as f64)
804 .chain(std::iter::repeat(0.0))
805 .take(fft_size)
806 .collect();
807
808 match scirs2_fft::fft(&input, Some(fft_size)) {
809 Ok(spectrum) => spectrum
810 .iter()
811 .take(fft_size / 2)
812 .map(|c| c.norm() as f32)
813 .collect(),
814 Err(_) => Vec::new(),
815 }
816 }
817
    /// Estimate formant frequencies
    ///
    /// Formants are resonant frequencies of the vocal tract, crucial for
    /// vowel identification and speaker characteristics. This uses LPC
    /// (Linear Predictive Coding) analysis to estimate the first formants.
    ///
    /// # Arguments
    ///
    /// * `num_formants` - Number of formants to estimate (typically 3-4)
    ///
    /// # Returns
    ///
    /// Vector of estimated formant frequencies in Hz (may hold fewer than
    /// `num_formants` entries if fewer spectral peaks fall in 200-4000 Hz).
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use voirs_sdk::audio::AudioBuffer;
    ///
    /// let buffer = AudioBuffer::mono(vec![0.5; 1024], 22050);
    /// let formants = buffer.estimate_formants(4);
    /// println!("Formants: {:?} Hz", formants);
    /// ```
    pub fn estimate_formants(&self, num_formants: usize) -> Vec<f32> {
        if self.samples.len() < 512 {
            return Vec::new();
        }

        // Use LPC order = 2 + sample_rate/1000 (common rule of thumb)
        let lpc_order = 2 + (self.sample_rate / 1000) as usize;

        // Window the signal (use up to 1024 samples)
        let window_size = self.samples.len().min(1024);
        let samples = &self.samples[..window_size];

        // Apply pre-emphasis filter (high-pass): y[n] = x[n] - 0.97 * x[n-1],
        // which flattens the spectral tilt before LPC analysis.
        let pre_emphasis = 0.97;
        let mut emphasized = vec![samples[0]];
        for i in 1..samples.len() {
            emphasized.push(samples[i] - pre_emphasis * samples[i - 1]);
        }

        // Compute autocorrelation for lags 0..=lpc_order (the lpc_order + 1
        // entries that levinson_durbin expects).
        let mut autocorr = vec![0.0; lpc_order + 1];
        for lag in 0..=lpc_order {
            let mut sum = 0.0;
            for i in 0..(emphasized.len() - lag) {
                sum += emphasized[i] * emphasized[i + lag];
            }
            autocorr[lag] = sum;
        }

        // Solve for LPC coefficients using Levinson-Durbin algorithm
        let lpc_coeffs = self.levinson_durbin(&autocorr, lpc_order);

        // Find roots of LPC polynomial to get formants
        // For simplicity, we use peak-picking in the frequency response
        // of the model instead of true polynomial root finding.
        let fft_size = 1024;
        let num_bins = fft_size / 2;
        let mut freq_response = vec![0.0; num_bins];

        // Note: We need bin index for omega calculation, not just for indexing
        #[allow(clippy::needless_range_loop)]
        for bin in 0..num_bins {
            let omega = 2.0 * std::f32::consts::PI * bin as f32 / fft_size as f32;
            let mut real_part = 1.0;
            let mut imag_part = 0.0;

            // Accumulate the LPC polynomial evaluated on the unit circle;
            // coefficient k corresponds to delay (k + 1).
            for (k, &coeff) in lpc_coeffs.iter().enumerate() {
                let angle = omega * (k + 1) as f32;
                real_part -= coeff * angle.cos();
                imag_part += coeff * angle.sin();
            }

            // Inverse magnitude: resonances appear as peaks in this response.
            freq_response[bin] = 1.0 / (real_part * real_part + imag_part * imag_part).sqrt();
        }

        // Find peaks (strict local maxima) in the frequency response.
        let mut formants = Vec::new();
        for i in 1..(num_bins - 1) {
            if freq_response[i] > freq_response[i - 1] && freq_response[i] > freq_response[i + 1] {
                let freq = i as f32 * self.sample_rate as f32 / fft_size as f32;
                // Typical formant range: 200 Hz to 4000 Hz
                if freq >= 200.0 && freq <= 4000.0 {
                    formants.push(freq);
                    if formants.len() >= num_formants {
                        break;
                    }
                }
            }
        }

        formants
    }
912
913 /// Levinson-Durbin algorithm for LPC coefficient estimation
914 ///
915 /// Solves the Yule-Walker equations efficiently.
916 fn levinson_durbin(&self, autocorr: &[f32], order: usize) -> Vec<f32> {
917 let mut lpc = vec![0.0; order];
918 let mut error = autocorr[0];
919
920 for i in 0..order {
921 let mut lambda = 0.0;
922 for j in 0..i {
923 lambda -= lpc[j] * autocorr[i - j];
924 }
925 lambda -= autocorr[i + 1];
926 lambda /= error;
927
928 // Update LPC coefficients
929 let mut new_lpc = vec![0.0; order];
930 new_lpc[i] = lambda;
931 for j in 0..i {
932 new_lpc[j] = lpc[j] + lambda * lpc[i - 1 - j];
933 }
934 lpc = new_lpc;
935
936 error *= 1.0 - lambda * lambda;
937 }
938
939 lpc
940 }
941
942 /// Calculate Jitter (pitch period irregularity)
943 ///
944 /// Jitter measures the cycle-to-cycle variation in fundamental frequency (F0),
945 /// expressed as a percentage. It's a crucial indicator of voice quality and
946 /// pathology. Higher jitter indicates more irregular vocal fold vibration.
947 ///
948 /// This implements the Jitter (local) metric, which is the average absolute
949 /// difference between consecutive periods, divided by the average period.
950 ///
951 /// # Arguments
952 ///
953 /// * `min_freq` - Minimum expected pitch (e.g., 75 Hz for male voice)
954 /// * `max_freq` - Maximum expected pitch (e.g., 500 Hz for female voice)
955 ///
956 /// # Returns
957 ///
958 /// Jitter percentage (0-100), or 0.0 if calculation fails. Typical values:
959 /// - < 1.0%: Normal voice quality
960 /// - 1.0-2.0%: Mild irregularity
961 /// - > 2.0%: Potential voice pathology
962 ///
963 /// # Examples
964 ///
965 /// ```no_run
966 /// use voirs_sdk::audio::AudioBuffer;
967 ///
968 /// let buffer = AudioBuffer::mono(vec![0.5; 4096], 22050);
969 /// let jitter = buffer.calculate_jitter(75.0, 500.0);
970 /// println!("Jitter: {:.2}%", jitter);
971 /// ```
972 pub fn calculate_jitter(&self, min_freq: f32, max_freq: f32) -> f32 {
973 if self.samples.len() < 2048 {
974 return 0.0;
975 }
976
977 let sample_rate = self.sample_rate as f32;
978 let min_period = (sample_rate / max_freq) as usize;
979 let max_period = (sample_rate / min_freq) as usize;
980
981 // Find pitch periods using autocorrelation
982 let window_size = (max_period * 4).min(self.samples.len());
983 let mut periods = Vec::new();
984
985 // Slide through the signal to find multiple periods
986 let hop_size = max_period / 2;
987 for start_idx in (0..self.samples.len() - window_size).step_by(hop_size) {
988 let window = &self.samples[start_idx..start_idx + window_size];
989
990 // Find period using autocorrelation for this window
991 let mut max_corr = 0.0;
992 let mut best_period = 0;
993
994 for period in min_period..=max_period.min(window_size / 2) {
995 let mut sum = 0.0;
996 let mut energy1 = 0.0;
997 let mut energy2 = 0.0;
998
999 for i in 0..(window_size - period) {
1000 let s1 = window[i];
1001 let s2 = window[i + period];
1002 sum += s1 * s2;
1003 energy1 += s1 * s1;
1004 energy2 += s2 * s2;
1005 }
1006
1007 let corr = if energy1 > 0.0 && energy2 > 0.0 {
1008 sum / (energy1 * energy2).sqrt()
1009 } else {
1010 0.0
1011 };
1012
1013 if corr > max_corr {
1014 max_corr = corr;
1015 best_period = period;
1016 }
1017 }
1018
1019 // Only include periods with high confidence
1020 if max_corr > 0.6 && best_period > 0 {
1021 periods.push(best_period as f32);
1022 }
1023 }
1024
1025 // Need at least 3 periods to calculate jitter
1026 if periods.len() < 3 {
1027 return 0.0;
1028 }
1029
1030 // Calculate jitter (local): mean absolute difference between consecutive periods
1031 let mut sum_diff = 0.0;
1032 for i in 1..periods.len() {
1033 sum_diff += (periods[i] - periods[i - 1]).abs();
1034 }
1035
1036 let mean_diff = sum_diff / (periods.len() - 1) as f32;
1037 let mean_period: f32 = periods.iter().sum::<f32>() / periods.len() as f32;
1038
1039 if mean_period > 0.0 {
1040 (mean_diff / mean_period) * 100.0 // Convert to percentage
1041 } else {
1042 0.0
1043 }
1044 }
1045
1046 /// Calculate Shimmer (amplitude variation)
1047 ///
1048 /// Shimmer measures the cycle-to-cycle variation in amplitude, expressed
1049 /// as a percentage. It's an important indicator of voice quality and vocal
1050 /// fold irregularities. Higher shimmer indicates unstable voice production.
1051 ///
1052 /// This implements the Shimmer (local) metric, which is the average absolute
1053 /// difference between consecutive peak amplitudes, divided by the average amplitude.
1054 ///
1055 /// # Arguments
1056 ///
1057 /// * `min_freq` - Minimum expected pitch for period detection
1058 /// * `max_freq` - Maximum expected pitch for period detection
1059 ///
1060 /// # Returns
1061 ///
1062 /// Shimmer percentage (0-100), or 0.0 if calculation fails. Typical values:
1063 /// - < 3.0%: Normal voice quality
1064 /// - 3.0-6.0%: Mild amplitude variation
1065 /// - > 6.0%: Potential voice pathology
1066 ///
1067 /// # Examples
1068 ///
1069 /// ```no_run
1070 /// use voirs_sdk::audio::AudioBuffer;
1071 ///
1072 /// let buffer = AudioBuffer::mono(vec![0.5; 4096], 22050);
1073 /// let shimmer = buffer.calculate_shimmer(75.0, 500.0);
1074 /// println!("Shimmer: {:.2}%", shimmer);
1075 /// ```
1076 pub fn calculate_shimmer(&self, min_freq: f32, max_freq: f32) -> f32 {
1077 if self.samples.len() < 2048 {
1078 return 0.0;
1079 }
1080
1081 let sample_rate = self.sample_rate as f32;
1082 let min_period = (sample_rate / max_freq) as usize;
1083 let max_period = (sample_rate / min_freq) as usize;
1084
1085 // Find peak amplitudes for each period
1086 let window_size = (max_period * 4).min(self.samples.len());
1087 let mut peak_amplitudes = Vec::new();
1088
1089 let hop_size = max_period / 2;
1090 for start_idx in (0..self.samples.len() - window_size).step_by(hop_size) {
1091 let window = &self.samples[start_idx..start_idx + window_size];
1092
1093 // Find period using autocorrelation
1094 let mut max_corr = 0.0;
1095 let mut best_period = 0;
1096
1097 for period in min_period..=max_period.min(window_size / 2) {
1098 let mut sum = 0.0;
1099 let mut energy1 = 0.0;
1100 let mut energy2 = 0.0;
1101
1102 for i in 0..(window_size - period) {
1103 let s1 = window[i];
1104 let s2 = window[i + period];
1105 sum += s1 * s2;
1106 energy1 += s1 * s1;
1107 energy2 += s2 * s2;
1108 }
1109
1110 let corr = if energy1 > 0.0 && energy2 > 0.0 {
1111 sum / (energy1 * energy2).sqrt()
1112 } else {
1113 0.0
1114 };
1115
1116 if corr > max_corr {
1117 max_corr = corr;
1118 best_period = period;
1119 }
1120 }
1121
1122 // Find peak amplitude in this period
1123 if max_corr > 0.6 && best_period > 0 {
1124 let period_samples = &window[..best_period.min(window.len())];
1125 let peak = period_samples
1126 .iter()
1127 .map(|&s| s.abs())
1128 .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
1129 .unwrap_or(0.0);
1130 peak_amplitudes.push(peak);
1131 }
1132 }
1133
1134 // Need at least 3 peaks to calculate shimmer
1135 if peak_amplitudes.len() < 3 {
1136 return 0.0;
1137 }
1138
1139 // Calculate shimmer: mean absolute difference between consecutive peaks
1140 let mut sum_diff = 0.0;
1141 for i in 1..peak_amplitudes.len() {
1142 sum_diff += (peak_amplitudes[i] - peak_amplitudes[i - 1]).abs();
1143 }
1144
1145 let mean_diff = sum_diff / (peak_amplitudes.len() - 1) as f32;
1146 let mean_amplitude: f32 =
1147 peak_amplitudes.iter().sum::<f32>() / peak_amplitudes.len() as f32;
1148
1149 if mean_amplitude > 0.0 {
1150 (mean_diff / mean_amplitude) * 100.0 // Convert to percentage
1151 } else {
1152 0.0
1153 }
1154 }
1155
1156 /// Calculate Harmonic-to-Noise Ratio (HNR)
1157 ///
1158 /// HNR measures the ratio of harmonic (periodic) to noise (aperiodic) energy
1159 /// in the voice signal. It's a fundamental measure of voice quality.
1160 /// Higher HNR indicates clearer, more periodic voice production.
1161 ///
1162 /// This uses autocorrelation-based method to separate harmonic and noise components.
1163 ///
1164 /// # Arguments
1165 ///
1166 /// * `min_freq` - Minimum expected pitch
1167 /// * `max_freq` - Maximum expected pitch
1168 ///
1169 /// # Returns
1170 ///
1171 /// HNR in decibels (dB). Typical values:
1172 /// - > 20 dB: Excellent voice quality
1173 /// - 10-20 dB: Good voice quality
1174 /// - 5-10 dB: Fair voice quality
1175 /// - < 5 dB: Poor voice quality or pathology
1176 ///
1177 /// # Examples
1178 ///
1179 /// ```no_run
1180 /// use voirs_sdk::audio::AudioBuffer;
1181 ///
1182 /// let buffer = AudioBuffer::mono(vec![0.5; 4096], 22050);
1183 /// let hnr = buffer.calculate_hnr(75.0, 500.0);
1184 /// println!("HNR: {:.2} dB", hnr);
1185 /// ```
1186 pub fn calculate_hnr(&self, min_freq: f32, max_freq: f32) -> f32 {
1187 if self.samples.len() < 2048 {
1188 return 0.0;
1189 }
1190
1191 let sample_rate = self.sample_rate as f32;
1192 let min_period = (sample_rate / max_freq) as usize;
1193 let max_period = (sample_rate / min_freq) as usize;
1194
1195 // Use autocorrelation to find the fundamental period
1196 let window_size = (max_period * 3).min(self.samples.len());
1197 let samples = &self.samples[..window_size];
1198
1199 let mut max_corr = 0.0;
1200
1201 for period in min_period..=max_period.min(window_size / 2) {
1202 let mut sum = 0.0;
1203 let mut energy1 = 0.0;
1204 let mut energy2 = 0.0;
1205
1206 for i in 0..(window_size - period) {
1207 let s1 = samples[i];
1208 let s2 = samples[i + period];
1209 sum += s1 * s2;
1210 energy1 += s1 * s1;
1211 energy2 += s2 * s2;
1212 }
1213
1214 let corr = if energy1 > 0.0 && energy2 > 0.0 {
1215 sum / (energy1 * energy2).sqrt()
1216 } else {
1217 0.0
1218 };
1219
1220 if corr > max_corr {
1221 max_corr = corr;
1222 // Note: We only need max_corr for HNR calculation, not the period itself
1223 }
1224 }
1225
1226 // If no strong periodicity found, voice is mostly noise
1227 if max_corr < 0.3 {
1228 return 0.0;
1229 }
1230
1231 // Calculate HNR using the autocorrelation peak
1232 // HNR = 10 * log10(max_corr / (1 - max_corr))
1233 // This formula relates autocorrelation to harmonic/noise ratio
1234 if max_corr >= 0.99 {
1235 // Avoid division by very small numbers
1236 return 30.0; // Very high HNR
1237 }
1238
1239 let hnr_linear = max_corr / (1.0 - max_corr);
1240 10.0 * hnr_linear.log10()
1241 }
1242
1243 /// Calculate Delta MFCCs (first-order temporal derivatives)
1244 ///
1245 /// Delta coefficients represent the rate of change of MFCCs over time,
1246 /// capturing dynamic spectral information. These are essential for
1247 /// improving speech recognition accuracy.
1248 ///
1249 /// # Arguments
1250 ///
1251 /// * `mfcc_frames` - Vector of MFCC coefficient vectors from consecutive frames
1252 /// * `delta_window` - Number of frames to use for delta calculation (typically 2)
1253 ///
1254 /// # Returns
1255 ///
1256 /// Vector of delta MFCC vectors, one per input frame.
1257 ///
1258 /// # Examples
1259 ///
1260 /// ```no_run
1261 /// use voirs_sdk::audio::AudioBuffer;
1262 ///
1263 /// let buffer = AudioBuffer::mono(vec![0.5; 8192], 22050);
1264 /// // Extract MFCCs from multiple frames...
1265 /// let mfcc_frames = vec![
1266 /// buffer.mfcc(13, 26, 512),
1267 /// buffer.mfcc(13, 26, 512),
1268 /// buffer.mfcc(13, 26, 512),
1269 /// ];
1270 /// let delta_mfccs = AudioBuffer::calculate_delta_mfcc(&mfcc_frames, 2);
1271 /// ```
1272 pub fn calculate_delta_mfcc(mfcc_frames: &[Vec<f32>], delta_window: usize) -> Vec<Vec<f32>> {
1273 if mfcc_frames.is_empty() {
1274 return Vec::new();
1275 }
1276
1277 let num_frames = mfcc_frames.len();
1278 let num_coeffs = mfcc_frames[0].len();
1279 let mut delta_mfccs = Vec::with_capacity(num_frames);
1280
1281 for frame_idx in 0..num_frames {
1282 let mut delta_coeffs = vec![0.0; num_coeffs];
1283
1284 // Calculate delta using regression formula
1285 // delta[t] = (sum(n * mfcc[t+n]) - sum(n * mfcc[t-n])) / (2 * sum(n²))
1286 let mut numerator = vec![0.0; num_coeffs];
1287 let mut denominator = 0.0;
1288
1289 for n in 1..=delta_window {
1290 let n_f32 = n as f32;
1291
1292 // Future frame (t+n)
1293 let future_idx = (frame_idx + n).min(num_frames - 1);
1294 // Past frame (t-n)
1295 let past_idx = frame_idx.saturating_sub(n);
1296
1297 // Note: We need coeff_idx for indexing both numerator and mfcc_frames
1298 #[allow(clippy::needless_range_loop)]
1299 for coeff_idx in 0..num_coeffs {
1300 numerator[coeff_idx] += n_f32
1301 * (mfcc_frames[future_idx][coeff_idx] - mfcc_frames[past_idx][coeff_idx]);
1302 }
1303
1304 denominator += n_f32 * n_f32;
1305 }
1306
1307 // Normalize by denominator
1308 if denominator > 0.0 {
1309 // Note: We need coeff_idx for indexing both delta_coeffs and numerator
1310 #[allow(clippy::needless_range_loop)]
1311 for coeff_idx in 0..num_coeffs {
1312 delta_coeffs[coeff_idx] = numerator[coeff_idx] / (2.0 * denominator);
1313 }
1314 }
1315
1316 delta_mfccs.push(delta_coeffs);
1317 }
1318
1319 delta_mfccs
1320 }
1321
    /// Calculate Delta-Delta MFCCs (second-order temporal derivatives)
    ///
    /// Delta-Delta (acceleration) coefficients represent the rate of change of
    /// Delta coefficients, capturing the dynamics of spectral dynamics.
    /// Combined with MFCCs and Deltas, they form a powerful feature set for ASR.
    ///
    /// # Arguments
    ///
    /// * `delta_mfccs` - Vector of delta MFCC coefficient vectors
    ///   (typically the output of [`AudioBuffer::calculate_delta_mfcc`])
    /// * `delta_window` - Number of frames to use (typically 2)
    ///
    /// # Returns
    ///
    /// Vector of delta-delta MFCC vectors, one per input frame
    /// (empty if `delta_mfccs` is empty).
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use voirs_sdk::audio::AudioBuffer;
    ///
    /// let buffer = AudioBuffer::mono(vec![0.5; 8192], 22050);
    /// let mfcc_frames = vec![
    ///     buffer.mfcc(13, 26, 512),
    ///     buffer.mfcc(13, 26, 512),
    ///     buffer.mfcc(13, 26, 512),
    /// ];
    /// let delta_mfccs = AudioBuffer::calculate_delta_mfcc(&mfcc_frames, 2);
    /// let delta_delta_mfccs = AudioBuffer::calculate_delta_delta_mfcc(&delta_mfccs, 2);
    /// ```
    pub fn calculate_delta_delta_mfcc(
        delta_mfccs: &[Vec<f32>],
        delta_window: usize,
    ) -> Vec<Vec<f32>> {
        // Delta-delta is just the delta of delta coefficients: the same
        // regression formula applied a second time, so delegate directly.
        Self::calculate_delta_mfcc(delta_mfccs, delta_window)
    }
1358
1359 /// Calculate Chroma features (pitch class representation)
1360 ///
1361 /// Chroma features, also known as pitch class profiles or chromagrams, represent
1362 /// the intensity of the 12 pitch classes (C, C#, D, ..., B) regardless of octave.
1363 /// This is particularly useful for music information retrieval, chord recognition,
1364 /// and key detection.
1365 ///
1366 /// The algorithm maps each frequency bin to one of 12 pitch classes using:
1367 /// pitch_class = 12 * log2(freq / ref_freq) mod 12
1368 ///
1369 /// # Arguments
1370 ///
1371 /// * `fft_size` - FFT size (must be power of 2, typically 2048 or 4096 for music)
1372 /// * `ref_freq` - Reference frequency for A4 (default: 440.0 Hz)
1373 ///
1374 /// # Returns
1375 ///
1376 /// 12-element vector representing energy in each pitch class (C=0, C#=1, ..., B=11).
1377 /// Returns empty vector if insufficient samples.
1378 ///
1379 /// # Applications
1380 ///
1381 /// - Music information retrieval
1382 /// - Chord recognition and key detection
1383 /// - Cover song identification
1384 /// - Music similarity analysis
1385 /// - Tonal music analysis
1386 ///
1387 /// # Examples
1388 ///
1389 /// ```no_run
1390 /// use voirs_sdk::audio::AudioBuffer;
1391 ///
1392 /// let buffer = AudioBuffer::mono(vec![0.5; 8192], 22050);
1393 /// let chroma = buffer.chroma_features(2048, 440.0);
1394 /// assert_eq!(chroma.len(), 12);
1395 /// ```
1396 pub fn chroma_features(&self, fft_size: usize, ref_freq: f32) -> Vec<f32> {
1397 if self.samples.len() < fft_size || !fft_size.is_power_of_two() {
1398 return Vec::new();
1399 }
1400
1401 // Get magnitude spectrum
1402 let magnitudes = self.get_magnitude_spectrum(fft_size);
1403
1404 // Initialize 12 pitch class bins
1405 let mut chroma = vec![0.0_f32; 12];
1406
1407 let sample_rate = self.sample_rate as f32;
1408 let freq_resolution = sample_rate / fft_size as f32;
1409
1410 // Map each frequency bin to a pitch class
1411 for (bin, &magnitude) in magnitudes.iter().enumerate().skip(1) {
1412 let freq = bin as f32 * freq_resolution;
1413
1414 // Skip very low frequencies (below 20 Hz)
1415 if freq < 20.0 {
1416 continue;
1417 }
1418
1419 // Calculate pitch class: 12 * log2(freq / ref_freq) mod 12
1420 let pitch_class_float = 12.0 * (freq / ref_freq).log2();
1421 let pitch_class = pitch_class_float.rem_euclid(12.0) as usize % 12;
1422
1423 // Accumulate magnitude in corresponding pitch class
1424 chroma[pitch_class] += magnitude;
1425 }
1426
1427 // Normalize chroma vector
1428 let max_chroma = chroma.iter().cloned().fold(0.0_f32, f32::max);
1429 if max_chroma > 0.0 {
1430 for c in &mut chroma {
1431 *c /= max_chroma;
1432 }
1433 }
1434
1435 chroma
1436 }
1437
1438 /// Calculate Spectral Contrast
1439 ///
1440 /// Spectral contrast measures the difference between peaks and valleys in the
1441 /// spectrum across multiple frequency bands. This provides a robust timbre
1442 /// representation, particularly useful for music genre classification and
1443 /// audio texture analysis.
1444 ///
1445 /// The spectrum is divided into sub-bands, and for each band:
1446 /// - Peak: Mean of top 20% of magnitudes
1447 /// - Valley: Mean of bottom 20% of magnitudes
1448 /// - Contrast: Peak - Valley (in dB)
1449 ///
1450 /// # Arguments
1451 ///
1452 /// * `fft_size` - FFT size (must be power of 2, typically 2048)
1453 /// * `num_bands` - Number of frequency bands (typically 6-8)
1454 ///
1455 /// # Returns
1456 ///
1457 /// Vector of contrast values (in dB) for each frequency band.
1458 /// Returns empty vector if insufficient samples.
1459 ///
1460 /// # Applications
1461 ///
1462 /// - Music genre classification
1463 /// - Audio texture analysis
1464 /// - Instrument recognition
1465 /// - Timbre characterization
1466 /// - Sound quality assessment
1467 ///
1468 /// # Examples
1469 ///
1470 /// ```no_run
1471 /// use voirs_sdk::audio::AudioBuffer;
1472 ///
1473 /// let buffer = AudioBuffer::mono(vec![0.5; 8192], 22050);
1474 /// let contrast = buffer.spectral_contrast(2048, 6);
1475 /// assert_eq!(contrast.len(), 6);
1476 /// ```
1477 pub fn spectral_contrast(&self, fft_size: usize, num_bands: usize) -> Vec<f32> {
1478 if self.samples.len() < fft_size || !fft_size.is_power_of_two() || num_bands == 0 {
1479 return Vec::new();
1480 }
1481
1482 // Get magnitude spectrum
1483 let magnitudes = self.get_magnitude_spectrum(fft_size);
1484
1485 // Divide spectrum into logarithmically-spaced frequency bands
1486 let mut contrasts = Vec::with_capacity(num_bands);
1487
1488 let num_bins = magnitudes.len();
1489 let band_edges = (0..=num_bands)
1490 .map(|i| {
1491 let ratio = i as f32 / num_bands as f32;
1492 // Logarithmic spacing
1493 (ratio * (num_bins as f32).ln()).exp() as usize
1494 })
1495 .collect::<Vec<_>>();
1496
1497 // Calculate contrast for each band
1498 for band_idx in 0..num_bands {
1499 let start = band_edges[band_idx];
1500 let end = band_edges[band_idx + 1].min(num_bins);
1501
1502 if start >= end {
1503 contrasts.push(0.0);
1504 continue;
1505 }
1506
1507 // Extract band magnitudes
1508 let mut band_mags: Vec<f32> = magnitudes[start..end].to_vec();
1509 band_mags.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1510
1511 let band_len = band_mags.len();
1512 if band_len < 5 {
1513 contrasts.push(0.0);
1514 continue;
1515 }
1516
1517 // Calculate peak (top 20%) and valley (bottom 20%)
1518 let percentile_idx = (band_len as f32 * 0.2) as usize;
1519
1520 // Valley: mean of bottom 20%
1521 let valley_sum: f32 = band_mags[..percentile_idx].iter().sum();
1522 let valley = valley_sum / percentile_idx as f32;
1523
1524 // Peak: mean of top 20%
1525 let peak_sum: f32 = band_mags[band_len - percentile_idx..].iter().sum();
1526 let peak = peak_sum / percentile_idx as f32;
1527
1528 // Calculate contrast in dB
1529 let contrast = if valley > 1e-10 {
1530 20.0 * (peak / valley).log10()
1531 } else if peak > 1e-10 {
1532 60.0 // Maximum contrast when valley is near zero
1533 } else {
1534 0.0
1535 };
1536
1537 contrasts.push(contrast);
1538 }
1539
1540 contrasts
1541 }
1542
1543 /// Detect pitch using the YIN algorithm
1544 ///
1545 /// The YIN algorithm is a robust pitch detection method that improves upon
1546 /// autocorrelation by using a cumulative mean normalized difference function.
1547 /// It provides more accurate pitch detection, especially for noisy signals.
1548 ///
1549 /// YIN steps:
1550 /// 1. Calculate difference function
1551 /// 2. Apply cumulative mean normalization
1552 /// 3. Find first minimum below threshold
1553 /// 4. Apply parabolic interpolation for sub-sample accuracy
1554 ///
1555 /// Reference: "YIN, a fundamental frequency estimator for speech and music"
1556 /// by Alain de Cheveigné and Hideki Kawahara (2002)
1557 ///
1558 /// # Arguments
1559 ///
1560 /// * `min_freq` - Minimum frequency to search (Hz)
1561 /// * `max_freq` - Maximum frequency to search (Hz)
1562 /// * `threshold` - Threshold for minimum detection (typically 0.1-0.2)
1563 ///
1564 /// # Returns
1565 ///
1566 /// Detected fundamental frequency in Hz, or 0.0 if no pitch detected.
1567 ///
1568 /// # Applications
1569 ///
1570 /// - High-accuracy pitch tracking
1571 /// - Music transcription
1572 /// - Singing voice analysis
1573 /// - Instrument tuning
1574 /// - Prosody analysis in speech
1575 ///
1576 /// # Examples
1577 ///
1578 /// ```no_run
1579 /// use voirs_sdk::audio::AudioBuffer;
1580 ///
1581 /// let buffer = AudioBuffer::mono(vec![0.5; 8192], 22050);
1582 /// let pitch = buffer.detect_pitch_yin(80.0, 400.0, 0.15);
1583 /// if pitch > 0.0 {
1584 /// println!("Detected pitch: {:.1} Hz", pitch);
1585 /// }
1586 /// ```
1587 pub fn detect_pitch_yin(&self, min_freq: f32, max_freq: f32, threshold: f32) -> f32 {
1588 let sample_rate = self.sample_rate as f32;
1589 let min_period = (sample_rate / max_freq) as usize;
1590 let max_period = (sample_rate / min_freq) as usize;
1591
1592 // Need sufficient samples
1593 let buffer_size = (max_period * 2).min(self.samples.len());
1594 if buffer_size < max_period {
1595 return 0.0;
1596 }
1597
1598 // Step 1: Calculate difference function
1599 let mut diff = vec![0.0_f32; max_period + 1];
1600 diff[0] = 1.0; // By definition
1601
1602 // Note: tau is used for both arithmetic (buffer_size - tau) and indexing (diff[tau], samples[i + tau])
1603 #[allow(clippy::needless_range_loop)]
1604 for tau in 1..=max_period {
1605 let mut sum = 0.0;
1606 for i in 0..(buffer_size - tau) {
1607 let delta = self.samples[i] - self.samples[i + tau];
1608 sum += delta * delta;
1609 }
1610 diff[tau] = sum;
1611 }
1612
1613 // Step 2: Cumulative mean normalized difference function
1614 let mut cmnd = vec![0.0_f32; max_period + 1];
1615 cmnd[0] = 1.0;
1616
1617 let mut running_sum = 0.0;
1618 for tau in 1..=max_period {
1619 running_sum += diff[tau];
1620 cmnd[tau] = if running_sum > 0.0 {
1621 diff[tau] * tau as f32 / running_sum
1622 } else {
1623 1.0
1624 };
1625 }
1626
1627 // Step 3: Find first minimum below threshold
1628 let mut tau_estimate = 0;
1629 for tau in min_period..=max_period {
1630 if cmnd[tau] < threshold {
1631 // Look for local minimum
1632 if tau > 0
1633 && tau < max_period
1634 && cmnd[tau] < cmnd[tau - 1]
1635 && cmnd[tau] < cmnd[tau + 1]
1636 {
1637 tau_estimate = tau;
1638 break;
1639 }
1640 }
1641 }
1642
1643 if tau_estimate == 0 {
1644 // No pitch found below threshold
1645 return 0.0;
1646 }
1647
1648 // Step 4: Parabolic interpolation for sub-sample accuracy
1649 let tau = tau_estimate;
1650 let refined_tau = if tau > 0 && tau < max_period {
1651 let alpha = cmnd[tau - 1];
1652 let beta = cmnd[tau];
1653 let gamma = cmnd[tau + 1];
1654
1655 // Parabolic interpolation
1656 let adjustment = if (alpha - 2.0 * beta + gamma).abs() > 1e-10 {
1657 0.5 * (alpha - gamma) / (alpha - 2.0 * beta + gamma)
1658 } else {
1659 0.0
1660 };
1661
1662 tau as f32 + adjustment
1663 } else {
1664 tau as f32
1665 };
1666
1667 // Convert period to frequency
1668 sample_rate / refined_tau
1669 }
1670}
1671
1672#[cfg(test)]
1673#[path = "utilities_tests.rs"]
1674mod tests;