voirs_sdk/audio/utilities.rs
1//! Audio buffer utility methods for common operations.
2//!
3//! This module provides convenient utility methods for AudioBuffer manipulation,
4//! including concatenation, mixing, normalization, and effects.
5
6use super::AudioBuffer;
7use crate::error::Result;
8use crate::VoirsError;
9
10/// Audio buffer utilities
11impl AudioBuffer {
12 /// Concatenate multiple audio buffers
13 ///
14 /// Buffers must have the same sample rate and channel configuration.
15 ///
16 /// # Examples
17 ///
18 /// ```no_run
19 /// use voirs_sdk::audio::AudioBuffer;
20 ///
21 /// let buf1 = AudioBuffer::mono(vec![1.0, 2.0, 3.0], 22050);
22 /// let buf2 = AudioBuffer::mono(vec![4.0, 5.0, 6.0], 22050);
23 /// let buf3 = AudioBuffer::mono(vec![7.0, 8.0, 9.0], 22050);
24 ///
25 /// let concatenated = AudioBuffer::concatenate(&[buf1, buf2, buf3]).unwrap();
26 /// assert_eq!(concatenated.len(), 9);
27 /// ```
28 pub fn concatenate(buffers: &[AudioBuffer]) -> Result<Self> {
29 if buffers.is_empty() {
30 return Err(VoirsError::AudioError {
31 buffer_info: None,
32 message: "Cannot concatenate empty buffer list".to_string(),
33 });
34 }
35
36 let first = &buffers[0];
37 let sample_rate = first.sample_rate;
38 let channels = first.channels;
39
40 // Validate all buffers have same format
41 for buffer in buffers.iter().skip(1) {
42 if buffer.sample_rate != sample_rate {
43 return Err(VoirsError::AudioError {
44 buffer_info: None,
45 message: format!(
46 "Sample rate mismatch: {} != {}",
47 buffer.sample_rate, sample_rate
48 ),
49 });
50 }
51 if buffer.channels != channels {
52 return Err(VoirsError::AudioError {
53 buffer_info: None,
54 message: format!(
55 "Channel count mismatch: {} != {}",
56 buffer.channels, channels
57 ),
58 });
59 }
60 }
61
62 // Concatenate samples
63 let total_samples: usize = buffers.iter().map(|b| b.samples.len()).sum();
64 let mut samples = Vec::with_capacity(total_samples);
65
66 for buffer in buffers {
67 samples.extend_from_slice(&buffer.samples);
68 }
69
70 Ok(Self::new(samples, sample_rate, channels))
71 }
72
73 /// Pad buffer with silence
74 ///
75 /// Adds silence before and/or after the audio.
76 ///
77 /// # Arguments
78 ///
79 /// * `before_seconds` - Silence duration before audio
80 /// * `after_seconds` - Silence duration after audio
81 ///
82 /// # Examples
83 ///
84 /// ```no_run
85 /// use voirs_sdk::audio::AudioBuffer;
86 ///
87 /// let mut buffer = AudioBuffer::mono(vec![1.0; 100], 22050);
88 /// buffer.pad(0.1, 0.2); // Add 0.1s before, 0.2s after
89 /// ```
90 pub fn pad(&mut self, before_seconds: f32, after_seconds: f32) {
91 let before_samples =
92 (before_seconds * self.sample_rate as f32 * self.channels as f32) as usize;
93 let after_samples =
94 (after_seconds * self.sample_rate as f32 * self.channels as f32) as usize;
95
96 let mut new_samples =
97 Vec::with_capacity(before_samples + self.samples.len() + after_samples);
98 new_samples.resize(before_samples, 0.0);
99 new_samples.extend_from_slice(&self.samples);
100 new_samples.resize(new_samples.len() + after_samples, 0.0);
101
102 self.samples = new_samples;
103 self.update_metadata();
104 }
105
106 /// Check if buffer contains clipping (samples outside [-1.0, 1.0])
107 ///
108 /// # Examples
109 ///
110 /// ```no_run
111 /// use voirs_sdk::audio::AudioBuffer;
112 ///
113 /// let buffer = AudioBuffer::mono(vec![0.5, 1.5, 0.3], 22050);
114 /// assert!(buffer.has_clipping());
115 /// ```
116 pub fn has_clipping(&self) -> bool {
117 self.samples.iter().any(|&s| s.abs() > 1.0)
118 }
119
120 /// Count number of clipped samples
121 ///
122 /// # Examples
123 ///
124 /// ```no_run
125 /// use voirs_sdk::audio::AudioBuffer;
126 ///
127 /// let buffer = AudioBuffer::mono(vec![0.5, 1.5, -1.2, 0.3], 22050);
128 /// assert_eq!(buffer.count_clipped_samples(), 2);
129 /// ```
130 pub fn count_clipped_samples(&self) -> usize {
131 self.samples.iter().filter(|&&s| s.abs() > 1.0).count()
132 }
133
134 /// Get RMS (Root Mean Square) level in dB
135 ///
136 /// Returns -∞ for silence.
137 ///
138 /// # Examples
139 ///
140 /// ```no_run
141 /// use voirs_sdk::audio::AudioBuffer;
142 ///
143 /// let buffer = AudioBuffer::mono(vec![0.5; 1000], 22050);
144 /// let rms_db = buffer.rms_db();
145 /// ```
146 pub fn rms_db(&self) -> f32 {
147 if self.metadata.rms_amplitude > 0.0 {
148 20.0 * self.metadata.rms_amplitude.log10()
149 } else {
150 f32::NEG_INFINITY
151 }
152 }
153
154 /// Get peak level in dB
155 ///
156 /// # Examples
157 ///
158 /// ```no_run
159 /// use voirs_sdk::audio::AudioBuffer;
160 ///
161 /// let buffer = AudioBuffer::mono(vec![0.5; 1000], 22050);
162 /// let peak_db = buffer.peak_db();
163 /// ```
164 pub fn peak_db(&self) -> f32 {
165 if self.metadata.peak_amplitude > 0.0 {
166 20.0 * self.metadata.peak_amplitude.log10()
167 } else {
168 f32::NEG_INFINITY
169 }
170 }
171
172 /// Calculate zero-crossing rate (ZCR)
173 ///
174 /// ZCR is the rate at which the signal changes sign, useful for
175 /// voice activity detection and audio classification.
176 ///
177 /// # Returns
178 ///
179 /// The zero-crossing rate as a fraction of the total samples.
180 ///
181 /// # Examples
182 ///
183 /// ```no_run
184 /// use voirs_sdk::audio::AudioBuffer;
185 ///
186 /// let buffer = AudioBuffer::mono(vec![0.5, -0.3, 0.2, -0.1, 0.4], 22050);
187 /// let zcr = buffer.zero_crossing_rate();
188 /// println!("Zero-crossing rate: {:.4}", zcr);
189 /// ```
190 pub fn zero_crossing_rate(&self) -> f32 {
191 if self.samples.len() < 2 {
192 return 0.0;
193 }
194
195 let mut crossings = 0;
196 for i in 1..self.samples.len() {
197 if (self.samples[i] >= 0.0 && self.samples[i - 1] < 0.0)
198 || (self.samples[i] < 0.0 && self.samples[i - 1] >= 0.0)
199 {
200 crossings += 1;
201 }
202 }
203
204 crossings as f32 / (self.samples.len() - 1) as f32
205 }
206
207 /// Calculate spectral centroid
208 ///
209 /// The spectral centroid is the "center of mass" of the spectrum,
210 /// indicating where the majority of the signal's energy is concentrated.
211 /// Higher values indicate brighter sounds.
212 ///
213 /// # Returns
214 ///
215 /// The spectral centroid in Hz, or 0.0 if calculation fails.
216 ///
217 /// # Examples
218 ///
219 /// ```no_run
220 /// use voirs_sdk::audio::AudioBuffer;
221 ///
222 /// let buffer = AudioBuffer::mono(vec![0.5; 1024], 22050);
223 /// let centroid = buffer.spectral_centroid();
224 /// println!("Spectral centroid: {:.2} Hz", centroid);
225 /// ```
226 pub fn spectral_centroid(&self) -> f32 {
227 // Need at least enough samples for meaningful analysis
228 if self.samples.len() < 64 {
229 return 0.0;
230 }
231
232 // Use next power of 2 for FFT efficiency
233 let fft_size = self.samples.len().next_power_of_two().min(2048);
234 let samples_to_analyze = self.samples.len().min(fft_size);
235
236 // Prepare complex buffer for FFT (as f64 for scirs2-fft)
237 let input: Vec<f64> = self.samples[..samples_to_analyze]
238 .iter()
239 .map(|&s| s as f64)
240 .chain(std::iter::repeat(0.0))
241 .take(fft_size)
242 .collect();
243
244 // Perform FFT
245 let spectrum = match scirs2_fft::fft(&input, Some(fft_size)) {
246 Ok(result) => result,
247 Err(_) => return 0.0,
248 };
249
250 // Calculate magnitude spectrum
251 let magnitudes: Vec<f32> = spectrum
252 .iter()
253 .take(fft_size / 2)
254 .map(|c| c.norm() as f32)
255 .collect();
256
257 // Calculate weighted sum for centroid
258 let mut weighted_sum = 0.0;
259 let mut magnitude_sum = 0.0;
260
261 for (i, &mag) in magnitudes.iter().enumerate() {
262 let freq = i as f32 * self.sample_rate as f32 / fft_size as f32;
263 weighted_sum += freq * mag;
264 magnitude_sum += mag;
265 }
266
267 if magnitude_sum > 0.0 {
268 weighted_sum / magnitude_sum
269 } else {
270 0.0
271 }
272 }
273
274 /// Calculate spectral rolloff
275 ///
276 /// The spectral rolloff is the frequency below which a specified percentage
277 /// (typically 85%) of the total spectral energy is contained.
278 ///
279 /// # Arguments
280 ///
281 /// * `threshold` - The energy threshold (0.0 to 1.0), typically 0.85
282 ///
283 /// # Returns
284 ///
285 /// The rolloff frequency in Hz, or 0.0 if calculation fails.
286 ///
287 /// # Examples
288 ///
289 /// ```no_run
290 /// use voirs_sdk::audio::AudioBuffer;
291 ///
292 /// let buffer = AudioBuffer::mono(vec![0.5; 1024], 22050);
293 /// let rolloff = buffer.spectral_rolloff(0.85);
294 /// println!("Spectral rolloff: {:.2} Hz", rolloff);
295 /// ```
296 pub fn spectral_rolloff(&self, threshold: f32) -> f32 {
297 if self.samples.len() < 64 {
298 return 0.0;
299 }
300
301 let fft_size = self.samples.len().next_power_of_two().min(2048);
302 let samples_to_analyze = self.samples.len().min(fft_size);
303
304 let input: Vec<f64> = self.samples[..samples_to_analyze]
305 .iter()
306 .map(|&s| s as f64)
307 .chain(std::iter::repeat(0.0))
308 .take(fft_size)
309 .collect();
310
311 let spectrum = match scirs2_fft::fft(&input, Some(fft_size)) {
312 Ok(result) => result,
313 Err(_) => return 0.0,
314 };
315
316 let magnitudes: Vec<f32> = spectrum
317 .iter()
318 .take(fft_size / 2)
319 .map(|c| c.norm() as f32)
320 .collect();
321
322 // Calculate total energy
323 let total_energy: f32 = magnitudes.iter().sum();
324 if total_energy == 0.0 {
325 return 0.0;
326 }
327
328 // Find rolloff frequency
329 let target_energy = total_energy * threshold;
330 let mut cumulative_energy = 0.0;
331
332 for (i, &mag) in magnitudes.iter().enumerate() {
333 cumulative_energy += mag;
334 if cumulative_energy >= target_energy {
335 return i as f32 * self.sample_rate as f32 / fft_size as f32;
336 }
337 }
338
339 // If we didn't reach threshold, return nyquist frequency
340 self.sample_rate as f32 / 2.0
341 }
342
343 /// Calculate signal-to-noise ratio (SNR) in dB
344 ///
345 /// Estimates SNR by comparing signal power to noise floor power.
346 /// Uses a simple heuristic: assumes the quietest 10% of frames represent noise.
347 ///
348 /// # Returns
349 ///
350 /// SNR in dB, or 0.0 if calculation fails.
351 ///
352 /// # Examples
353 ///
354 /// ```no_run
355 /// use voirs_sdk::audio::AudioBuffer;
356 ///
357 /// let buffer = AudioBuffer::mono(vec![0.5; 1000], 22050);
358 /// let snr = buffer.signal_to_noise_ratio();
359 /// println!("SNR: {:.2} dB", snr);
360 /// ```
361 pub fn signal_to_noise_ratio(&self) -> f32 {
362 if self.samples.len() < 100 {
363 return 0.0;
364 }
365
366 // Split audio into frames
367 let frame_size = 512;
368 let hop_size = frame_size / 2;
369 let num_frames = (self.samples.len() - frame_size) / hop_size + 1;
370
371 if num_frames < 10 {
372 return 0.0;
373 }
374
375 // Calculate RMS for each frame
376 let mut frame_rms: Vec<f32> = Vec::with_capacity(num_frames);
377
378 for frame_idx in 0..num_frames {
379 let start = frame_idx * hop_size;
380 let end = (start + frame_size).min(self.samples.len());
381
382 let rms: f32 =
383 self.samples[start..end].iter().map(|&s| s * s).sum::<f32>() / (end - start) as f32;
384
385 frame_rms.push(rms.sqrt());
386 }
387
388 // Sort to find noise floor (bottom 10%), treating NaN as max value
389 frame_rms.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
390 let noise_threshold_idx = (frame_rms.len() as f32 * 0.1) as usize;
391 let noise_floor = frame_rms[..noise_threshold_idx.max(1)].iter().sum::<f32>()
392 / noise_threshold_idx.max(1) as f32;
393
394 // Signal power is mean of all frames
395 let signal_power = frame_rms.iter().sum::<f32>() / frame_rms.len() as f32;
396
397 if noise_floor > 0.0 && signal_power > 0.0 {
398 20.0 * (signal_power / noise_floor).log10()
399 } else {
400 0.0
401 }
402 }
403
404 /// Calculate crest factor (peak-to-RMS ratio) in dB
405 ///
406 /// Crest factor indicates the dynamic range of the audio.
407 /// Higher values indicate more dynamic content with prominent peaks.
408 ///
409 /// # Returns
410 ///
411 /// Crest factor in dB.
412 ///
413 /// # Examples
414 ///
415 /// ```no_run
416 /// use voirs_sdk::audio::AudioBuffer;
417 ///
418 /// let buffer = AudioBuffer::mono(vec![0.5; 1000], 22050);
419 /// let crest = buffer.crest_factor();
420 /// println!("Crest factor: {:.2} dB", crest);
421 /// ```
422 pub fn crest_factor(&self) -> f32 {
423 let peak = self.metadata.peak_amplitude;
424 let rms = self.metadata.rms_amplitude;
425
426 if rms > 0.0 {
427 20.0 * (peak / rms).log10()
428 } else {
429 f32::INFINITY
430 }
431 }
432
433 /// Detect silence segments in the audio
434 ///
435 /// Returns a list of (start_time, end_time) tuples in seconds representing
436 /// detected silence segments.
437 ///
438 /// # Arguments
439 ///
440 /// * `threshold_db` - Silence threshold in dB (e.g., -40.0)
441 /// * `min_duration` - Minimum silence duration in seconds
442 ///
443 /// # Returns
444 ///
445 /// Vector of (start, end) time pairs in seconds.
446 ///
447 /// # Examples
448 ///
449 /// ```no_run
450 /// use voirs_sdk::audio::AudioBuffer;
451 ///
452 /// let buffer = AudioBuffer::mono(vec![0.5; 1000], 22050);
453 /// let silences = buffer.detect_silence(-40.0, 0.1);
454 /// for (start, end) in silences {
455 /// println!("Silence: {:.2}s to {:.2}s", start, end);
456 /// }
457 /// ```
458 pub fn detect_silence(&self, threshold_db: f32, min_duration: f32) -> Vec<(f32, f32)> {
459 let threshold_amplitude = 10.0_f32.powf(threshold_db / 20.0);
460 let min_samples = (min_duration * self.sample_rate as f32) as usize;
461
462 let frame_size = 512;
463 let hop_size = frame_size / 2;
464
465 let mut silence_segments = Vec::new();
466 let mut silence_start: Option<usize> = None;
467
468 let mut frame_idx = 0;
469 while frame_idx * hop_size < self.samples.len() {
470 let start = frame_idx * hop_size;
471 let end = (start + frame_size).min(self.samples.len());
472
473 // Calculate frame RMS
474 let rms = (self.samples[start..end].iter().map(|&s| s * s).sum::<f32>()
475 / (end - start) as f32)
476 .sqrt();
477
478 if rms < threshold_amplitude {
479 // Silent frame
480 if silence_start.is_none() {
481 silence_start = Some(start);
482 }
483 } else {
484 // Non-silent frame
485 if let Some(start_sample) = silence_start {
486 let duration_samples = start - start_sample;
487 if duration_samples >= min_samples {
488 let start_time = start_sample as f32 / self.sample_rate as f32;
489 let end_time = start as f32 / self.sample_rate as f32;
490 silence_segments.push((start_time, end_time));
491 }
492 silence_start = None;
493 }
494 }
495
496 frame_idx += 1;
497 }
498
499 // Handle trailing silence
500 if let Some(start_sample) = silence_start {
501 let duration_samples = self.samples.len() - start_sample;
502 if duration_samples >= min_samples {
503 let start_time = start_sample as f32 / self.sample_rate as f32;
504 let end_time = self.samples.len() as f32 / self.sample_rate as f32;
505 silence_segments.push((start_time, end_time));
506 }
507 }
508
509 silence_segments
510 }
511
    /// Calculate Mel-Frequency Cepstral Coefficients (MFCCs)
    ///
    /// MFCCs are widely used in speech and audio processing for feature extraction.
    /// They represent the short-term power spectrum of a sound on a mel scale.
    ///
    /// Pipeline: Hamming window -> FFT -> power spectrum -> mel filterbank ->
    /// log -> DCT-II.
    ///
    /// # Arguments
    ///
    /// * `num_coeffs` - Number of MFCC coefficients to extract (typically 13)
    /// * `num_filters` - Number of mel-scale filters (typically 26-40)
    /// * `fft_size` - FFT size (power of 2, typically 512-2048)
    ///
    /// # Returns
    ///
    /// Vector of MFCC coefficients, or empty vector if calculation fails
    /// (buffer shorter than `fft_size`, `fft_size` not a power of two, or an
    /// FFT backend error).
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use voirs_sdk::audio::AudioBuffer;
    ///
    /// let buffer = AudioBuffer::mono(vec![0.5; 1024], 22050);
    /// let mfccs = buffer.mfcc(13, 26, 512);
    /// println!("MFCC coefficients: {:?}", mfccs);
    /// ```
    pub fn mfcc(&self, num_coeffs: usize, num_filters: usize, fft_size: usize) -> Vec<f32> {
        if self.samples.len() < fft_size || !fft_size.is_power_of_two() {
            return Vec::new();
        }

        // Prepare input with windowing (Hamming window) to reduce spectral leakage.
        let window: Vec<f64> = (0..fft_size)
            .map(|i| {
                0.54 - 0.46 * (2.0 * std::f64::consts::PI * i as f64 / (fft_size - 1) as f64).cos()
            })
            .collect();

        // The length guard above makes `samples_to_analyze == fft_size` here,
        // so the zero-pad chain below is effectively a no-op kept for symmetry
        // with the other spectral helpers in this file.
        let samples_to_analyze = self.samples.len().min(fft_size);
        let input: Vec<f64> = self.samples[..samples_to_analyze]
            .iter()
            .enumerate()
            .map(|(i, &s)| s as f64 * window[i])
            .chain(std::iter::repeat(0.0))
            .take(fft_size)
            .collect();

        // Perform FFT; any backend failure aborts with an empty result.
        let spectrum = match scirs2_fft::fft(&input, Some(fft_size)) {
            Ok(result) => result,
            Err(_) => return Vec::new(),
        };

        // Power spectrum over the non-negative frequencies (fft_size/2 + 1 bins).
        let power_spectrum: Vec<f32> = spectrum
            .iter()
            .take(fft_size / 2 + 1)
            .map(|c| (c.norm() * c.norm()) as f32)
            .collect();

        // Create mel filterbank (triangular filters evenly spaced on the mel scale).
        let mel_filters =
            Self::create_mel_filterbank(num_filters, fft_size, self.sample_rate as f32);

        // Apply mel filterbank: each filter accumulates the power in its band.
        let mut mel_energies = vec![0.0f32; num_filters];
        for (filter_idx, filter) in mel_filters.iter().enumerate() {
            for (bin_idx, &power) in power_spectrum.iter().enumerate() {
                if bin_idx < filter.len() {
                    mel_energies[filter_idx] += power * filter[bin_idx];
                }
            }
        }

        // Apply log and handle zeros (floored so silence never produces -inf).
        let log_mel: Vec<f32> = mel_energies
            .iter()
            .map(|&e| if e > 1e-10 { e.ln() } else { -23.0 }) // ln(1e-10) ≈ -23
            .collect();

        // Apply DCT (Discrete Cosine Transform) Type-II to decorrelate the
        // log-mel energies into cepstral coefficients.
        let mut mfcc_coeffs = vec![0.0f32; num_coeffs];
        // Note: We need the index i for DCT calculation, not for direct indexing
        #[allow(clippy::needless_range_loop)]
        for i in 0..num_coeffs {
            let mut sum = 0.0;
            for (k, &log_e) in log_mel.iter().enumerate() {
                sum += log_e
                    * ((std::f32::consts::PI * i as f32 * (k as f32 + 0.5)) / num_filters as f32)
                        .cos();
            }
            mfcc_coeffs[i] = sum * (2.0 / num_filters as f32).sqrt();
        }

        mfcc_coeffs
    }
606
    /// Create mel-scale filterbank
    ///
    /// Helper function for MFCC calculation. Builds `num_filters` triangular
    /// filters, each of length `fft_size / 2 + 1`, whose edges and centers are
    /// evenly spaced on the mel scale between 0 Hz and Nyquist.
    fn create_mel_filterbank(
        num_filters: usize,
        fft_size: usize,
        sample_rate: f32,
    ) -> Vec<Vec<f32>> {
        // Mel scale conversion functions (mel = 2595 * log10(1 + hz/700)).
        let hz_to_mel = |hz: f32| 2595.0 * (1.0 + hz / 700.0).log10();
        let mel_to_hz = |mel: f32| 700.0 * (10.0_f32.powf(mel / 2595.0) - 1.0);

        let nyquist = sample_rate / 2.0;
        let mel_min = hz_to_mel(0.0);
        let mel_max = hz_to_mel(nyquist);

        // num_filters + 2 mel-spaced frequency points: filter `i` spans three
        // consecutive points (left edge, center/peak, right edge).
        let mel_points: Vec<f32> = (0..=num_filters + 1)
            .map(|i| mel_min + (mel_max - mel_min) * i as f32 / (num_filters + 1) as f32)
            .map(mel_to_hz)
            .collect();

        // Convert each edge frequency to its FFT bin index. With freq <= Nyquist
        // this stays <= fft_size / 2, inside the filter vectors below.
        let bin_points: Vec<usize> = mel_points
            .iter()
            .map(|&freq| ((fft_size + 1) as f32 * freq / sample_rate).floor() as usize)
            .collect();

        // Create triangular filters: ramp up from the left edge to 1.0 at the
        // center bin, then back down to 0.0 at the right edge.
        let mut filterbank = Vec::with_capacity(num_filters);
        for i in 0..num_filters {
            let mut filter = vec![0.0f32; fft_size / 2 + 1];

            let left = bin_points[i];
            let center = bin_points[i + 1];
            let right = bin_points[i + 2];

            // Rising slope (degenerate left == center leaves the bins at 0.0)
            // Note: We need k for arithmetic, not just indexing
            #[allow(clippy::needless_range_loop)]
            for k in left..center {
                if center != left {
                    filter[k] = (k - left) as f32 / (center - left) as f32;
                }
            }

            // Falling slope (degenerate center == right leaves the bins at 0.0)
            // Note: We need k for arithmetic, not just indexing
            #[allow(clippy::needless_range_loop)]
            for k in center..right {
                if right != center {
                    filter[k] = (right - k) as f32 / (right - center) as f32;
                }
            }

            filterbank.push(filter);
        }

        filterbank
    }
667
668 /// Detect pitch using autocorrelation
669 ///
670 /// Uses the autocorrelation method to estimate the fundamental frequency (F0)
671 /// of the audio signal. This is more robust than simple zero-crossing rate.
672 ///
673 /// # Arguments
674 ///
675 /// * `min_freq` - Minimum expected frequency in Hz (e.g., 80 for male voice)
676 /// * `max_freq` - Maximum expected frequency in Hz (e.g., 400 for female voice)
677 ///
678 /// # Returns
679 ///
680 /// Estimated pitch in Hz, or 0.0 if no pitch detected.
681 ///
682 /// # Examples
683 ///
684 /// ```no_run
685 /// use voirs_sdk::audio::AudioBuffer;
686 ///
687 /// let buffer = AudioBuffer::mono(vec![0.5; 1024], 22050);
688 /// let pitch = buffer.detect_pitch_autocorr(80.0, 400.0);
689 /// println!("Detected pitch: {:.2} Hz", pitch);
690 /// ```
691 pub fn detect_pitch_autocorr(&self, min_freq: f32, max_freq: f32) -> f32 {
692 if self.samples.len() < 1024 {
693 return 0.0;
694 }
695
696 let sample_rate = self.sample_rate as f32;
697 let min_lag = (sample_rate / max_freq) as usize;
698 let max_lag = (sample_rate / min_freq) as usize;
699
700 // Use a reasonable window for analysis
701 let window_size = (max_lag * 3).min(self.samples.len());
702 let samples = &self.samples[..window_size];
703
704 // Compute autocorrelation using normalized method
705 let mut max_corr = 0.0;
706 let mut best_lag = 0;
707
708 for lag in min_lag..=max_lag.min(window_size / 2) {
709 let mut sum = 0.0;
710 let mut energy1 = 0.0;
711 let mut energy2 = 0.0;
712
713 for i in 0..(window_size - lag) {
714 let s1 = samples[i];
715 let s2 = samples[i + lag];
716 sum += s1 * s2;
717 energy1 += s1 * s1;
718 energy2 += s2 * s2;
719 }
720
721 // Normalized autocorrelation
722 let corr = if energy1 > 0.0 && energy2 > 0.0 {
723 sum / (energy1 * energy2).sqrt()
724 } else {
725 0.0
726 };
727
728 if corr > max_corr {
729 max_corr = corr;
730 best_lag = lag;
731 }
732 }
733
734 // Confidence threshold for pitch detection
735 if max_corr > 0.5 && best_lag > 0 {
736 sample_rate / best_lag as f32
737 } else {
738 0.0
739 }
740 }
741
742 /// Calculate spectral flux
743 ///
744 /// Spectral flux measures the rate of change in the power spectrum,
745 /// useful for detecting onsets and transients in audio.
746 ///
747 /// # Arguments
748 ///
749 /// * `prev_buffer` - Previous audio buffer for comparison (optional)
750 /// * `fft_size` - FFT size for spectral analysis
751 ///
752 /// # Returns
753 ///
754 /// Spectral flux value, or 0.0 if calculation fails.
755 ///
756 /// # Examples
757 ///
758 /// ```no_run
759 /// use voirs_sdk::audio::AudioBuffer;
760 ///
761 /// let buffer1 = AudioBuffer::mono(vec![0.5; 1024], 22050);
762 /// let buffer2 = AudioBuffer::mono(vec![0.6; 1024], 22050);
763 /// let flux = buffer2.spectral_flux(Some(&buffer1), 512);
764 /// println!("Spectral flux: {:.4}", flux);
765 /// ```
766 pub fn spectral_flux(&self, prev_buffer: Option<&AudioBuffer>, fft_size: usize) -> f32 {
767 if self.samples.len() < fft_size || !fft_size.is_power_of_two() {
768 return 0.0;
769 }
770
771 // Get current spectrum
772 let current_spectrum = self.get_magnitude_spectrum(fft_size);
773
774 // If no previous buffer, return 0
775 let prev_spectrum = match prev_buffer {
776 Some(buf) => buf.get_magnitude_spectrum(fft_size),
777 None => return 0.0,
778 };
779
780 if current_spectrum.is_empty() || prev_spectrum.is_empty() {
781 return 0.0;
782 }
783
784 // Calculate flux as sum of squared differences (positive only)
785 let mut flux = 0.0;
786 let len = current_spectrum.len().min(prev_spectrum.len());
787
788 for i in 0..len {
789 let diff = current_spectrum[i] - prev_spectrum[i];
790 if diff > 0.0 {
791 flux += diff * diff;
792 }
793 }
794
795 flux.sqrt()
796 }
797
798 /// Get magnitude spectrum (helper for spectral flux)
799 fn get_magnitude_spectrum(&self, fft_size: usize) -> Vec<f32> {
800 let samples_to_analyze = self.samples.len().min(fft_size);
801 let input: Vec<f64> = self.samples[..samples_to_analyze]
802 .iter()
803 .map(|&s| s as f64)
804 .chain(std::iter::repeat(0.0))
805 .take(fft_size)
806 .collect();
807
808 match scirs2_fft::fft(&input, Some(fft_size)) {
809 Ok(spectrum) => spectrum
810 .iter()
811 .take(fft_size / 2)
812 .map(|c| c.norm() as f32)
813 .collect(),
814 Err(_) => Vec::new(),
815 }
816 }
817
    /// Estimate formant frequencies
    ///
    /// Formants are resonant frequencies of the vocal tract, crucial for
    /// vowel identification and speaker characteristics. This uses LPC
    /// (Linear Predictive Coding) analysis to estimate the first formants.
    ///
    /// # Arguments
    ///
    /// * `num_formants` - Number of formants to estimate (typically 3-4)
    ///
    /// # Returns
    ///
    /// Vector of estimated formant frequencies in Hz (may hold fewer than
    /// `num_formants` entries if fewer spectral peaks fall in 200-4000 Hz).
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use voirs_sdk::audio::AudioBuffer;
    ///
    /// let buffer = AudioBuffer::mono(vec![0.5; 1024], 22050);
    /// let formants = buffer.estimate_formants(4);
    /// println!("Formants: {:?} Hz", formants);
    /// ```
    pub fn estimate_formants(&self, num_formants: usize) -> Vec<f32> {
        if self.samples.len() < 512 {
            return Vec::new();
        }

        // Use LPC order = 2 + sample_rate/1000 (common rule of thumb)
        let lpc_order = 2 + (self.sample_rate / 1000) as usize;

        // Window the signal (use up to 1024 samples)
        let window_size = self.samples.len().min(1024);
        let samples = &self.samples[..window_size];

        // Apply pre-emphasis filter (high-pass): y[n] = x[n] - 0.97 * x[n-1],
        // which flattens the spectral tilt before LPC analysis.
        let pre_emphasis = 0.97;
        let mut emphasized = vec![samples[0]];
        for i in 1..samples.len() {
            emphasized.push(samples[i] - pre_emphasis * samples[i - 1]);
        }

        // Compute autocorrelation for lags 0..=lpc_order (the lpc_order + 1
        // entries that levinson_durbin expects).
        let mut autocorr = vec![0.0; lpc_order + 1];
        for lag in 0..=lpc_order {
            let mut sum = 0.0;
            for i in 0..(emphasized.len() - lag) {
                sum += emphasized[i] * emphasized[i + lag];
            }
            autocorr[lag] = sum;
        }

        // Solve for LPC coefficients using Levinson-Durbin algorithm
        let lpc_coeffs = self.levinson_durbin(&autocorr, lpc_order);

        // Find roots of LPC polynomial to get formants
        // For simplicity, we use peak-picking in the frequency response
        // of the model instead of true polynomial root finding.
        let fft_size = 1024;
        let num_bins = fft_size / 2;
        let mut freq_response = vec![0.0; num_bins];

        // Note: We need bin index for omega calculation, not just for indexing
        #[allow(clippy::needless_range_loop)]
        for bin in 0..num_bins {
            let omega = 2.0 * std::f32::consts::PI * bin as f32 / fft_size as f32;
            let mut real_part = 1.0;
            let mut imag_part = 0.0;

            // Accumulate the LPC polynomial evaluated on the unit circle;
            // coefficient k corresponds to delay (k + 1).
            for (k, &coeff) in lpc_coeffs.iter().enumerate() {
                let angle = omega * (k + 1) as f32;
                real_part -= coeff * angle.cos();
                imag_part += coeff * angle.sin();
            }

            // Inverse magnitude: resonances appear as peaks in this response.
            freq_response[bin] = 1.0 / (real_part * real_part + imag_part * imag_part).sqrt();
        }

        // Find peaks (strict local maxima) in the frequency response.
        let mut formants = Vec::new();
        for i in 1..(num_bins - 1) {
            if freq_response[i] > freq_response[i - 1] && freq_response[i] > freq_response[i + 1] {
                let freq = i as f32 * self.sample_rate as f32 / fft_size as f32;
                // Typical formant range: 200 Hz to 4000 Hz
                if freq >= 200.0 && freq <= 4000.0 {
                    formants.push(freq);
                    if formants.len() >= num_formants {
                        break;
                    }
                }
            }
        }

        formants
    }
912
913 /// Levinson-Durbin algorithm for LPC coefficient estimation
914 ///
915 /// Solves the Yule-Walker equations efficiently.
916 fn levinson_durbin(&self, autocorr: &[f32], order: usize) -> Vec<f32> {
917 let mut lpc = vec![0.0; order];
918 let mut error = autocorr[0];
919
920 for i in 0..order {
921 let mut lambda = 0.0;
922 for j in 0..i {
923 lambda -= lpc[j] * autocorr[i - j];
924 }
925 lambda -= autocorr[i + 1];
926 lambda /= error;
927
928 // Update LPC coefficients
929 let mut new_lpc = vec![0.0; order];
930 new_lpc[i] = lambda;
931 for j in 0..i {
932 new_lpc[j] = lpc[j] + lambda * lpc[i - 1 - j];
933 }
934 lpc = new_lpc;
935
936 error *= 1.0 - lambda * lambda;
937 }
938
939 lpc
940 }
941
942 /// Calculate Jitter (pitch period irregularity)
943 ///
944 /// Jitter measures the cycle-to-cycle variation in fundamental frequency (F0),
945 /// expressed as a percentage. It's a crucial indicator of voice quality and
946 /// pathology. Higher jitter indicates more irregular vocal fold vibration.
947 ///
948 /// This implements the Jitter (local) metric, which is the average absolute
949 /// difference between consecutive periods, divided by the average period.
950 ///
951 /// # Arguments
952 ///
953 /// * `min_freq` - Minimum expected pitch (e.g., 75 Hz for male voice)
954 /// * `max_freq` - Maximum expected pitch (e.g., 500 Hz for female voice)
955 ///
956 /// # Returns
957 ///
958 /// Jitter percentage (0-100), or 0.0 if calculation fails. Typical values:
959 /// - < 1.0%: Normal voice quality
960 /// - 1.0-2.0%: Mild irregularity
961 /// - > 2.0%: Potential voice pathology
962 ///
963 /// # Examples
964 ///
965 /// ```no_run
966 /// use voirs_sdk::audio::AudioBuffer;
967 ///
968 /// let buffer = AudioBuffer::mono(vec![0.5; 4096], 22050);
969 /// let jitter = buffer.calculate_jitter(75.0, 500.0);
970 /// println!("Jitter: {:.2}%", jitter);
971 /// ```
972 pub fn calculate_jitter(&self, min_freq: f32, max_freq: f32) -> f32 {
973 if self.samples.len() < 2048 {
974 return 0.0;
975 }
976
977 let sample_rate = self.sample_rate as f32;
978 let min_period = (sample_rate / max_freq) as usize;
979 let max_period = (sample_rate / min_freq) as usize;
980
981 // Find pitch periods using autocorrelation
982 let window_size = (max_period * 4).min(self.samples.len());
983 let mut periods = Vec::new();
984
985 // Slide through the signal to find multiple periods
986 let hop_size = max_period / 2;
987 for start_idx in (0..self.samples.len() - window_size).step_by(hop_size) {
988 let window = &self.samples[start_idx..start_idx + window_size];
989
990 // Find period using autocorrelation for this window
991 let mut max_corr = 0.0;
992 let mut best_period = 0;
993
994 for period in min_period..=max_period.min(window_size / 2) {
995 let mut sum = 0.0;
996 let mut energy1 = 0.0;
997 let mut energy2 = 0.0;
998
999 for i in 0..(window_size - period) {
1000 let s1 = window[i];
1001 let s2 = window[i + period];
1002 sum += s1 * s2;
1003 energy1 += s1 * s1;
1004 energy2 += s2 * s2;
1005 }
1006
1007 let corr = if energy1 > 0.0 && energy2 > 0.0 {
1008 sum / (energy1 * energy2).sqrt()
1009 } else {
1010 0.0
1011 };
1012
1013 if corr > max_corr {
1014 max_corr = corr;
1015 best_period = period;
1016 }
1017 }
1018
1019 // Only include periods with high confidence
1020 if max_corr > 0.6 && best_period > 0 {
1021 periods.push(best_period as f32);
1022 }
1023 }
1024
1025 // Need at least 3 periods to calculate jitter
1026 if periods.len() < 3 {
1027 return 0.0;
1028 }
1029
1030 // Calculate jitter (local): mean absolute difference between consecutive periods
1031 let mut sum_diff = 0.0;
1032 for i in 1..periods.len() {
1033 sum_diff += (periods[i] - periods[i - 1]).abs();
1034 }
1035
1036 let mean_diff = sum_diff / (periods.len() - 1) as f32;
1037 let mean_period: f32 = periods.iter().sum::<f32>() / periods.len() as f32;
1038
1039 if mean_period > 0.0 {
1040 (mean_diff / mean_period) * 100.0 // Convert to percentage
1041 } else {
1042 0.0
1043 }
1044 }
1045
1046 /// Calculate Shimmer (amplitude variation)
1047 ///
1048 /// Shimmer measures the cycle-to-cycle variation in amplitude, expressed
1049 /// as a percentage. It's an important indicator of voice quality and vocal
1050 /// fold irregularities. Higher shimmer indicates unstable voice production.
1051 ///
1052 /// This implements the Shimmer (local) metric, which is the average absolute
1053 /// difference between consecutive peak amplitudes, divided by the average amplitude.
1054 ///
1055 /// # Arguments
1056 ///
1057 /// * `min_freq` - Minimum expected pitch for period detection
1058 /// * `max_freq` - Maximum expected pitch for period detection
1059 ///
1060 /// # Returns
1061 ///
1062 /// Shimmer percentage (0-100), or 0.0 if calculation fails. Typical values:
1063 /// - < 3.0%: Normal voice quality
1064 /// - 3.0-6.0%: Mild amplitude variation
1065 /// - > 6.0%: Potential voice pathology
1066 ///
1067 /// # Examples
1068 ///
1069 /// ```no_run
1070 /// use voirs_sdk::audio::AudioBuffer;
1071 ///
1072 /// let buffer = AudioBuffer::mono(vec![0.5; 4096], 22050);
1073 /// let shimmer = buffer.calculate_shimmer(75.0, 500.0);
1074 /// println!("Shimmer: {:.2}%", shimmer);
1075 /// ```
1076 pub fn calculate_shimmer(&self, min_freq: f32, max_freq: f32) -> f32 {
1077 if self.samples.len() < 2048 {
1078 return 0.0;
1079 }
1080
1081 let sample_rate = self.sample_rate as f32;
1082 let min_period = (sample_rate / max_freq) as usize;
1083 let max_period = (sample_rate / min_freq) as usize;
1084
1085 // Find peak amplitudes for each period
1086 let window_size = (max_period * 4).min(self.samples.len());
1087 let mut peak_amplitudes = Vec::new();
1088
1089 let hop_size = max_period / 2;
1090 for start_idx in (0..self.samples.len() - window_size).step_by(hop_size) {
1091 let window = &self.samples[start_idx..start_idx + window_size];
1092
1093 // Find period using autocorrelation
1094 let mut max_corr = 0.0;
1095 let mut best_period = 0;
1096
1097 for period in min_period..=max_period.min(window_size / 2) {
1098 let mut sum = 0.0;
1099 let mut energy1 = 0.0;
1100 let mut energy2 = 0.0;
1101
1102 for i in 0..(window_size - period) {
1103 let s1 = window[i];
1104 let s2 = window[i + period];
1105 sum += s1 * s2;
1106 energy1 += s1 * s1;
1107 energy2 += s2 * s2;
1108 }
1109
1110 let corr = if energy1 > 0.0 && energy2 > 0.0 {
1111 sum / (energy1 * energy2).sqrt()
1112 } else {
1113 0.0
1114 };
1115
1116 if corr > max_corr {
1117 max_corr = corr;
1118 best_period = period;
1119 }
1120 }
1121
1122 // Find peak amplitude in this period
1123 if max_corr > 0.6 && best_period > 0 {
1124 let period_samples = &window[..best_period.min(window.len())];
1125 let peak = period_samples
1126 .iter()
1127 .map(|&s| s.abs())
1128 .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
1129 .unwrap_or(0.0);
1130 peak_amplitudes.push(peak);
1131 }
1132 }
1133
1134 // Need at least 3 peaks to calculate shimmer
1135 if peak_amplitudes.len() < 3 {
1136 return 0.0;
1137 }
1138
1139 // Calculate shimmer: mean absolute difference between consecutive peaks
1140 let mut sum_diff = 0.0;
1141 for i in 1..peak_amplitudes.len() {
1142 sum_diff += (peak_amplitudes[i] - peak_amplitudes[i - 1]).abs();
1143 }
1144
1145 let mean_diff = sum_diff / (peak_amplitudes.len() - 1) as f32;
1146 let mean_amplitude: f32 =
1147 peak_amplitudes.iter().sum::<f32>() / peak_amplitudes.len() as f32;
1148
1149 if mean_amplitude > 0.0 {
1150 (mean_diff / mean_amplitude) * 100.0 // Convert to percentage
1151 } else {
1152 0.0
1153 }
1154 }
1155
1156 /// Calculate Harmonic-to-Noise Ratio (HNR)
1157 ///
1158 /// HNR measures the ratio of harmonic (periodic) to noise (aperiodic) energy
1159 /// in the voice signal. It's a fundamental measure of voice quality.
1160 /// Higher HNR indicates clearer, more periodic voice production.
1161 ///
1162 /// This uses autocorrelation-based method to separate harmonic and noise components.
1163 ///
1164 /// # Arguments
1165 ///
1166 /// * `min_freq` - Minimum expected pitch
1167 /// * `max_freq` - Maximum expected pitch
1168 ///
1169 /// # Returns
1170 ///
1171 /// HNR in decibels (dB). Typical values:
1172 /// - > 20 dB: Excellent voice quality
1173 /// - 10-20 dB: Good voice quality
1174 /// - 5-10 dB: Fair voice quality
1175 /// - < 5 dB: Poor voice quality or pathology
1176 ///
1177 /// # Examples
1178 ///
1179 /// ```no_run
1180 /// use voirs_sdk::audio::AudioBuffer;
1181 ///
1182 /// let buffer = AudioBuffer::mono(vec![0.5; 4096], 22050);
1183 /// let hnr = buffer.calculate_hnr(75.0, 500.0);
1184 /// println!("HNR: {:.2} dB", hnr);
1185 /// ```
1186 pub fn calculate_hnr(&self, min_freq: f32, max_freq: f32) -> f32 {
1187 if self.samples.len() < 2048 {
1188 return 0.0;
1189 }
1190
1191 let sample_rate = self.sample_rate as f32;
1192 let min_period = (sample_rate / max_freq) as usize;
1193 let max_period = (sample_rate / min_freq) as usize;
1194
1195 // Use autocorrelation to find the fundamental period
1196 let window_size = (max_period * 3).min(self.samples.len());
1197 let samples = &self.samples[..window_size];
1198
1199 let mut max_corr = 0.0;
1200
1201 for period in min_period..=max_period.min(window_size / 2) {
1202 let mut sum = 0.0;
1203 let mut energy1 = 0.0;
1204 let mut energy2 = 0.0;
1205
1206 for i in 0..(window_size - period) {
1207 let s1 = samples[i];
1208 let s2 = samples[i + period];
1209 sum += s1 * s2;
1210 energy1 += s1 * s1;
1211 energy2 += s2 * s2;
1212 }
1213
1214 let corr = if energy1 > 0.0 && energy2 > 0.0 {
1215 sum / (energy1 * energy2).sqrt()
1216 } else {
1217 0.0
1218 };
1219
1220 if corr > max_corr {
1221 max_corr = corr;
1222 // Note: We only need max_corr for HNR calculation, not the period itself
1223 }
1224 }
1225
1226 // If no strong periodicity found, voice is mostly noise
1227 if max_corr < 0.3 {
1228 return 0.0;
1229 }
1230
1231 // Calculate HNR using the autocorrelation peak
1232 // HNR = 10 * log10(max_corr / (1 - max_corr))
1233 // This formula relates autocorrelation to harmonic/noise ratio
1234 if max_corr >= 0.99 {
1235 // Avoid division by very small numbers
1236 return 30.0; // Very high HNR
1237 }
1238
1239 let hnr_linear = max_corr / (1.0 - max_corr);
1240 10.0 * hnr_linear.log10()
1241 }
1242
1243 /// Calculate Delta MFCCs (first-order temporal derivatives)
1244 ///
1245 /// Delta coefficients represent the rate of change of MFCCs over time,
1246 /// capturing dynamic spectral information. These are essential for
1247 /// improving speech recognition accuracy.
1248 ///
1249 /// # Arguments
1250 ///
1251 /// * `mfcc_frames` - Vector of MFCC coefficient vectors from consecutive frames
1252 /// * `delta_window` - Number of frames to use for delta calculation (typically 2)
1253 ///
1254 /// # Returns
1255 ///
1256 /// Vector of delta MFCC vectors, one per input frame.
1257 ///
1258 /// # Examples
1259 ///
1260 /// ```no_run
1261 /// use voirs_sdk::audio::AudioBuffer;
1262 ///
1263 /// let buffer = AudioBuffer::mono(vec![0.5; 8192], 22050);
1264 /// // Extract MFCCs from multiple frames...
1265 /// let mfcc_frames = vec![
1266 /// buffer.mfcc(13, 26, 512),
1267 /// buffer.mfcc(13, 26, 512),
1268 /// buffer.mfcc(13, 26, 512),
1269 /// ];
1270 /// let delta_mfccs = AudioBuffer::calculate_delta_mfcc(&mfcc_frames, 2);
1271 /// ```
1272 pub fn calculate_delta_mfcc(mfcc_frames: &[Vec<f32>], delta_window: usize) -> Vec<Vec<f32>> {
1273 if mfcc_frames.is_empty() {
1274 return Vec::new();
1275 }
1276
1277 let num_frames = mfcc_frames.len();
1278 let num_coeffs = mfcc_frames[0].len();
1279 let mut delta_mfccs = Vec::with_capacity(num_frames);
1280
1281 for frame_idx in 0..num_frames {
1282 let mut delta_coeffs = vec![0.0; num_coeffs];
1283
1284 // Calculate delta using regression formula
1285 // delta[t] = (sum(n * mfcc[t+n]) - sum(n * mfcc[t-n])) / (2 * sum(n²))
1286 let mut numerator = vec![0.0; num_coeffs];
1287 let mut denominator = 0.0;
1288
1289 for n in 1..=delta_window {
1290 let n_f32 = n as f32;
1291
1292 // Future frame (t+n)
1293 let future_idx = (frame_idx + n).min(num_frames - 1);
1294 // Past frame (t-n)
1295 let past_idx = frame_idx.saturating_sub(n);
1296
1297 // Note: We need coeff_idx for indexing both numerator and mfcc_frames
1298 #[allow(clippy::needless_range_loop)]
1299 for coeff_idx in 0..num_coeffs {
1300 numerator[coeff_idx] += n_f32
1301 * (mfcc_frames[future_idx][coeff_idx] - mfcc_frames[past_idx][coeff_idx]);
1302 }
1303
1304 denominator += n_f32 * n_f32;
1305 }
1306
1307 // Normalize by denominator
1308 if denominator > 0.0 {
1309 // Note: We need coeff_idx for indexing both delta_coeffs and numerator
1310 #[allow(clippy::needless_range_loop)]
1311 for coeff_idx in 0..num_coeffs {
1312 delta_coeffs[coeff_idx] = numerator[coeff_idx] / (2.0 * denominator);
1313 }
1314 }
1315
1316 delta_mfccs.push(delta_coeffs);
1317 }
1318
1319 delta_mfccs
1320 }
1321
    /// Calculate Delta-Delta MFCCs (second-order temporal derivatives)
    ///
    /// Delta-Delta (acceleration) coefficients represent the rate of change of
    /// Delta coefficients, capturing the dynamics of spectral dynamics.
    /// Combined with MFCCs and Deltas, they form a powerful feature set for ASR.
    ///
    /// # Arguments
    ///
    /// * `delta_mfccs` - Vector of delta MFCC coefficient vectors
    ///   (typically the output of [`AudioBuffer::calculate_delta_mfcc`])
    /// * `delta_window` - Number of frames to use (typically 2)
    ///
    /// # Returns
    ///
    /// Vector of delta-delta MFCC vectors, one per input frame
    /// (empty if `delta_mfccs` is empty).
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use voirs_sdk::audio::AudioBuffer;
    ///
    /// let buffer = AudioBuffer::mono(vec![0.5; 8192], 22050);
    /// let mfcc_frames = vec![
    ///     buffer.mfcc(13, 26, 512),
    ///     buffer.mfcc(13, 26, 512),
    ///     buffer.mfcc(13, 26, 512),
    /// ];
    /// let delta_mfccs = AudioBuffer::calculate_delta_mfcc(&mfcc_frames, 2);
    /// let delta_delta_mfccs = AudioBuffer::calculate_delta_delta_mfcc(&delta_mfccs, 2);
    /// ```
    pub fn calculate_delta_delta_mfcc(
        delta_mfccs: &[Vec<f32>],
        delta_window: usize,
    ) -> Vec<Vec<f32>> {
        // Delta-delta is just the delta of delta coefficients: the same
        // regression formula applied a second time, so delegate directly.
        Self::calculate_delta_mfcc(delta_mfccs, delta_window)
    }
1358
1359 /// Calculate Chroma features (pitch class representation)
1360 ///
1361 /// Chroma features, also known as pitch class profiles or chromagrams, represent
1362 /// the intensity of the 12 pitch classes (C, C#, D, ..., B) regardless of octave.
1363 /// This is particularly useful for music information retrieval, chord recognition,
1364 /// and key detection.
1365 ///
1366 /// The algorithm maps each frequency bin to one of 12 pitch classes using:
1367 /// pitch_class = 12 * log2(freq / ref_freq) mod 12
1368 ///
1369 /// # Arguments
1370 ///
1371 /// * `fft_size` - FFT size (must be power of 2, typically 2048 or 4096 for music)
1372 /// * `ref_freq` - Reference frequency for A4 (default: 440.0 Hz)
1373 ///
1374 /// # Returns
1375 ///
1376 /// 12-element vector representing energy in each pitch class (C=0, C#=1, ..., B=11).
1377 /// Returns empty vector if insufficient samples.
1378 ///
1379 /// # Applications
1380 ///
1381 /// - Music information retrieval
1382 /// - Chord recognition and key detection
1383 /// - Cover song identification
1384 /// - Music similarity analysis
1385 /// - Tonal music analysis
1386 ///
1387 /// # Examples
1388 ///
1389 /// ```no_run
1390 /// use voirs_sdk::audio::AudioBuffer;
1391 ///
1392 /// let buffer = AudioBuffer::mono(vec![0.5; 8192], 22050);
1393 /// let chroma = buffer.chroma_features(2048, 440.0);
1394 /// assert_eq!(chroma.len(), 12);
1395 /// ```
1396 pub fn chroma_features(&self, fft_size: usize, ref_freq: f32) -> Vec<f32> {
1397 if self.samples.len() < fft_size || !fft_size.is_power_of_two() {
1398 return Vec::new();
1399 }
1400
1401 // Get magnitude spectrum
1402 let magnitudes = self.get_magnitude_spectrum(fft_size);
1403
1404 // Initialize 12 pitch class bins
1405 let mut chroma = vec![0.0_f32; 12];
1406
1407 let sample_rate = self.sample_rate as f32;
1408 let freq_resolution = sample_rate / fft_size as f32;
1409
1410 // Map each frequency bin to a pitch class
1411 for (bin, &magnitude) in magnitudes.iter().enumerate().skip(1) {
1412 let freq = bin as f32 * freq_resolution;
1413
1414 // Skip very low frequencies (below 20 Hz)
1415 if freq < 20.0 {
1416 continue;
1417 }
1418
1419 // Calculate pitch class: 12 * log2(freq / ref_freq) mod 12
1420 let pitch_class_float = 12.0 * (freq / ref_freq).log2();
1421 let pitch_class = pitch_class_float.rem_euclid(12.0) as usize % 12;
1422
1423 // Accumulate magnitude in corresponding pitch class
1424 chroma[pitch_class] += magnitude;
1425 }
1426
1427 // Normalize chroma vector
1428 let max_chroma = chroma.iter().cloned().fold(0.0_f32, f32::max);
1429 if max_chroma > 0.0 {
1430 for c in &mut chroma {
1431 *c /= max_chroma;
1432 }
1433 }
1434
1435 chroma
1436 }
1437
1438 /// Calculate Spectral Contrast
1439 ///
1440 /// Spectral contrast measures the difference between peaks and valleys in the
1441 /// spectrum across multiple frequency bands. This provides a robust timbre
1442 /// representation, particularly useful for music genre classification and
1443 /// audio texture analysis.
1444 ///
1445 /// The spectrum is divided into sub-bands, and for each band:
1446 /// - Peak: Mean of top 20% of magnitudes
1447 /// - Valley: Mean of bottom 20% of magnitudes
1448 /// - Contrast: Peak - Valley (in dB)
1449 ///
1450 /// # Arguments
1451 ///
1452 /// * `fft_size` - FFT size (must be power of 2, typically 2048)
1453 /// * `num_bands` - Number of frequency bands (typically 6-8)
1454 ///
1455 /// # Returns
1456 ///
1457 /// Vector of contrast values (in dB) for each frequency band.
1458 /// Returns empty vector if insufficient samples.
1459 ///
1460 /// # Applications
1461 ///
1462 /// - Music genre classification
1463 /// - Audio texture analysis
1464 /// - Instrument recognition
1465 /// - Timbre characterization
1466 /// - Sound quality assessment
1467 ///
1468 /// # Examples
1469 ///
1470 /// ```no_run
1471 /// use voirs_sdk::audio::AudioBuffer;
1472 ///
1473 /// let buffer = AudioBuffer::mono(vec![0.5; 8192], 22050);
1474 /// let contrast = buffer.spectral_contrast(2048, 6);
1475 /// assert_eq!(contrast.len(), 6);
1476 /// ```
1477 pub fn spectral_contrast(&self, fft_size: usize, num_bands: usize) -> Vec<f32> {
1478 if self.samples.len() < fft_size || !fft_size.is_power_of_two() || num_bands == 0 {
1479 return Vec::new();
1480 }
1481
1482 // Get magnitude spectrum
1483 let magnitudes = self.get_magnitude_spectrum(fft_size);
1484
1485 // Divide spectrum into logarithmically-spaced frequency bands
1486 let mut contrasts = Vec::with_capacity(num_bands);
1487
1488 let num_bins = magnitudes.len();
1489 let band_edges = (0..=num_bands)
1490 .map(|i| {
1491 let ratio = i as f32 / num_bands as f32;
1492 // Logarithmic spacing
1493 (ratio * (num_bins as f32).ln()).exp() as usize
1494 })
1495 .collect::<Vec<_>>();
1496
1497 // Calculate contrast for each band
1498 for band_idx in 0..num_bands {
1499 let start = band_edges[band_idx];
1500 let end = band_edges[band_idx + 1].min(num_bins);
1501
1502 if start >= end {
1503 contrasts.push(0.0);
1504 continue;
1505 }
1506
1507 // Extract band magnitudes
1508 let mut band_mags: Vec<f32> = magnitudes[start..end].to_vec();
1509 band_mags.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
1510
1511 let band_len = band_mags.len();
1512 if band_len < 5 {
1513 contrasts.push(0.0);
1514 continue;
1515 }
1516
1517 // Calculate peak (top 20%) and valley (bottom 20%)
1518 let percentile_idx = (band_len as f32 * 0.2) as usize;
1519
1520 // Valley: mean of bottom 20%
1521 let valley_sum: f32 = band_mags[..percentile_idx].iter().sum();
1522 let valley = valley_sum / percentile_idx as f32;
1523
1524 // Peak: mean of top 20%
1525 let peak_sum: f32 = band_mags[band_len - percentile_idx..].iter().sum();
1526 let peak = peak_sum / percentile_idx as f32;
1527
1528 // Calculate contrast in dB
1529 let contrast = if valley > 1e-10 {
1530 20.0 * (peak / valley).log10()
1531 } else if peak > 1e-10 {
1532 60.0 // Maximum contrast when valley is near zero
1533 } else {
1534 0.0
1535 };
1536
1537 contrasts.push(contrast);
1538 }
1539
1540 contrasts
1541 }
1542
1543 /// Detect pitch using the YIN algorithm
1544 ///
1545 /// The YIN algorithm is a robust pitch detection method that improves upon
1546 /// autocorrelation by using a cumulative mean normalized difference function.
1547 /// It provides more accurate pitch detection, especially for noisy signals.
1548 ///
1549 /// YIN steps:
1550 /// 1. Calculate difference function
1551 /// 2. Apply cumulative mean normalization
1552 /// 3. Find first minimum below threshold
1553 /// 4. Apply parabolic interpolation for sub-sample accuracy
1554 ///
1555 /// Reference: "YIN, a fundamental frequency estimator for speech and music"
1556 /// by Alain de Cheveigné and Hideki Kawahara (2002)
1557 ///
1558 /// # Arguments
1559 ///
1560 /// * `min_freq` - Minimum frequency to search (Hz)
1561 /// * `max_freq` - Maximum frequency to search (Hz)
1562 /// * `threshold` - Threshold for minimum detection (typically 0.1-0.2)
1563 ///
1564 /// # Returns
1565 ///
1566 /// Detected fundamental frequency in Hz, or 0.0 if no pitch detected.
1567 ///
1568 /// # Applications
1569 ///
1570 /// - High-accuracy pitch tracking
1571 /// - Music transcription
1572 /// - Singing voice analysis
1573 /// - Instrument tuning
1574 /// - Prosody analysis in speech
1575 ///
1576 /// # Examples
1577 ///
1578 /// ```no_run
1579 /// use voirs_sdk::audio::AudioBuffer;
1580 ///
1581 /// let buffer = AudioBuffer::mono(vec![0.5; 8192], 22050);
1582 /// let pitch = buffer.detect_pitch_yin(80.0, 400.0, 0.15);
1583 /// if pitch > 0.0 {
1584 /// println!("Detected pitch: {:.1} Hz", pitch);
1585 /// }
1586 /// ```
1587 pub fn detect_pitch_yin(&self, min_freq: f32, max_freq: f32, threshold: f32) -> f32 {
1588 let sample_rate = self.sample_rate as f32;
1589 let min_period = (sample_rate / max_freq) as usize;
1590 let max_period = (sample_rate / min_freq) as usize;
1591
1592 // Need sufficient samples
1593 let buffer_size = (max_period * 2).min(self.samples.len());
1594 if buffer_size < max_period {
1595 return 0.0;
1596 }
1597
1598 // Step 1: Calculate difference function
1599 let mut diff = vec![0.0_f32; max_period + 1];
1600 diff[0] = 1.0; // By definition
1601
1602 // Note: tau is used for both arithmetic (buffer_size - tau) and indexing (diff[tau], samples[i + tau])
1603 #[allow(clippy::needless_range_loop)]
1604 for tau in 1..=max_period {
1605 let mut sum = 0.0;
1606 for i in 0..(buffer_size - tau) {
1607 let delta = self.samples[i] - self.samples[i + tau];
1608 sum += delta * delta;
1609 }
1610 diff[tau] = sum;
1611 }
1612
1613 // Step 2: Cumulative mean normalized difference function
1614 let mut cmnd = vec![0.0_f32; max_period + 1];
1615 cmnd[0] = 1.0;
1616
1617 let mut running_sum = 0.0;
1618 for tau in 1..=max_period {
1619 running_sum += diff[tau];
1620 cmnd[tau] = if running_sum > 0.0 {
1621 diff[tau] * tau as f32 / running_sum
1622 } else {
1623 1.0
1624 };
1625 }
1626
1627 // Step 3: Find first minimum below threshold
1628 let mut tau_estimate = 0;
1629 for tau in min_period..=max_period {
1630 if cmnd[tau] < threshold {
1631 // Look for local minimum
1632 if tau > 0
1633 && tau < max_period
1634 && cmnd[tau] < cmnd[tau - 1]
1635 && cmnd[tau] < cmnd[tau + 1]
1636 {
1637 tau_estimate = tau;
1638 break;
1639 }
1640 }
1641 }
1642
1643 if tau_estimate == 0 {
1644 // No pitch found below threshold
1645 return 0.0;
1646 }
1647
1648 // Step 4: Parabolic interpolation for sub-sample accuracy
1649 let tau = tau_estimate;
1650 let refined_tau = if tau > 0 && tau < max_period {
1651 let alpha = cmnd[tau - 1];
1652 let beta = cmnd[tau];
1653 let gamma = cmnd[tau + 1];
1654
1655 // Parabolic interpolation
1656 let adjustment = if (alpha - 2.0 * beta + gamma).abs() > 1e-10 {
1657 0.5 * (alpha - gamma) / (alpha - 2.0 * beta + gamma)
1658 } else {
1659 0.0
1660 };
1661
1662 tau as f32 + adjustment
1663 } else {
1664 tau as f32
1665 };
1666
1667 // Convert period to frequency
1668 sample_rate / refined_tau
1669 }
1670}
1671
1672#[cfg(test)]
1673#[path = "utilities_tests.rs"]
1674mod tests;