use std::num::NonZeroUsize;
use crate::operations::traits::AudioPitchAnalysis;
use crate::operations::types::PitchDetectionMethod;
use crate::{
AudioSampleError, AudioSampleResult, AudioSamples, AudioTypeConversion, ParameterError,
StandardSample,
};
use non_empty_slice::NonEmptySlice;
use spectrograms::{ChromaParams, SpectrogramPlanner, StftParams, WindowType};
impl<T> AudioPitchAnalysis for AudioSamples<'_, T>
where
    T: StandardSample,
    Self: AudioTypeConversion<Sample = T>,
{
    /// Detects the fundamental frequency of a mono signal with the YIN
    /// algorithm, returning the pitch in Hz.
    ///
    /// Returns `Ok(None)` when the signal is too short for the requested
    /// frequency range or when no candidate lag falls below `threshold`.
    ///
    /// # Errors
    /// Fails for multi-channel audio, a `threshold` outside `0.0..=1.0`, or a
    /// non-positive / inverted frequency range.
    #[inline]
    fn detect_pitch_yin(
        &self,
        threshold: f64,
        min_frequency: f64,
        max_frequency: f64,
    ) -> AudioSampleResult<Option<f64>> {
        if self.is_multi_channel() {
            return Err(AudioSampleError::unsupported(
                "YIN pitch detection is only supported for mono audio samples",
            ));
        }
        if !(0.0..=1.0).contains(&threshold) {
            return Err(AudioSampleError::Parameter(ParameterError::invalid_value(
                "threshold",
                "Threshold must be between 0.0 and 1.0",
            )));
        }
        if min_frequency <= 0.0 || max_frequency <= min_frequency {
            return Err(AudioSampleError::Parameter(ParameterError::invalid_value(
                "frequency_range",
                "Invalid frequency range",
            )));
        }
        let samples = self.to_format::<f64>();
        let samples_slice = samples
            .as_slice()
            .expect("Safe to unwrap since mono samples always return a contiguous slice of memory");
        let sample_rate_f = self.sample_rate_hz();
        // Frequency bounds become lag (tau) bounds: a higher frequency means a
        // shorter period, i.e. a smaller lag in samples.
        let min_tau = (sample_rate_f / max_frequency) as usize;
        let max_tau = (sample_rate_f / min_frequency) as usize;
        // Need at least two full periods of the lowest candidate frequency.
        if max_tau >= samples_slice.len() / 2 {
            return Ok(None);
        }
        let samples = non_empty_slice::non_empty_slice!(samples_slice);
        let result = yin_pitch_detection(samples, min_tau, max_tau, threshold);
        // Convert the winning lag (in samples) back to Hz.
        Ok(result.map(|tau| sample_rate_f / tau))
    }

    /// Detects the fundamental frequency of a mono signal by picking the lag
    /// with the highest autocorrelation, returning the pitch in Hz.
    ///
    /// Returns `Ok(None)` when the signal is too short for the requested
    /// frequency range or when no lag correlates positively (e.g. silence).
    ///
    /// # Errors
    /// Fails for multi-channel audio or an invalid frequency range.
    #[inline]
    fn detect_pitch_autocorr(
        &self,
        min_frequency: f64,
        max_frequency: f64,
    ) -> AudioSampleResult<Option<f64>> {
        if self.is_multi_channel() {
            return Err(AudioSampleError::unsupported(
                "Autocorrelation pitch detection is only supported for mono audio samples",
            ));
        }
        // The 1.0 Hz lower bound (stricter than YIN's > 0.0) keeps `max_tau`
        // no larger than one second's worth of samples.
        if min_frequency <= 1.0 || max_frequency <= min_frequency {
            return Err(AudioSampleError::Parameter(ParameterError::invalid_value(
                "frequency_range",
                format!("Invalid frequency range {min_frequency} - {max_frequency}"),
            )));
        }
        let samples = self.to_format::<f64>();
        let samples_slice = samples
            .as_slice()
            .expect("Safe to unwrap since mono samples always return a contiguous slice of memory");
        let sample_rate = self.sample_rate_hz();
        let min_tau = (sample_rate / max_frequency) as usize;
        let max_tau = (sample_rate / min_frequency) as usize;
        // Need at least two full periods of the lowest candidate frequency.
        if max_tau >= samples_slice.len() / 2 {
            return Ok(None);
        }
        // SAFETY: mono audio samples are never empty (`self.len()` is a
        // `NonZeroUsize`), so the contiguous slice is non-empty as well.
        let samples = unsafe { NonEmptySlice::new_unchecked(samples_slice) };
        let result = autocorr_pitch_detection(samples, min_tau, max_tau);
        // Convert the winning lag (in samples) back to Hz.
        Ok(result.map(|tau| sample_rate / tau))
    }

    /// Tracks pitch over time by running `method` on overlapping windows.
    ///
    /// Returns one `(time_seconds, pitch_hz)` pair per analysis frame; the
    /// pitch is `None` for frames where nothing was detected.
    ///
    /// # Errors
    /// Fails for multi-channel audio, when `hop_size` exceeds `window_size`,
    /// or when the signal is shorter than one window.
    #[inline]
    fn track_pitch(
        &self,
        window_size: NonZeroUsize,
        hop_size: NonZeroUsize,
        method: PitchDetectionMethod,
        threshold: f64,
        min_frequency: f64,
        max_frequency: f64,
    ) -> AudioSampleResult<Vec<(f64, Option<f64>)>> {
        if self.is_multi_channel() {
            return Err(AudioSampleError::unsupported(
                "Pitch tracking is only supported for mono audio samples",
            ));
        }
        // `window_size == hop_size` (no overlap) is valid, matching the error
        // message below; only reject a hop that would skip samples between
        // consecutive windows. (The previous `<=` wrongly rejected equality.)
        if window_size.get() < hop_size.get() {
            return Err(AudioSampleError::Parameter(ParameterError::invalid_value(
                "window_hop_size",
                "Window size must be greater than or equal to hop size",
            )));
        }
        if self.len() < window_size {
            return Err(AudioSampleError::Parameter(ParameterError::invalid_value(
                "window_size",
                "Window size cannot be larger than the total number of samples",
            )));
        }
        let window_size = window_size.get();
        let hop_size = hop_size.get();
        let samples = self.as_f64();
        let samples_slice = samples
            .as_slice()
            .expect("Safe to unwrap since mono samples always return a contiguous slice of memory");
        let sample_rate = self.sample_rate_hz();
        // Upper bound on the number of frames; the loop below may stop early
        // once the remaining tail is shorter than half a window.
        let total_results = (0..samples_slice.len()).step_by(hop_size).count();
        let mut results = Vec::with_capacity(total_results);
        for start in (0..samples_slice.len()).step_by(hop_size) {
            let end = (start + window_size).min(samples_slice.len());
            // Stop once the trailing window is too short to analyze reliably.
            if end - start < window_size / 2 {
                break;
            }
            let window = &samples_slice[start..end];
            // SAFETY: `start < samples_slice.len()` (loop range) and
            // `end = min(start + window_size, len) > start`, so the window
            // always contains at least one sample.
            let window = unsafe { NonEmptySlice::new_unchecked(window) };
            let time_seconds = start as f64 / sample_rate;
            let window_data = window.to_non_empty_vec();
            let window_audio =
                AudioSamples::from_mono_vec(window_data, self.sample_rate).into_owned();
            let frequency: Option<f64> = match method {
                PitchDetectionMethod::Yin => {
                    window_audio.detect_pitch_yin(threshold, min_frequency, max_frequency)?
                }
                PitchDetectionMethod::Autocorrelation => {
                    window_audio.detect_pitch_autocorr(min_frequency, max_frequency)?
                }
                // Unimplemented methods yield "no pitch" rather than an error
                // so the whole track is not lost to one unsupported frame.
                _ => {
                    eprintln!("Pitch detection method {method:?} not implemented yet");
                    None
                }
            };
            results.push((time_seconds, frequency));
        }
        Ok(results)
    }

    /// Computes the harmonic-to-noise ratio (HNR) in dB for a mono signal.
    ///
    /// Power within one frequency-resolution bin of any of the first
    /// `num_harmonics` integer multiples of `fundamental_freq` counts as
    /// harmonic; everything else counts as noise. Returns `f64::INFINITY`
    /// when no noise power remains.
    ///
    /// # Errors
    /// Fails for multi-channel audio, a non-positive fundamental frequency,
    /// or when the power spectrum cannot be computed.
    #[inline]
    fn harmonic_to_noise_ratio(
        &self,
        fundamental_freq: f64,
        num_harmonics: NonZeroUsize,
        n_fft: Option<NonZeroUsize>,
        window_type: Option<WindowType>,
    ) -> AudioSampleResult<f64> {
        if self.is_multi_channel() {
            return Err(AudioSampleError::unsupported(
                "Harmonic-to-noise ratio analysis is only supported for mono audio samples",
            ));
        }
        if fundamental_freq <= 0.0 {
            return Err(AudioSampleError::Parameter(ParameterError::invalid_value(
                "harmonics_config",
                "Invalid fundamental frequency",
            )));
        }
        let sample_rate = self.sample_rate_hz();
        let samples = self.as_f64();
        let samples_slice = samples
            .as_slice()
            .expect("Safe to unwrap since mono samples always return a contiguous slice of memory");
        // SAFETY: mono audio samples are never empty (`self.len()` is a
        // `NonZeroUsize`), so the contiguous slice is non-empty as well.
        let samples_slice = unsafe { NonEmptySlice::new_unchecked(samples_slice) };
        let planner = SpectrogramPlanner::new();
        // Default to a single FFT over the whole signal.
        let n_fft = n_fft.unwrap_or(samples.len());
        let spectrum = planner.compute_power_spectrum(
            samples_slice,
            n_fft,
            window_type.unwrap_or_default(),
        )?;
        let freq_resolution = sample_rate / n_fft.get() as f64;
        let mut harmonic_power = 0.0;
        let mut total_power = 0.0;
        for (i, &power) in spectrum.iter().enumerate() {
            let freq = i as f64 * freq_resolution;
            total_power += power;
            // A bin counts as harmonic when it lies within one bin width of
            // any of the first `num_harmonics` multiples of the fundamental.
            for harmonic in 1..=num_harmonics.get() {
                let harmonic_freq = fundamental_freq * harmonic as f64;
                if (freq - harmonic_freq).abs() < freq_resolution {
                    harmonic_power += power;
                    break;
                }
            }
        }
        let noise_power = total_power - harmonic_power;
        // A purely harmonic signal leaves no noise floor to compare against.
        if noise_power <= 0.0 {
            return Ok(f64::INFINITY);
        }
        Ok(10.0 * (harmonic_power / noise_power).log10())
    }

    /// Measures the magnitudes of the first `num_harmonics` harmonics of
    /// `fundamental_freq`, normalized so the fundamental reads 1.0 (when it
    /// carries any energy).
    ///
    /// `tolerance` widens the search band around each harmonic as a fraction
    /// of its frequency; the peak bin within the band is reported. Harmonics
    /// falling beyond the top of the spectrum are reported as 0.0.
    ///
    /// # Errors
    /// Fails for multi-channel audio, a non-positive fundamental frequency, a
    /// `tolerance` outside `0.0..=1.0`, or when the power spectrum cannot be
    /// computed.
    #[inline]
    fn harmonic_analysis(
        &self,
        fundamental_freq: f64,
        num_harmonics: NonZeroUsize,
        tolerance: f64,
        n_fft: Option<NonZeroUsize>,
        window_type: Option<WindowType>,
    ) -> AudioSampleResult<Vec<f64>> {
        if self.is_multi_channel() {
            return Err(AudioSampleError::unsupported(
                "Harmonic analysis is only supported for mono audio samples",
            ));
        }
        if fundamental_freq <= 0.0 {
            return Err(AudioSampleError::Parameter(ParameterError::invalid_value(
                "harmonics_config",
                "Invalid fundamental frequency",
            )));
        }
        if !(0.0..=1.0).contains(&tolerance) {
            return Err(AudioSampleError::Parameter(ParameterError::invalid_value(
                "tolerance",
                "Tolerance must be between 0.0 and 1.0",
            )));
        }
        let samples = self.as_f64();
        let samples_slice = samples
            .as_slice()
            .expect("Safe to unwrap since mono samples always return a contiguous slice of memory");
        // SAFETY: mono audio samples are never empty (`self.len()` is a
        // `NonZeroUsize`), so the contiguous slice is non-empty as well.
        let samples_slice = unsafe { NonEmptySlice::new_unchecked(samples_slice) };
        let sample_rate = self.sample_rate_hz();
        let planner = SpectrogramPlanner::new();
        // Default to a single FFT over the whole signal.
        let n_fft = n_fft.unwrap_or(samples.len());
        let spectrum = planner.compute_power_spectrum(
            samples_slice,
            n_fft,
            window_type.unwrap_or_default(),
        )?;
        let freq_resolution = sample_rate / n_fft.get() as f64;
        let last_bin = spectrum.len().get() - 1;
        let mut harmonic_magnitudes = vec![0.0; num_harmonics.get()];
        for harmonic in 1..=num_harmonics.get() {
            let harmonic_freq = fundamental_freq * harmonic as f64;
            let target_bin = (harmonic_freq / freq_resolution).round() as usize;
            // Harmonics above the top of the spectrum (e.g. past Nyquist)
            // stay at zero magnitude. This also prevents the out-of-range
            // slice `start_bin..=end_bin` the previous code could build.
            if target_bin > last_bin {
                continue;
            }
            let tolerance_bins = (tolerance * harmonic_freq / freq_resolution) as usize;
            let start_bin = target_bin.saturating_sub(tolerance_bins);
            let end_bin = (target_bin + tolerance_bins).min(last_bin);
            let max_magnitude = spectrum[start_bin..=end_bin]
                .iter()
                .fold(0.0, |acc: f64, &x| acc.max(x));
            harmonic_magnitudes[harmonic - 1] = max_magnitude;
        }
        // Normalize against the fundamental when it carries any energy.
        if harmonic_magnitudes[0] > 0.0 {
            let fundamental_magnitude = harmonic_magnitudes[0];
            for magnitude in &mut harmonic_magnitudes {
                *magnitude /= fundamental_magnitude;
            }
        }
        Ok(harmonic_magnitudes)
    }

    /// Estimates the musical key with Krumhansl-Schmuckler profile matching
    /// over a time-averaged chromagram.
    ///
    /// Returns `(key, confidence)` where `key` encodes the tonic pitch class
    /// as `0..=11` for major keys and `12..=23` for minor keys, and
    /// `confidence` maps the best profile correlation from `[-1, 1]` into
    /// `[0, 1]`.
    ///
    /// # Errors
    /// Fails for multi-channel audio or when the chromagram cannot be
    /// computed.
    #[inline]
    fn estimate_key(&self, stft_params: &StftParams) -> AudioSampleResult<(usize, f64)> {
        // Guard added for consistency with the other analyses: without it a
        // multi-channel input would panic in `as_slice().expect(...)` below.
        if self.is_multi_channel() {
            return Err(AudioSampleError::unsupported(
                "Key estimation is only supported for mono audio samples",
            ));
        }
        let samples = self.to_format::<f64>();
        let samples_slice = samples
            .as_slice()
            .expect("Safe to unwrap since mono samples always return a contiguous slice of memory");
        // SAFETY: mono audio samples are never empty (`self.len()` is a
        // `NonZeroUsize`), so the contiguous slice is non-empty as well.
        let samples_slice = unsafe { NonEmptySlice::new_unchecked(samples_slice) };
        let sample_rate_f = self.sample_rate_hz();
        // Krumhansl-Kessler key profiles: perceived fit of each pitch class
        // within a major / minor key, tonic first.
        let major_profile: [f64; 12] = [
            6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88,
        ];
        let minor_profile: [f64; 12] = [
            6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17,
        ];
        let chroma_params = ChromaParams::music_standard();
        let chromagram =
            spectrograms::chromagram(samples_slice, stft_params, sample_rate_f, &chroma_params)?;
        // Collapse the chromagram over time into a single 12-bin vector.
        let mut chroma_sum = vec![0.0; 12];
        for frame_idx in 0..chromagram.n_frames().get() {
            for (pitch_class, value) in chroma_sum.iter_mut().enumerate() {
                *value += chromagram.data[[pitch_class, frame_idx]];
            }
        }
        let num_frames = chromagram.n_frames().get() as f64;
        for value in &mut chroma_sum {
            *value /= num_frames;
        }
        // Normalize to unit sum so the vector is scale-independent.
        let chroma_sum_total: f64 = chroma_sum.iter().sum();
        if chroma_sum_total > 0.0 {
            for value in &mut chroma_sum {
                *value /= chroma_sum_total;
            }
        }
        // Try all 24 keys (12 tonics x major/minor) and keep the best match.
        let mut best_correlation = -1.0;
        let mut best_key = 0;
        let mut is_major = true;
        for tonic in 0..12 {
            let correlation = calculate_correlation(&chroma_sum, &major_profile, tonic);
            if correlation > best_correlation {
                best_correlation = correlation;
                best_key = tonic;
                is_major = true;
            }
        }
        for tonic in 0..12 {
            let correlation = calculate_correlation(&chroma_sum, &minor_profile, tonic);
            if correlation > best_correlation {
                best_correlation = correlation;
                best_key = tonic;
                is_major = false;
            }
        }
        let encoded_key = if is_major { best_key } else { best_key + 12 };
        // Map the correlation from [-1, 1] to a [0, 1] confidence score.
        let confidence = f64::midpoint(best_correlation, 1.0);
        Ok((encoded_key, confidence))
    }
}
/// Pearson correlation between a 12-bin chroma vector and a key profile
/// rotated so that pitch class `tonic` lines up with the profile's tonic.
///
/// Returns 0.0 when either vector is constant (zero variance).
fn calculate_correlation(chroma: &[f64], profile: &[f64], tonic: usize) -> f64 {
    debug_assert_eq!(chroma.len(), 12);
    debug_assert_eq!(profile.len(), 12);
    // Rotate the profile so index i holds the profile value for pitch class i.
    let rotated: Vec<f64> = (0..12).map(|i| profile[(i + tonic) % 12]).collect();
    let chroma_mean = chroma.iter().sum::<f64>() / 12.0;
    let profile_mean = rotated.iter().sum::<f64>() / 12.0;
    let mut numerator = 0.0;
    let mut chroma_var = 0.0;
    let mut profile_var = 0.0;
    for (&c, &p) in chroma.iter().zip(&rotated) {
        let dc = c - chroma_mean;
        let dp = p - profile_mean;
        numerator += dc * dp;
        chroma_var += dc * dc;
        profile_var += dp * dp;
    }
    let denominator = (chroma_var * profile_var).sqrt();
    if denominator > 0.0 {
        numerator / denominator
    } else {
        0.0
    }
}
/// Estimates the period (lag in samples) of `samples` with the YIN algorithm.
///
/// Builds the squared-difference function for lags in `[min_tau, max_tau]`,
/// applies cumulative mean normalization (CMND), and returns the first lag
/// whose CMND value drops below `threshold`, refined to the local minimum in
/// a +/- 5 lag neighbourhood. Returns `None` when the signal is shorter than
/// `2 * max_tau` or when no lag crosses the threshold.
#[inline]
#[must_use]
pub fn yin_pitch_detection(
    samples: &NonEmptySlice<f64>,
    min_tau: usize,
    max_tau: usize,
    threshold: f64,
) -> Option<f64> {
    let n = samples.len().get();
    // Need at least two full periods of the longest candidate lag.
    if max_tau >= n / 2 {
        return None;
    }
    // Difference function d(tau) = sum_j (x[j] - x[j + tau])^2, evaluated
    // only over the candidate lag range.
    let mut diff_fn = vec![0.0; max_tau + 1];
    for tau in min_tau..=max_tau {
        let mut sum = 0.0;
        for j in 0..(n - tau) {
            let delta = samples[j] - samples[j + tau];
            sum += delta * delta;
        }
        diff_fn[tau] = sum;
    }
    // Cumulative mean normalized difference. NOTE(review): lags below
    // `min_tau` contribute zeros to `running_sum`, which deviates slightly
    // from textbook YIN when `min_tau > 1`; kept as-is to preserve behavior.
    let mut cmnd = vec![1.0; max_tau + 1];
    let mut running_sum = 0.0;
    for tau in 1..=max_tau {
        running_sum += diff_fn[tau];
        if running_sum > 0.0 {
            cmnd[tau] = diff_fn[tau] / (running_sum / tau as f64);
        }
    }
    // First lag under the threshold wins, refined to the lowest CMND value in
    // a +/- 5 lag window clamped to [min_tau, max_tau].
    for tau in min_tau..=max_tau {
        if cmnd[tau] < threshold {
            // Bug fix: the previous code shadowed the `min_tau` parameter
            // with `tau`, which collapsed the window start to `tau` and made
            // the backward half of the refinement search dead code.
            let start = tau.saturating_sub(5).max(min_tau);
            let end = (tau + 5).min(max_tau);
            let mut best_tau = tau;
            let mut best_val = cmnd[tau];
            for (t, &val) in cmnd.iter().enumerate().skip(start).take(end - start + 1) {
                if val < best_val {
                    best_val = val;
                    best_tau = t;
                }
            }
            return Some(best_tau as f64);
        }
    }
    None
}
/// Picks the lag in `[min_tau, max_tau]` whose (biased) autocorrelation with
/// the signal is largest, returned as `Some(lag as f64)`.
///
/// Returns `None` when the signal is shorter than `2 * max_tau` or when no
/// lag yields a strictly positive correlation (e.g. silence).
#[inline]
#[must_use]
pub fn autocorr_pitch_detection(samples: &[f64], min_tau: usize, max_tau: usize) -> Option<f64> {
    let n = samples.len();
    // Need at least two full periods of the longest candidate lag.
    if max_tau >= n / 2 {
        return None;
    }
    let mut best_tau = 0usize;
    let mut best_corr = 0.0f64;
    for tau in min_tau..=max_tau {
        // Dot product of the signal with itself shifted by `tau`.
        let corr: f64 = samples[..n - tau]
            .iter()
            .zip(&samples[tau..])
            .map(|(&a, &b)| a * b)
            .sum();
        // Strict comparison keeps the smallest lag on ties.
        if corr > best_corr {
            best_corr = corr;
            best_tau = tau;
        }
    }
    // `best_tau == 0` means nothing beat the initial zero correlation.
    (best_tau > 0).then_some(best_tau as f64)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::operations::traits::AudioPitchAnalysis;
    use crate::sample_rate;
    use ndarray::Array1;
    use non_empty_slice::{NonEmptyVec, non_empty_vec};
    use std::f64::consts::PI;

    /// YIN on one second of a clean 440 Hz sine should land within 10 Hz.
    #[test]
    fn test_pitch_detection_yin() {
        let sample_rate = 44100;
        let duration = 1.0;
        let frequency = 440.0;
        let samples_count = (sample_rate as f64 * duration) as usize;
        // Synthesize a pure sine tone at the target frequency.
        let mut samples = Vec::new();
        for i in 0..samples_count {
            let t = i as f64 / sample_rate as f64;
            let value = (2.0 * PI * frequency * t).sin();
            samples.push(value as f64);
        }
        let samples = NonEmptyVec::new(samples).unwrap();
        let audio: AudioSamples<'_, f64> =
            AudioSamples::from_mono_vec(samples, sample_rate!(44100));
        let detected_pitch = audio.detect_pitch_yin(0.1, 80.0, 1000.0).unwrap();
        assert!(detected_pitch.is_some());
        let pitch: f64 = detected_pitch.unwrap();
        assert!((pitch - 440.0).abs() < 10.0);
    }

    /// Autocorrelation on a clean 220 Hz sine (stored as f32) should land
    /// within 10 Hz.
    #[test]
    fn test_pitch_detection_autocorr() {
        let sample_rate = 44100;
        let duration = 1.0;
        let frequency = 220.0;
        let samples_count = (sample_rate as f64 * duration) as usize;
        let mut samples = Vec::new();
        for i in 0..samples_count {
            let t = i as f64 / sample_rate as f64;
            let value = (2.0 * PI * frequency * t).sin();
            samples.push(value as f32);
        }
        let samples = NonEmptyVec::new(samples).unwrap();
        let audio: AudioSamples<'_, f32> =
            AudioSamples::from_mono_vec(samples, sample_rate!(44100));
        let detected_pitch = audio.detect_pitch_autocorr(80.0, 1000.0).unwrap();
        assert!(detected_pitch.is_some());
        let pitch: f64 = detected_pitch.unwrap();
        assert!((pitch - 220.0).abs() < 10.0);
    }

    /// Windowed YIN tracking over a steady 440 Hz tone: the mean of all
    /// per-frame detections should stay within 20 Hz of the target.
    #[test]
    fn test_pitch_tracking() {
        let sample_rate = 44100;
        let duration = 0.5;
        let frequency = 440.0;
        let samples_count = (sample_rate as f64 * duration) as usize;
        let mut samples = Vec::new();
        for i in 0..samples_count {
            let t = i as f64 / sample_rate as f64;
            let value = (2.0 * PI * frequency * t).sin();
            samples.push(value as f32);
        }
        let samples = NonEmptyVec::new(samples).unwrap();
        let audio: AudioSamples<'_, f32> =
            AudioSamples::from_mono_vec(samples, sample_rate!(44100));
        let window_size = NonZeroUsize::new(2048).unwrap();
        let hop_size = NonZeroUsize::new(512).unwrap();
        let pitch_track = audio
            .track_pitch(
                window_size,
                hop_size,
                PitchDetectionMethod::Yin,
                0.1,
                80.0,
                1000.0,
            )
            .unwrap();
        assert!(!pitch_track.is_empty());
        // Keep only frames where a pitch was actually detected.
        let detected_pitches: Vec<f64> =
            pitch_track.iter().filter_map(|(_, pitch)| *pitch).collect();
        assert!(!detected_pitches.is_empty());
        let avg_pitch = detected_pitches.iter().sum::<f64>() / detected_pitches.len() as f64;
        assert!((avg_pitch - 440.0).abs() < 20.0);
    }

    /// A harmonic stack with 1/k amplitudes should report a normalized
    /// fundamental near 1.0 with every overtone at or below it.
    #[test]
    fn test_harmonic_analysis() {
        let sample_rate = 44100;
        let duration = 1.0;
        let frequency = 220.0;
        let samples_count = (sample_rate as f64 * duration) as usize;
        let mut samples = Vec::new();
        for i in 0..samples_count {
            let t = i as f64 / sample_rate as f64;
            let mut value = 0.0;
            // Sum the first nine harmonics with 1/k amplitude roll-off.
            for harmonic in 1..10 {
                value += (2.0 * PI * frequency * harmonic as f64 * t).sin() / harmonic as f64;
            }
            samples.push(value as f32);
        }
        let samples = NonEmptyVec::new(samples).unwrap();
        let audio: AudioSamples<'_, f32> =
            AudioSamples::from_mono_vec(samples, sample_rate!(44100));
        let harmonics = audio
            .harmonic_analysis(frequency, NonZeroUsize::new(5).unwrap(), 0.1, None, None)
            .unwrap();
        assert_eq!(harmonics.len(), 5);
        // Magnitudes are normalized against the fundamental.
        assert!((harmonics[0] - 1.0).abs() < 0.1);
        for i in 1..harmonics.len() {
            assert!(harmonics[i] > 0.0);
            assert!(harmonics[i] <= harmonics[0]);
        }
    }

    /// Digital silence must produce no pitch from either detector.
    #[test]
    fn test_silence_detection() {
        let audio = AudioSamples::new_mono(Array1::<f32>::zeros(44100).into(), sample_rate!(44100))
            .unwrap();
        let detected_pitch = audio.detect_pitch_yin(0.1, 80.0, 1000.0).unwrap();
        assert!(detected_pitch.is_none());
        let detected_pitch_autocorr = audio.detect_pitch_autocorr(80.0, 1000.0).unwrap();
        assert!(detected_pitch_autocorr.is_none());
    }

    /// YIN should still recover 440 Hz through low-level deterministic
    /// pseudo-noise (a 10% amplitude unrelated sinusoid).
    #[test]
    fn test_noise_robustness() {
        let sample_rate = 44100;
        let duration = 1.0;
        let frequency = 440.0;
        let samples_count = (sample_rate as f64 * duration) as usize;
        let mut samples = Vec::new();
        for i in 0..samples_count {
            let t = i as f64 / sample_rate as f64;
            let sine_value = (2.0 * PI * frequency * t).sin();
            // Deterministic "noise": an incommensurate sinusoid at 10% level.
            let noise = (i as f64 * 0.1).sin() * 0.1;
            let value = sine_value + noise;
            samples.push(value as f32);
        }
        let samples = NonEmptyVec::new(samples).unwrap();
        let audio: AudioSamples<'_, f32> =
            AudioSamples::from_mono_vec(samples, sample_rate!(44100));
        let detected_pitch = audio.detect_pitch_yin(0.1, 80.0, 1000.0).unwrap();
        assert!(detected_pitch.is_some());
        let pitch: f64 = detected_pitch.unwrap();
        assert!((pitch - 440.0).abs() < 20.0);
    }

    /// Out-of-range thresholds, frequency ranges, and tolerances must all be
    /// rejected with parameter errors.
    #[test]
    fn test_parameter_validation() {
        let audio: AudioSamples<'_, f32> =
            AudioSamples::from_mono_vec(non_empty_vec![1.0f32, 2.0, 3.0], sample_rate!(44100));
        assert!(audio.detect_pitch_yin(-0.1, 80.0, 1000.0).is_err());
        assert!(audio.detect_pitch_yin(1.5, 80.0, 1000.0).is_err());
        assert!(audio.detect_pitch_yin(0.1, -80.0, 1000.0).is_err());
        assert!(audio.detect_pitch_yin(0.1, 1000.0, 800.0).is_err());
        assert!(
            audio
                .harmonic_analysis(-440.0, NonZeroUsize::new(5).unwrap(), 0.1, None, None)
                .is_err()
        );
        assert!(
            audio
                .harmonic_analysis(440.0, NonZeroUsize::new(5).unwrap(), -0.1, None, None)
                .is_err()
        );
        assert!(
            audio
                .harmonic_analysis(440.0, NonZeroUsize::new(5).unwrap(), 1.5, None, None)
                .is_err()
        );
    }
}